diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 5dc6e74b8f3333f9585c9f67690babedfea1b320..a807911147939e7d13a26be27e7f7c3ab86ed52a 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -133,7 +133,7 @@ cc_test(version_test SRCS version_test.cc DEPS version) cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version) -cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc memory_optimize_helper) +cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc) nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) @@ -204,7 +204,6 @@ cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) -cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS inplace_op_pass op_registry proto_desc op_info memory_optimize_helper pass_builder) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 30fc77e532f4686029ac587ded8383a0058ebc01..6ec60700f0e17d81c036627c979f36da8fb25426 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -62,7 +62,7 @@ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope d cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass 
reference_count_pass eager_deletion_pass memory_optimize_pass inplace_op_pass buffer_shared_inplace_op_pass buffer_shared_cross_op_memory_reuse_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass buffer_shared_inplace_op_pass buffer_shared_cross_op_memory_reuse_pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope @@ -92,6 +92,6 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS multi_devices_graph_print_pass multi_devices_graph_check_pass fuse_elewise_add_act_pass multi_batch_merge_pass fuse_relu_depthwise_conv_pass - memory_optimize_pass lock_free_optimize_pass + lock_free_optimize_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass - fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass record_skip_memory_opt_vars_pass) + fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass) diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 003ca23085c075bf26b05c71ef4c8c5d22963e61..464226b4a8284be6d43bb8f87a9e556777513a76 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -24,8 +24,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/graph_printer.h" #include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" DECLARE_bool(use_mkldnn); @@ -51,17 +49,13 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { ResolveOptionConfliction(); AppendPrintGraphPass("graph_viz_pass", "_original_graph"); - // Note(zcd): record_skip_memory_opt_vars_pass should - // be the first pass. - AppendPass("record_skip_memory_opt_vars_pass"); AppendPassWithCheck(strategy_.enable_sequential_execution_, "sequential_execution_pass"); AppendPassWithCheck(strategy_.sync_batch_norm_, "sync_batch_norm_pass"); AppendOpFusePasses(); AppendPrintGraphPass("graph_viz_pass", "_fused_graph"); - // TODO(dev-paddle): memory optimize pass should be placed last. - AppendMemoryOptimizePasses(); + AppendMultiDevPass(); AppendMultiGraphOptPasses(); @@ -147,23 +141,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } } - void AppendMemoryOptimizePasses() { // Append Memory Optimize Pass - // TODO(zjl): refactor MemoryOptimizePass to fit - // new strategy, which does not need to set - // var.persistable = True - if (strategy_.use_legacy_memory_optimize_strategy_) { - AppendPassWithCheck(strategy_.enable_inplace_, "inplace_pass"); - } - // NOTE(dzh): memory optimize should be a runtime pass. - // However, after multi_devices_pass, VarHandle, OpHandle is - // the de-fact IR, any reuse on Graph is meaningless. - // A side-effect of that, memory optimize cannot forsee the fetched vars - // , so fetchlist should be set persistable before call the Run interface. 
- if (strategy_.use_legacy_memory_optimize_strategy_) { - AppendPassWithCheck(strategy_.memory_optimize_, "memory_optimize_pass"); - } - } - void SetCollectiveContext() const { CollectiveContext *context = CollectiveContext::GetInstance(); context->endpoints_ = strategy_.trainers_endpoints_; @@ -330,9 +307,6 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, "GPU, skipped."; continue; } - } else if (pass->Type() == "inplace_pass") { - pass->Erase(ir::kUseCuda); - pass->Set(ir::kUseCuda, new bool(use_cuda)); } else if (pass->Type() == "mkldnn_placement_pass") { pass->Set("mkldnn_enabled_op_types", new std::unordered_set(mkldnn_enabled_op_types_)); @@ -365,12 +339,10 @@ USE_PASS(all_reduce_mode_multi_devices_pass); USE_PASS(dist_multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); -USE_PASS(memory_optimize_pass); USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(backward_optimizer_op_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); -USE_PASS(inplace_pass); USE_PASS(lock_free_optimize_pass); USE_PASS(coalesce_grad_tensor_pass); USE_PASS(graph_to_program_pass); @@ -379,7 +351,6 @@ USE_PASS(fuse_sgd_op_pass); USE_PASS(fuse_momentum_op_pass); USE_PASS(fuse_all_reduce_op_pass); USE_PASS(runtime_context_cache_pass); -USE_PASS(record_skip_memory_opt_vars_pass); #ifdef PADDLE_WITH_MKLDNN USE_PASS(mkldnn_placement_pass); #endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 8b767222324d525ee5b8f38e37c55fa2d653190d..929cb51b8454b0220c5dbe6a7b82af6af06c1d53 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -19,6 +19,7 @@ #include #include #include +#include "boost/optional.hpp" #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -108,14 +109,14 @@ struct BuildStrategy { 
// FLAGS_use_mkldnn=false std::unordered_set mkldnn_enabled_op_types_; - bool memory_optimize_{false}; + // By default, memory_optimize is enabled if gc is disabled, and + // disabled if gc is enabled. + // Users can forcibly enable/disable memory_optimize by setting True/False. + boost::optional memory_optimize_{boost::none}; // Turn on inplace by default. bool enable_inplace_{true}; - // TODO(zjl): Remove this flag when MemoryOptimizePass is refactored - bool use_legacy_memory_optimize_strategy_{false}; - // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy doesn't tell if // it's distributed model. diff --git a/paddle/fluid/framework/inplace_op_inference.h b/paddle/fluid/framework/inplace_op_inference.h index fdc0c2023cc2e3b3838ef0c66914f8c927cc18c9..95fd5b046a5db56713beb52effcaf1818c715358 100644 --- a/paddle/fluid/framework/inplace_op_inference.h +++ b/paddle/fluid/framework/inplace_op_inference.h @@ -13,13 +13,8 @@ // limitations under the License. #pragma once -#include -#include #include #include -#include -#include "glog/logging.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/type_defs.h" diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc deleted file mode 100644 index 3e3c8864326f0558968d1c1ffe477a29a1afada0..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/inplace_op_inference_test.cc +++ /dev/null @@ -1,329 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include -#include -#include -#include -#include -#include "gtest/gtest.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" -#include "paddle/fluid/framework/ir/pass_builder.h" -#include "paddle/fluid/framework/op_info.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/var_type_inference.h" - -USE_PASS(inplace_pass); - -namespace paddle { -namespace framework { - -std::unique_ptr CreateInplacePass() { - auto pass = ir::PassRegistry::Instance().Get("inplace_pass"); - pass->Set(ir::kUseCuda, new bool(true)); - return pass; -} - -class NOP : public OperatorBase { - public: - NOP(const std::string& type, const VariableNameMap& inputs, - const VariableNameMap& outputs, const AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const Scope& scope, - const platform::Place& place) const override {} -}; - -class SingleOpMaker : public OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "").AsDuplicable(); - AddOutput("Out", ""); - AddComment(""); - } -}; - -class SingleGradOpMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - auto* op = new framework::OpDesc(); - op->SetType("single_op_grad"); - op->SetInput("Out", 
OutputGrad("Out")); - op->SetOutput(framework::GradVarName("X"), InputGrad("X")); - return std::unique_ptr(op); - } -}; - -class SingleOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override { - ctx->HasInput("X"); - ctx->HasOutput("Out"); - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } -}; - -class SingleGradOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override { - ctx->HasInput(framework::GradVarName("Out")); - ctx->HasOutput(framework::GradVarName("X")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); - } -}; - -class MultiOutOpMaker : public OpProtoAndCheckerMaker { - public: - void Make() { - AddInput("X", "").AsDuplicable(); - AddInput("Y", "").AsDuplicable(); - AddInput("Z", "").AsDuplicable(); - AddOutput("Out", ""); - AddOutput("YOut", ""); - AddOutput("ZOut", ""); - AddOutput("NotReuseOut", ""); - AddComment(""); - } -}; - -class MultiOutShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override { - ctx->ShareDim("X", "Out"); - ctx->ShareDim("Y", "YOut"); - ctx->ShareDim("Z", "ZOut"); - } -}; - -class MultiGradOpMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - std::unique_ptr Apply() const override { - auto* op = new framework::OpDesc(); - op->SetType("multi_out_grad"); - op->SetInput("X", Input("X")); - op->SetOutput(framework::GradVarName("Y"), OutputGrad("YOut")); - op->SetOutput(framework::GradVarName("X"), OutputGrad("Out")); - op->SetOutput(framework::GradVarName("Z"), OutputGrad("ZOut")); - return std::unique_ptr(op); - } -}; - -class MultiOutGradShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override { - 
ctx->SetOutputDim(framework::GradVarName("Y"), - ctx->GetInputDim(framework::GradVarName("YOut"))); - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - ctx->SetOutputDim(framework::GradVarName("Z"), - ctx->GetInputDim(framework::GradVarName("ZOut"))); - } -}; - -class MultiOutInplaceInToOut : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const OpDesc& op_desc, bool use_cuda) const override { - return std::unordered_map{ - {"X", "Out"}, {"Y", "YOut"}, {"Z", "ZOut"}, - }; - } -}; - -class MultiOutGradInplaceInToOut : public framework::InplaceOpInference { - public: - std::unordered_map operator()( - const OpDesc& op_desc, bool use_cuda) const override { - return std::unordered_map{ - {framework::GradVarName("YOut"), framework::GradVarName("Y")}, - {framework::GradVarName("Out"), framework::GradVarName("X")}, - {framework::GradVarName("ZOut"), framework::GradVarName("Z")}, - }; - } -}; - -} // namespace framework -} // namespace paddle - -namespace f = paddle::framework; -REGISTER_OPERATOR(single_op, f::NOP, f::SingleOpMaker, f::SingleGradOpMaker, - f::SingleOpInplaceInToOut, f::SingleOpShapeInference); -REGISTER_OPERATOR(single_op_grad, f::NOP, f::SingleOpInplaceInToOut, - f::SingleGradOpShapeInference); -REGISTER_OPERATOR(multi_out_op, f::NOP, f::MultiOutOpMaker, f::MultiGradOpMaker, - f::MultiOutInplaceInToOut, f::MultiOutShapeInference); -REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut, - f::MultiOutGradShapeInference); - -namespace paddle { -namespace framework { - -void FakeSuccData(ProgramDesc* prog) { // NOLINT - prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); - prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); - prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); - prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); - 
prog->MutableBlock(0)->Var("test2_out"); - prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 32, 128, 128}); -} - -void FakeNoInplaceData(ProgramDesc* prog) { // NOLINT - prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); - prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); - prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); - prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); - prog->MutableBlock(0)->Var("test2_out"); - prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 31, 128, 128}); -} - -ir::Node* GetNodeFromGraph(ir::Graph* g, std::string name) { - ir::Node* op_node = nullptr; - for (auto& item : g->Nodes()) { - if (item->Name() == name) { - op_node = item; - break; - } - } - return op_node; -} - -std::unique_ptr test_SingleOpInplaceInToOut( - std::unique_ptr g) { - auto pass = CreateInplacePass(); - ir::Node* op_node = GetNodeFromGraph(g.get(), "single_op"); - EXPECT_NE(op_node, nullptr); - pass->Apply(g.get()); - return g; -} - -TEST(InferInplace, SingleOpInplaceInToOut) { - ProgramDesc prog; - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("single_op"); - op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); - op->SetOutput("Out", {"test2_out"}); - - FakeSuccData(&prog); - std::unique_ptr g(new ir::Graph(prog)); - g->Set(ir::kMemOptSkipVars, new std::unordered_set()); - g = test_SingleOpInplaceInToOut(std::move(g)); - auto op_node = GetNodeFromGraph(g.get(), "single_op"); - - EXPECT_EQ(op_node->outputs[0]->Name(), "test2_a"); -} - -TEST(InferInplace, SingleOpInplaceInToOutNoInplace) { - ProgramDesc prog; - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("single_op"); - op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); - op->SetOutput("Out", {"test2_out"}); - - FakeNoInplaceData(&prog); - std::unique_ptr g(new ir::Graph(prog)); - g->Set(ir::kMemOptSkipVars, new std::unordered_set()); - g = 
test_SingleOpInplaceInToOut(std::move(g)); - auto op_node = GetNodeFromGraph(g.get(), "single_op"); - - EXPECT_EQ(op_node->outputs[0]->Name(), "test2_out"); -} - -TEST(InferInplace, MultiOutInplaceInToOut) { - ProgramDesc prog; - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("multi_out_op"); - op->SetInput("X", {"a0", "a1"}); - op->SetInput("Y", {"b0"}); - op->SetInput("Z", {"c0", "c1"}); - op->SetOutput("Out", {"o0"}); - op->SetOutput("YOut", {"y0"}); - op->SetOutput("ZOut", {"z0"}); - - prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("o0"); - prog.MutableBlock(0)->Var("y0"); - prog.MutableBlock(0)->Var("z0"); - prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); - - std::unique_ptr g(new ir::Graph(prog)); - g->Set(ir::kMemOptSkipVars, new std::unordered_set()); - auto pass = CreateInplacePass(); - pass->Apply(g.get()); - auto op_node = GetNodeFromGraph(g.get(), "multi_out_op"); - ASSERT_TRUE(op_node != nullptr); - EXPECT_EQ(op_node->outputs[0]->Name(), "a0"); - EXPECT_EQ(op_node->outputs[1]->Name(), "b0"); - EXPECT_EQ(op_node->outputs[2]->Name(), "c0"); -} - -TEST(InferInplace, MultiGradInplaceInToOut) { - ProgramDesc prog; - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("multi_out_grad"); - op->SetInput(GradVarName("Out"), {"o0"}); - op->SetInput(GradVarName("YOut"), {"y0"}); - op->SetInput(GradVarName("ZOut"), {"z0"}); - 
op->SetOutput(GradVarName("X"), {"a0", "a1"}); - op->SetOutput(GradVarName("Y"), {"b0"}); - op->SetOutput(GradVarName("Z"), {"c0", "c1"}); - - prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("o0"); - prog.MutableBlock(0)->Var("y0"); - prog.MutableBlock(0)->Var("z0"); - prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); - prog.MutableBlock(0)->Var("z0")->SetShape({32, 15, 1024, 1024}); - - std::unique_ptr g(new ir::Graph(prog)); - g->Set(ir::kMemOptSkipVars, new std::unordered_set()); - auto pass = CreateInplacePass(); - pass->Apply(g.get()); - auto op_node = GetNodeFromGraph(g.get(), "multi_out_grad"); - ASSERT_TRUE(op_node != nullptr); - EXPECT_EQ(op_node->outputs[0]->Name(), "o0"); - EXPECT_EQ(op_node->outputs[2]->Name(), "y0"); - EXPECT_EQ(op_node->outputs[3]->Name(), "c0"); - - std::unordered_map expects = { - {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, - }; -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt index a732e1551544fc4400fddefd4178802a8f8b4fd1..32388f239c2dc9b9dc7407975de8f8a2d4ebd06b 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/memory_optimize_pass/CMakeLists.txt @@ -4,20 +4,8 @@ cc_library(recurrent_op_eager_deletion_pass SRCS recurrent_op_eager_deletion_pas cc_library(reference_count_pass_helper SRCS 
reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle var_handle) cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) -if(WITH_GPU) - cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper gpu_info) -else() - cc_library(memory_optimize_helper SRCS memory_optimize_helper.cc DEPS graph graph_helper cpu_info) -endif() - -cc_library(memory_optimize_pass SRCS memory_optimize_pass.cc DEPS memory_optimize_helper pass) -cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_info) - -cc_test(memory_optimize_helper_test SRCS memory_optimize_helper_test.cc memory_optimize_helper.cc DEPS framework_proto graph graph_helper op_registry) - cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass recurrent_op_eager_deletion_pass reference_count_pass_helper) -cc_library(record_skip_memory_opt_vars_pass SRCS record_skip_memory_opt_vars_pass.cc DEPS graph graph_helper) cc_library(memory_reuse_pass SRCS memory_reuse_pass.cc DEPS computation_op_handle reference_count_pass_helper share_tensor_buffer_op_handle multi_devices_helper graph pass) diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc deleted file mode 100644 index 39c7d2a9f52512ca2ca95143bdf8b0987e4fb457..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_op_pass.cc +++ /dev/null @@ -1,488 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/op_info.h" - -// NOTE(dzhwinter): inplace means one op output variable reuse the input space. -// By our design, one operator only can read its input(const Variable), -// write its output(non-const Variable). If one operator is inplaced, means -// user have chance to write the space before reading happens. -// Especially when some optimize code writing style is applied. -// -// -// /* wrong case in operator */ -// /*In this case, a larger allocation is allocated, input content is lost*/ -// const Tensor* in = ctx.Input("In") -// Tensor* out = ctx.Output("Out"); -// auto* out_ptr = out->mutable_data(ctx.GetPlace()); -// out_ptr[0] = 0; // input contect is overwrited. - -// NOTE(dzhwinter): -// Only for backward compacity and stable. if enable_inplace_whitelist is turn -// on. -// only the ops in whitelist will be use inplace strategy. -// if not, all the op will be inplaced if it registered with InplaceClass -DEFINE_bool( - enable_inplace_whitelist, false, - "If this option turns on, only these op in whitelist can be inplaced." - "If it turns off, all of the running op can be candidate of inplaced op." 
- "Such as scale, elementwise_add" - "By default, it's turned off"); - -namespace paddle { -namespace framework { -namespace ir { - -// clang-format off -const std::string kInplacedOpWhiteList[] = { // NOLINT - "sigmoid", - "exp", - "relu", - "tanh", - "sqrt", - "ceil", - "floor", - "reciprocal", - "relu6", - "soft_relu", - "hard_sigmoid", - "batch_norm", - "batch_norm_grad", - "sum", - "sum_grad", - "scale", - "reshape", - "elementwise_add", - "elementwise_add_grad", -}; - -// FIXME(zjl): Shapes of in-out of some ops are exactly the same, -// but the static size during compiling time would be wrong. -// Use a flag to indicate such ops. Please fix me when found a better way. -static const std::unordered_set kSameShapeOpWhiteSet{ // NOLINT - "reshape2", "reshape2_grad" -}; -// clang-format on - -class InplacePass : public ir::Pass { - public: - InplacePass(); - - protected: - void ApplyImpl(ir::Graph *graph) const override; - - private: - // Collect vars that cannot be reused - // e.g.: subblock ops in/out, distributed ops in/out, op_role_var - void CollectSkipVars(ir::Graph *graph, - const std::vector &ops) const; - - // Check whether var_name should be skipped - bool IsSkipVar(const std::string &var_name) const; - - // Rename out with name of in, and guarantee that the graph is - // still a SSA graph - void RenameInOut(ir::Node *op, ir::Node *in, ir::Node *out) const; - - // Check whether var is the last version one in SSA graph - bool IsLastVersionVar(ir::Node *var) const; - - // Check whether var is the first version one in SSA graph - bool IsFirstVersionVar(ir::Node *var) const; - - // Check whether all `ops` is the preceding ops of `op` - bool CheckOpDeps(ir::Node *op, const std::vector &ops) const; - - // Find nodes whose names are equal to the given name - static std::unordered_set FindNodesByName( - const std::string &name, const std::vector &nodes); - - // Collect inputs and outputs of op_desc - static void CollectInputArgsOfOpDesc( - const OpDesc 
*op_desc, std::unordered_multiset *in_args); - - // Get all versions vars named var_name - std::vector *AllVersionVars(const std::string &var_name) const; - - private: - // SSA graph. var_name -> each version of vars - mutable std::map> ssa_map_; - - // Skip vars, including subblock ops in/out, distributed ops in/out, - // op_role_var - mutable std::unordered_set skip_vars_; - - // Op whitelist which should not peform inplace - // Only enabled when FLAGS_enable_inplace_whitelist is true. - mutable std::unordered_set whitelist_ops_; -}; - -InplacePass::InplacePass() { - if (FLAGS_enable_inplace_whitelist) { - for (auto &s : kInplacedOpWhiteList) { - whitelist_ops_.emplace(s); - } - } -} - -std::vector *InplacePass::AllVersionVars( - const std::string &var_name) const { - auto iter = ssa_map_.find(var_name); - PADDLE_ENFORCE(iter != ssa_map_.end(), "cannot find var %s in ssa graph", - var_name); - PADDLE_ENFORCE(!iter->second.empty(), "var %s is empty in ssa graph", - var_name); - return &(iter->second); -} - -bool InplacePass::IsSkipVar(const std::string &var_name) const { - return skip_vars_.count(var_name) > 0; -} - -bool InplacePass::IsFirstVersionVar(ir::Node *var) const { - return AllVersionVars(var->Name())->front() == var; -} - -bool InplacePass::IsLastVersionVar(ir::Node *var) const { - return AllVersionVars(var->Name())->back() == var; -} - -bool InplacePass::CheckOpDeps(ir::Node *op, - const std::vector &ops) const { - std::unordered_set other_ops(ops.begin(), ops.end()); - other_ops.erase(op); - if (other_ops.empty()) return true; - - // Traverse all preceding ops of op - std::queue queue; - std::unordered_set visited_ops; - queue.push(op); - visited_ops.insert(op); - - // Visit all preceding ops of `op`, and erase it from other_ops if it is - // inside other_ops. Return true only if other_ops is empty(), which means - // that all `ops` are preceding ops of `op`. 
- while (!queue.empty()) { - auto *cur_op = queue.front(); - queue.pop(); - - for (auto *in_var : cur_op->inputs) { - for (auto *in_op : in_var->inputs) { - if (visited_ops.count(in_op) != 0) { - continue; - } - - visited_ops.insert(in_op); - queue.push(in_op); - other_ops.erase(in_op); - if (other_ops.empty()) return true; - } - } - } - return false; -} - -void InplacePass::CollectSkipVars(ir::Graph *graph, - const std::vector &ops) const { - // 1. Collect op role vars - PADDLE_ENFORCE(graph->Has(kMemOptSkipVars), "Graph should have attr %s", - kMemOptSkipVars); - auto &mem_opt_whitelist = graph->Get(kMemOptSkipVars); - for (const auto &var : mem_opt_whitelist) { - skip_vars_.emplace(var); - } -} - -void InplacePass::RenameInOut(ir::Node *op, ir::Node *in_var, - ir::Node *out_var) const { - auto out_var_name = out_var->Name(); - auto in_var_name = in_var->Name(); - - auto &all_out_nodes = *AllVersionVars(out_var_name); - auto &all_in_nodes = *AllVersionVars(in_var_name); - - auto iter = std::find(all_out_nodes.begin(), all_out_nodes.end(), out_var); - PADDLE_ENFORCE(iter != all_out_nodes.end(), "Cannot find out var %s", - out_var_name); - - // The following codes are designed to guarantee that ssa_map_ is still - // an ssa graph after inplace is performed. - // Step 1: Rename the following versions of out_var as the name of in_var - // Step 2: Remove the following versions of out_var and append them to in_var - // Be careful that the inputs of input op of out_var should not be renamed, - // but outputs should be renamed. 
- auto original_iter = iter; - while (iter != all_out_nodes.end()) { - auto *node = *iter; - /* Step 1 */ - node->RenameVar(in_var_name); - if (iter != original_iter) { - for (auto *in : node->inputs) { - if (in->IsOp() && in->Op()) { - in->Op()->RenameOutput(out_var_name, in_var_name); - in->Op()->RenameInput(out_var_name, in_var_name); - in->Op()->Flush(); - } - } - } - - for (auto *out : node->outputs) { - if (out->IsOp() && out->Op()) { - out->Op()->RenameOutput(out_var_name, in_var_name); - out->Op()->RenameInput(out_var_name, in_var_name); - out->Op()->Flush(); - } - } - - /* Step 2 */ - all_in_nodes.emplace_back(node); - ++iter; - } - - /* Step 2 */ - all_out_nodes.erase(original_iter, all_out_nodes.end()); - - if (all_out_nodes.empty()) { - ssa_map_.erase(out_var_name); - } - op->Op()->RenameOutput(out_var_name, in_var_name); - op->Op()->Flush(); -} - -std::unordered_set InplacePass::FindNodesByName( - const std::string &name, const std::vector &nodes) { - std::unordered_set ret; - for (auto *node : nodes) { - if (node->Name() == name) { - ret.insert(node); - } - } - return ret; -} - -void InplacePass::CollectInputArgsOfOpDesc( - const OpDesc *op_desc, std::unordered_multiset *in_args) { - in_args->clear(); - for (auto &in_name : op_desc->InputArgumentNames()) { - in_args->insert(in_name); - } -} - -void InplacePass::ApplyImpl(ir::Graph *graph) const { - // Step 1: topo sort ops, collect skip vars - auto ops = ir::TopologySortOperations(*graph); - CollectSkipVars(graph, ops); - - // Step 2: build ssa var map - for (auto *op_node : ops) { - for (auto *in : op_node->inputs) { - PADDLE_ENFORCE(in->IsVar()); - // Only create a new var node when var first occurs in input of op. - if (ssa_map_.count(in->Name()) == 0) { - ssa_map_[in->Name()].emplace_back(in); - } - } - - // Always create a new var node for each output of op. 
- for (auto *out : op_node->outputs) { - PADDLE_ENFORCE(out->IsVar()); - ssa_map_[out->Name()].emplace_back(out); - } - } - - // Step 3: traverse ops and try inplace if possible - bool use_cuda = Get(kUseCuda); - VLOG(4) << "Inplace pass is applied when use_cuda = " - << (use_cuda ? "true" : "false"); - - for (auto *op_node : ops) { - PADDLE_ENFORCE_NOT_NULL(op_node->Op(), "op_desc is nullptr"); - - auto *op_desc = op_node->Op(); - auto op_type = op_desc->Type(); - - // Skip op inside whitelist - if (whitelist_ops_.count(op_type) > 0) { - continue; - } - - auto &infer_inplace = OpInfoMap::Instance().Get(op_type).infer_inplace_; - - if (!infer_inplace) { - continue; - } - - auto in_to_outs = infer_inplace(*op_desc, use_cuda); - if (in_to_outs.empty()) continue; - - std::unordered_multiset all_in_args; - CollectInputArgsOfOpDesc(op_desc, &all_in_args); - - for (auto &pair : in_to_outs) { - auto &in_param = pair.first; - auto &out_param = pair.second; - - auto &in_args = op_desc->Input(in_param); - auto &out_args = op_desc->Output(out_param); - - if (in_args.empty()) { - VLOG(4) << "Cannot inplace because Input(" << in_param - << ") is empty in " << op_type; - continue; - } - - if (out_args.empty()) { - VLOG(4) << "Cannot inplace because Output(" << out_param - << ") is empty in " << op_type; - continue; - } - - auto &in_arg = in_args[0]; - auto &out_arg = out_args[0]; - - if (IsSkipVar(in_arg)) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is skipped in " << op_type; - continue; - } - - if (IsSkipVar(out_arg)) { - VLOG(4) << "Cannot inplace because Output(" << out_param - << ")=" << out_arg << " is skipped in " << op_type; - continue; - } - - if (in_arg == out_arg) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is the same with Output(" << out_param << ")=" << out_arg - << " in " << op_type; - continue; - } - - size_t in_arg_occur_times = all_in_args.count(in_arg); - if (in_arg_occur_times > 
1) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " occurs " << in_arg_occur_times << " times in input of op " - << op_type; - continue; - } - - auto in_nodes = FindNodesByName(in_arg, op_node->inputs); - PADDLE_ENFORCE(!in_nodes.empty(), "Input(%s)=%s cannot be found in op %s", - in_param, in_arg, op_type); - - if (in_nodes.size() > 1) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " occurs in other inputs of " << op_type; - continue; - } - - auto *in_node = *in_nodes.begin(); - - if (!NodeCanReused(in_node)) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is not reusable in " << op_type; - continue; - } - - if (!IsLastVersionVar(in_node)) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is not the last version in " << op_type; - continue; - } - - // If in_node is used as inputs of many ops, check whether all of that ops - // depends on op_node. If not, in_node cannot be inplaced. 
- if (in_node->outputs.size() > 1 && - !CheckOpDeps(op_node, in_node->outputs)) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is not lastly used in " << op_type; - continue; - } - - auto out_nodes = FindNodesByName(out_arg, op_node->outputs); - PADDLE_ENFORCE(!out_nodes.empty(), - "Output(%s)=%s cannot be found in op %s", out_param, - out_arg, op_type); - - PADDLE_ENFORCE_EQ( - out_nodes.size(), 1, - "Wrong graph: Output(%s)=%s occurs in other outputs of op %s", - out_param, out_arg, op_type); - - if (!FindNodesByName(in_arg, op_node->outputs).empty()) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " occurs in output of op " << op_type; - continue; - } - - if (!FindNodesByName(out_arg, op_node->inputs).empty()) { - VLOG(4) << "Cannot inplace because Output(" << out_param - << ")=" << out_arg << " occurs in input of op " << op_type; - continue; - } - - auto *out_node = *out_nodes.begin(); - - if (!IsFirstVersionVar(out_node)) { - VLOG(4) << "Cannot inplace because Output(" << out_param - << ")=" << out_arg << " does not occur first in op " << op_type; - continue; - } - - if (!NodeCanReused(out_node)) { - VLOG(4) << "Cannot inplace because Output(" << out_param - << ")=" << out_arg << " is not reusable in " << op_type; - continue; - } - - if (in_node->Var()->GetType() != out_node->Var()->GetType()) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is not the same type with " - << "Output(" << out_param << ")=" << out_arg << " in " - << op_type; - continue; - } - - if (NodeSize(*in_node->Var()) != NodeSize(*out_node->Var()) && - kSameShapeOpWhiteSet.count(op_desc->Type()) == 0) { - VLOG(4) << "Cannot inplace because Input(" << in_param << ")=" << in_arg - << " is not the same size with " - << "Output(" << out_param << ")=" << out_arg << " in " - << op_type; - continue; - } - - VLOG(4) << "Rename " << out_node->Name() << " with " << in_node->Name() - << " in " << 
op_type; - RenameInOut(op_node, in_node, out_node); - } - } -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(inplace_pass, paddle::framework::ir::InplacePass) - .RequirePassAttr(paddle::framework::ir::kUseCuda); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc deleted file mode 100644 index 0437de68687d8dc9eee3249ee438f2d907f8fe40..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.cc +++ /dev/null @@ -1,569 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/var_desc.h" -#include "paddle/fluid/platform/cpu_info.h" - -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/gpu_info.h" -#endif // PADDLE_WITH_CUDA - -namespace paddle { -namespace framework { -namespace ir { -using paddle::framework::VarDesc; - -std::vector SortOpLikeDescOrder(const ir::Graph& graph) { - PADDLE_ENFORCE(graph.Has(details::kStaleProgramOpDescs), - "Graph has no attribute of kStaleProgramOpDescs."); - // 1. 
get op desc order - auto& op_descs = - graph.Get>(details::kStaleProgramOpDescs); - - // 2. topology sort order - auto nodes = graph.Nodes(); - std::deque ops; - FilterVariables(nodes, [&](ir::Node* op) { - if (op->IsOp() && op->Op() != nullptr) { - ops.emplace_back(op); - } - }); - std::unordered_map op_deps; - std::list ready_ops; - std::unordered_map> pending_ops; - - for (auto* op : ops) { - std::unordered_set preceding_op; - for (auto* in : op->inputs) { - if (in->inputs.empty()) continue; - PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); - preceding_op.emplace(in->inputs[0]); - pending_ops[in->inputs[0]].emplace(op); - } - op_deps[op] = preceding_op.size(); - if (preceding_op.empty()) { - ready_ops.emplace_back(op); - } - } - - // 3. generated op list based desc order and the topology order - std::vector ret; - std::list op_descs_list(op_descs.begin(), op_descs.end()); - - auto update_by_found_node = [&](ir::Node* found_node) { - for (auto* pending_op : pending_ops[found_node]) { - if (--op_deps[pending_op] == 0) { - ready_ops.emplace_back(pending_op); - } - } - ready_ops.remove(found_node); - ret.emplace_back(found_node); - }; - - while (!ready_ops.empty()) { - bool all_of_ready_op_unmatched = true; - for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { - auto op_desc = *it; - ir::Node* found_node = nullptr; - for (auto* op : ready_ops) { - if (IsSameDesc(op->Op(), op_desc)) { - found_node = op; - break; - } - } - - // 3.1 op desc deleted by other pass - if (found_node == nullptr) { - ++it; - continue; - } else { - all_of_ready_op_unmatched = false; - it = op_descs_list.erase(it); - } - update_by_found_node(found_node); - } - - // 3.2 op descs are added by other pass - // preceding op non empty means some new op descs are - // created, but not contained in return node list. - // these new op desc may depend on each other. 
- std::list prev_ready_ops(ready_ops); - if (all_of_ready_op_unmatched) { - for (auto op : prev_ready_ops) { - update_by_found_node(op); - } - } - } - - PADDLE_ENFORCE(std::all_of( - op_deps.begin(), op_deps.end(), - [&](const std::pair& p) { return p.second == 0; })); - - return ret; -} - -size_t NodeSize(const VarDesc& node) { - auto shape = node.GetShape(); - int size = - std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - size_t type_size = SizeOfType(node.GetDataType()); - return type_size * std::abs(size); -} - -size_t NodeSize(ir::Node* n) { return NodeSize(*(n->Var())); } - -std::string DebugStringImpl(VarDesc* var) { - std::stringstream ss; - ss << var->Name(); - ss << "["; - try { - auto shape = var->GetShape(); - for (size_t i = 0; i < shape.size(); ++i) { - if (i != shape.size() - 1) { - ss << shape[i] << ","; - } else { - ss << shape[i]; - } - } - ss << "]"; - } catch (...) { - ss << "Var has no VarDesc !!! Name:" << var->Name(); - } - return ss.str(); -} - -std::string DebugString(ir::Node* var) { - return DebugStringImpl(GetVarDesc(var)); -} - -// NOTE(dzh): based ir node, if a large node has been reused -// by a small size node, then next time it appear in pool, it will -// have the small size. Find the original node shap from blockdesc. 
-VarDesc* GetVarDesc(ir::Node* n) { - PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1); - return n->Var(); -} - -struct NodeComparator { - bool operator()(ir::Node* lhs, ir::Node* rhs) const { - if (lhs->Var()->GetType() != rhs->Var()->GetType()) return false; - auto* lhs_desc = GetVarDesc(lhs); - auto* rhs_desc = GetVarDesc(rhs); - // match data type - if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) { - return false; - } - // match shape - auto lhs_shape = lhs_desc->GetShape(); - auto rhs_shape = rhs_desc->GetShape(); - if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || - (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { - return NodeSize(lhs) == NodeSize(rhs); - } else { - return false; - } - } -}; - -void OrderedSet::Insert(ir::Node* var) { - PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); - if (mark_table_.count(var->Name()) != 0) { - mark_table_[var->Name()]->emplace_back(var); - return; - } - - auto* var_desc = var->Var(); - auto var_shape = var_desc->GetShape(); - int batch_size = static_cast(var_shape[0]); - - NodeComparator functor; - Iter it = nodes_.begin(); - while (it != nodes_.end()) { - auto& prev = it->front(); - auto* cache_desc = GetVarDesc(prev); - int cache_batch_size = cache_desc->GetShape()[0]; - if ((cache_batch_size == -1 && batch_size == -1) || - (cache_batch_size != -1 && batch_size != -1)) { - if (functor(prev, var)) { - ++it; - } else { - break; - } - } else if (cache_batch_size == -1 && batch_size != -1) { - ++it; - } else if (cache_batch_size != -1 && batch_size == -1) { - break; - } - } - - it = nodes_.insert(it, {var}); - mark_table_[var->Name()] = it; -} - -int OrderedSet::GetNodeIndexInPool(ir::Node* var) { - return std::distance(nodes_.begin(), mark_table_[var->Name()]); -} - -ir::Node* OrderedSet::FindBestFitNode(ir::Node* var) const { - ir::Node* found_node = nullptr; - NodeComparator functor; - - for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { - auto& candidate = it->front(); - if 
(functor(var, candidate)) { - found_node = candidate; - break; - } - } - return found_node; -} - -ir::Node* OrderedSet::FindNextBestFitNode(ir::Node* var, ir::Node* prev) const { - ir::Node* found_node = nullptr; - NodeComparator functor; - auto it = - std::find_if(nodes_.begin(), nodes_.end(), [&](const NodeVector& v) { - if (v.front() == prev) - return true; - else - return false; - }); - PADDLE_ENFORCE(it != nodes_.end(), "Not found previous in node list!"); - for (it = std::next(it); it != nodes_.end(); ++it) { - auto& candidate = it->front(); - if (functor(var, candidate)) { - found_node = candidate; - break; - } - } - return found_node; -} - -bool OrderedSet::Has(ir::Node* var) const { - if (mark_table_.count(var->Name())) { - auto& node_in_samename = mark_table_.at(var->Name()); - auto iter = - std::find_if(node_in_samename->begin(), node_in_samename->end(), - [&](ir::Node* n) { return n->Name() == var->Name(); }); - return iter != node_in_samename->end(); - } - return false; -} - -void OrderedSet::Erase(const std::string& var) { - PADDLE_ENFORCE(mark_table_.count(var)); - nodes_.erase(mark_table_[var]); - mark_table_.erase(var); -} - -void OrderedSet::Erase(ir::Node* var) { - PADDLE_ENFORCE(var != nullptr); - Erase(var->Name()); -} - -std::string OrderedSet::ToString() const { - std::stringstream ss; - for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { - for (auto& node : *it) { - ss << DebugString(node) << " "; - } - } - return ss.str(); -} - -bool NodeCanReused(ir::Node* node) { - // valid the node is a var node - // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad - if (node == nullptr || !node->IsVar() || node->IsCtrlVar() || - node->Name() == kEmptyVarName) - return false; - - bool flag = true; - // op output force generated in cpu, can not be reused. 
- for (auto* op : node->inputs) { - if (op->Op()->HasAttr("force_cpu")) { - flag &= framework::AttrReader(op->Op()->GetAttrMap()) - .Get("force_cpu") == 0; - } - } - // var desc validation. - flag &= NodeCanReused(*node->Var()); - return flag; -} - -int MinChunkSize() { - int size{0}; -#ifdef PADDLE_WITH_CUDA - size = platform::GpuMinChunkSize(); -#else - size = platform::CpuMinChunkSize(); -#endif // PADDLE_WITH_CUDA - return size; -} - -bool NodeCanReused(const VarDesc& node) { - auto type = node.GetType(); - // only these types holds bulk of gpu memory - // FIXME(liuwei1031) did not find good ways to test SELECTED_ROWS and - // LOD_TENSOR_ARRAY re-use logic, - // disable them in version 1.4 - // if (!(type == proto::VarType::LOD_TENSOR || - // type == proto::VarType::SELECTED_ROWS || - // type == proto::VarType::LOD_TENSOR_ARRAY)) { - // return false; - // } - if (type != proto::VarType::LOD_TENSOR) return false; - - // persistable variable is parameter - if (node.Persistable()) { - return false; - } - // shape < min_chunk_size is meaningless. - // further more, fetched loss always has size = 1 - // which should not be reused. - auto shape = node.GetShape(); - int size = std::abs( - std::accumulate(shape.begin(), shape.end(), 1, std::multiplies())); - if (shape.empty() || size < MinChunkSize()) { - return false; - } - return true; -} - -bool OpHasSubBlock(OpDesc* desc) { - const AttributeMap& attrs = desc->GetAttrMap(); - for (auto& attr : attrs) { - if (attr.second.type() == typeid(BlockDesc*) || // NOLINT - attr.second.type() == typeid(std::vector)) // NOLINT - return true; - } - return false; -} - -ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { - ops_ = SortOpLikeDescOrder(graph); - ConnectNodes(); -} - -void ControlFlowGraph::BuildCFGGraph() { - // FIXME(dzh): same effect with ConnectNodes, but use the control - // link to build dependency graph, it goes wrong in transformer. 
- for (ir::Node* op : ops_) { - for (auto& input_var : op->inputs) { - if (!input_var->inputs.empty()) { - PADDLE_ENFORCE( - input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), - "Preceding Op Node of Var Node must be unique"); - auto* pred_op = input_var->inputs[0]; - if (pred_op->Op() != nullptr) { - predecessors_[op].insert(pred_op); - successors_[pred_op].insert(op); - } - } - if (input_var->IsVar() && !input_var->IsCtrlVar()) { - uses_[op].insert(input_var->Name()); - } - } - for (auto& output_var : op->outputs) { - // output var may be used by many op - for (auto* succ_op : output_var->outputs) { - if (succ_op->Op() != nullptr) { - successors_[op].insert(succ_op); - predecessors_[succ_op].insert(op); - } - } - if (output_var->IsVar() && !output_var->IsCtrlVar()) { - defs_[op].insert(output_var->Name()); - } - } - } -} - -void ControlFlowGraph::ConnectNodes() { - for (size_t i = 0; i < ops_.size(); ++i) { - auto& op = ops_[i]; - try { - auto& next_op = ops_.at(i + 1); - successors_[op].insert(next_op); - predecessors_[next_op].insert(op); - } catch (...) { - // do nothing - } - - FilterVariables(op->inputs, - [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); - - FilterVariables(op->outputs, - [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); - } -} - -void ControlFlowGraph::LiveVariableAnalysis() { - // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) - // compute the liveness of for each variable though reversed_ops algorithm. - // It iterates the operators from end to begin, compute the live in/live out - // variable set for each op, then the diff between in/out will be used for - // the variable reuse. For detail refer to - // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf - std::list work_list(ops_.rbegin(), ops_.rend()); - while (!work_list.empty()) { - ir::Node* op = work_list.front(); - work_list.pop_front(); - // get the live_in calculated before. Empty if first. 
- auto prev_live_in = std::move(live_in_[op]); - for (auto& s : successors_[op]) { - for (auto& var : live_in_[s]) { - live_out_[op].insert(var); - } - } - for (auto& var : uses_[op]) { - live_in_[op].insert(var); - } - for (auto& var : live_out_[op]) { - live_in_[op].insert(var); - } - for (auto& var : defs_[op]) { - if (uses_[op].count(var)) continue; - live_in_[op].erase(var); - } - - // If the live_in is not changed, then the liveness analysis of - // predecessors is completed. - // - // Otherwise, recalculate the predecessors liveness - if (live_in_[op] != prev_live_in) { - for (auto& pre : predecessors_[op]) { - work_list.push_back(pre); - } - } - } - - for (auto* op : ops_) { - unlived_vars_[op] = std::set(); - for (auto& var : this->LiveIn(op)) { - if (!this->LiveOut(op).count(var)) { - unlived_vars_[op].insert(var); - } - } - } -} - -void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, - const std::string& new_node, - int begin_idx) { - std::vector need_update(ops_.size(), false); - // update graph from begin idx to the end - for (size_t i = begin_idx; i != ops_.size(); ++i) { - auto* op = ops_[i]; - if (uses_[op].find(old_node) != uses_[op].end()) { - uses_[op].erase(old_node); - uses_[op].insert(new_node); - } - if (defs_[op].find(old_node) != defs_[op].end()) { - defs_[op].erase(old_node); - defs_[op].insert(new_node); - } - if (live_in_[op].find(old_node) != live_in_[op].end()) { - live_in_[op].erase(old_node); - live_in_[op].insert(new_node); - need_update[i] = true; - } - if (live_out_[op].find(old_node) != live_out_[op].end()) { - live_out_[op].erase(old_node); - live_out_[op].insert(new_node); - need_update[i] = true; - } - } - - for (size_t i = begin_idx; i < ops_.size(); ++i) { - if (!need_update[i]) continue; - auto* op = ops_[i]; - for (auto& var : this->LiveIn(op)) { - if (!this->LiveOut(op).count(var)) { - unlived_vars_[op].insert(var); - } - } - } -} - -const std::set& ControlFlowGraph::LiveIn(ir::Node* op) const { - auto 
it = live_in_.find(op); - PADDLE_ENFORCE( - it != live_in_.end(), - string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); - return it->second; -} - -const std::set& ControlFlowGraph::LiveOut(ir::Node* op) const { - auto it = live_out_.find(op); - PADDLE_ENFORCE( - it != live_out_.end(), - string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); - return it->second; -} - -const std::set& ControlFlowGraph::Use(ir::Node* op) const { - auto it = uses_.find(op); - PADDLE_ENFORCE( - it != uses_.end(), - string::Sprintf("Expect %s in use, but Not Found.", op->Name())); - return it->second; -} - -const std::set& ControlFlowGraph::Unlived(ir::Node* op) const { - auto it = unlived_vars_.find(op); - PADDLE_ENFORCE( - it != unlived_vars_.end(), - string::Sprintf("Expect %s in unlived_set, but Not Found.", op->Name())); - return it->second; - return it->second; -} - -const std::vector& ControlFlowGraph::Ops() const { return ops_; } - -std::vector& ControlFlowGraph::Ops() { return ops_; } - -ir::Node* ControlFlowGraph::GetNodeByName(const std::string& name, - ir::Node* op) const { - // in ssa-graph, different version nodes have same name, - // this function get the latest version var before target op - // It may return nullptr, such as data node. 
- ir::Node* found_node = nullptr; - for (auto* node : ops_) { - if (node == op) break; - for (auto& output : node->outputs) { - PADDLE_ENFORCE((output != nullptr && output->IsVar()), - "Output is empty!"); - if (output->Var() && output->Name() == name) { - found_node = output; - } - } - } - return found_node; -} - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h deleted file mode 100644 index b3e2c2b1e9e420a64bb072a238bfe074d4a015db..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h +++ /dev/null @@ -1,187 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/ir/graph.h" - -namespace paddle { -namespace framework { -namespace ir { - -/// this attribute is used to avoid some core variables removed/reused -/// in memory optimize related passes -constexpr char kMemOptSkipVars[] = "@MEM_OPT_SKIP_VARS@"; -typedef std::unordered_set MemOptSkipVars; - -std::vector SortOpLikeDescOrder(const ir::Graph& graph); - -// NOTE(dzh): A ordered set for node reuse in memory optimize. -// the orderedset sort node in ascend order(by node bytes size). -// in fluid, -1 means the batch_size, which is determined in runtime. -// So the reuse happens between nodes who's batch_size both are -1 -// simultaneously or not. -// -// sort rule: -// rule 0 : smaller node ranking in front. -// rule 1 : batch_size equal -1 ranking in the front than the node not. -// -// For example, -// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. - -class OrderedSet { - public: - // nodes with same name exists in pool. - using NodeVector = std::vector; - using Iter = typename std::list::iterator; - using ConstIter = typename std::list::const_iterator; - - void Insert(ir::Node* var); - void Erase(ir::Node* var); - void Erase(const std::string& var); - bool Has(ir::Node* var) const; - void Clear() { - mark_table_.clear(); - nodes_.clear(); - } - // find the bestfit shape node block with var. 
- ir::Node* FindBestFitNode(ir::Node* var) const; - ir::Node* FindNextBestFitNode(ir::Node* var, ir::Node* prev) const; - // map store non-const iterator, can not promise const - int GetNodeIndexInPool(ir::Node* var); - // pool all node to string - std::string ToString() const; - - Iter begin() { return nodes_.begin(); } - Iter end() { return nodes_.end(); } - ConstIter begin() const { return nodes_.begin(); } - ConstIter end() const { return nodes_.end(); } - - size_t size() const { return nodes_.size(); } - - private: - // for searching. - std::unordered_map mark_table_; - // node pool - std::list nodes_; -}; - -class ControlFlowGraph { - public: - ControlFlowGraph() = default; - // IR Graph - explicit ControlFlowGraph(const ir::Graph& graph); - - void LiveVariableAnalysis(); - - void RenameVarInCFGGraph(const std::string& old_node, - const std::string& new_node, int begin_idx); - - const std::set& LiveIn(ir::Node* op) const; - const std::set& LiveOut(ir::Node* op) const; - const std::set& Use(ir::Node* op) const; - const std::set& Unlived(ir::Node* op) const; - const std::vector& Ops() const; - std::vector& Ops(); - - // for ssa-graph nodes - ir::Node* GetNodeByName(const std::string& name, ir::Node* op) const; - - private: - void BuildCFGGraph(); - void ConnectNodes(); - - using NodeListMap = std::unordered_map>; - using VarSetMap = std::map>; - // successors ops use the output variables. - NodeListMap successors_; - // predecessors ops generated input variables. - NodeListMap predecessors_; - // variables lived before run current op. - VarSetMap live_in_; - // variables lived after run current op. - VarSetMap live_out_; - VarSetMap uses_; // op inputs - VarSetMap defs_; // op outputs - std::unordered_map> unlived_vars_; - - std::vector ops_; // op sequence by topology sort -}; - -// valid a tensor can be reuse or not -bool NodeCanReused(ir::Node* node); - -// valid a tensor can be reuse or not. 
-bool NodeCanReused(const VarDesc& node); - -// check op has subblock or not -bool OpHasSubBlock(OpDesc* desc); - -// node memory size in bytes -size_t NodeSize(ir::Node* n); - -// node memory size in bytes -size_t NodeSize(const VarDesc&); - -std::string DebugString(ir::Node* var); - -VarDesc* GetVarDesc(ir::Node* n); - -static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { - return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && - op1->Outputs() == op2->Outputs(); -} - -template -class FilterVariableImpl { - public: - void operator()(const Container& nodes, Callback callback) { - for (auto* node : nodes) { - callback(node); - } - } -}; - -// filter var node for op->inputs/outputs -template -class FilterVariableImpl, Callback> { - public: - void operator()(const std::vector& nodes, Callback callback) { - for (auto* var : nodes) { - if (var->IsVar() && !var->IsCtrlVar()) { - callback(var); - } - } - } -}; - -template -void FilterVariables(const Container& nodes, Callback callback) { - FilterVariableImpl()(nodes, callback); -} - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper_test.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper_test.cc deleted file mode 100644 index d38facd01950936c5ee7fb337ddce89d1bfd7209..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper_test.cc +++ /dev/null @@ -1,525 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include "glog/logging.h" -#include "gtest/gtest.h" -#include "paddle/fluid/framework/details/graph_test_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" - -namespace paddle { -namespace framework { -namespace ir { - -TEST(OrderedSet, Normal) { - OrderedSet pool; - std::vector> nodes; - - // clang-format off - std::vector> shapes = {{-1, 10}, - {-1, 20}, - {1, 2}, - {5, 2}, - {10, 20}, - {-1, 2, 5}, - {-1, 1, 5}, - {-1, 1}}; - // clang-format on - const int COUNT = shapes.size(); - ProgramDesc prog; - BlockDesc* block_desc = prog.MutableBlock(0); - auto* op_desc = block_desc->AppendOp(); - op_desc->SetType("dummy"); - std::unique_ptr op = ir::CreateNodeForTest(op_desc); - - for (int i = 0; i < COUNT; ++i) { - auto desc = block_desc->Var(std::to_string(i)); - desc->SetShape(shapes[i]); - std::unique_ptr node = ir::CreateNodeForTest(desc); - node->inputs.emplace_back(op.get()); - nodes.emplace_back(std::move(node)); - } - - // Insert - for (auto& node : nodes) { - pool.Insert(node.get()); - } - - // Has/size - ASSERT_EQ(pool.size(), shapes.size()); - for (auto& node : nodes) { - ASSERT_TRUE(pool.Has(node.get())); - } - - // assert its order and interface. 
- std::cout << pool.ToString() << std::endl; - pool.Erase(nodes.front().get()); - std::cout << pool.ToString() << std::endl; - - ASSERT_EQ(pool.size(), static_cast(COUNT - 1)); - ASSERT_EQ(pool.GetNodeIndexInPool(nodes.back().get()), 0); - - { - auto v1 = block_desc->Var("11"); - v1->SetShape({-1, 256, 56, 56}); - std::unique_ptr node1 = ir::CreateNodeForTest(v1); - node1->inputs.emplace_back(op.get()); - auto* cache = pool.FindBestFitNode(node1.get()); - ASSERT_EQ(cache, nullptr); - } - { - auto v2 = block_desc->Var("12"); - v2->SetShape({-1, 2, 5}); - std::unique_ptr node1 = ir::CreateNodeForTest(v2); - node1->inputs.emplace_back(op.get()); - auto* cache = pool.FindBestFitNode(node1.get()); - ASSERT_EQ(pool.GetNodeIndexInPool(cache), 2); // match 6:[-1,2,5] - } - { - auto v3 = block_desc->Var("13"); - v3->SetShape({2, 5}); - std::unique_ptr node1 = ir::CreateNodeForTest(v3); - node1->inputs.emplace_back(op.get()); - auto* cache = pool.FindBestFitNode(node1.get()); - ASSERT_EQ(pool.GetNodeIndexInPool(cache), 5); // match 4:[5,2] - } -} - -TEST(OrderedSet, FindBestFitNode) { - OrderedSet pool; - std::vector> nodes; - ProgramDesc prog; - BlockDesc* block_desc = prog.MutableBlock(0); - auto* op_desc = block_desc->AppendOp(); - op_desc->SetType("dummy"); - std::unique_ptr op = ir::CreateNodeForTest(op_desc); - - { - auto desc = block_desc->Var("a"); - desc->SetShape({128, 128}); - std::unique_ptr node = ir::CreateNodeForTest(desc); - node->inputs.emplace_back(op.get()); - nodes.emplace_back(std::move(node)); - } - { - auto desc = block_desc->Var("b"); - desc->SetShape({128, 129}); - std::unique_ptr node = ir::CreateNodeForTest(desc); - node->inputs.emplace_back(op.get()); - nodes.emplace_back(std::move(node)); - } - { - auto desc = block_desc->Var("c"); - desc->SetShape({128, 128}); - std::unique_ptr node = ir::CreateNodeForTest(desc); - node->inputs.emplace_back(op.get()); - nodes.emplace_back(std::move(node)); - } - - for (auto& node : nodes) { - 
pool.Insert(node.get()); - } - - auto* n = nodes[0].get(); - auto* cache = pool.FindBestFitNode(n); - ASSERT_TRUE(cache->Name() == "a" || cache->Name() == "c"); - auto* cache_b = pool.FindNextBestFitNode(n, cache); - ASSERT_TRUE(cache_b->Name() != cache->Name()); - ASSERT_TRUE(cache_b->Name() == "a" || cache_b->Name() == "c"); - cache = pool.FindNextBestFitNode(n, cache_b); - ASSERT_TRUE(cache == nullptr); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_OPERATOR(sum, paddle::framework::DummyOp, - paddle::framework::SumOpMaker, - paddle::framework::DummyVarTypeInference); -REGISTER_OPERATOR(assign, paddle::framework::DummyOp, - paddle::framework::AssignOpMaker, - paddle::framework::DummyVarTypeInference); -REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, - paddle::framework::SumOpMaker, - paddle::framework::DummyVarTypeInference); -/* - https://en.wikipedia.org/wiki/Live_variable_analysis - Create a customed classical dependency graph, left row is the instruction - number. - 1. a = 1 - 2. b = a - 3. c = a - 4. d = b + c - 5. 
e = d - - a--------+ - | | - b c - | | - d--------+ - | - e - Then analysis these variable's liveness range - */ - -namespace paddle { -namespace framework { -namespace ir { - -inline static ProgramDesc FillProgramDesc() { - ProgramDesc prog; - prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"a"}); - op->SetOutput("Out", {"b"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"a"}); - op->SetOutput("Out", {"c"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d"}); - } - { - auto* op = prog.MutableBlock(0)->AppendOp(); - op->SetType("assign"); - op->SetInput("X", {"d"}); - op->SetOutput("Out", {"e"}); - } - return prog; -} - -TEST(CFGGraph, IRGraph) { - // prepare ir graph - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - - ControlFlowGraph cfg(graph); - cfg.LiveVariableAnalysis(); - - // test assign op - ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); - ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); - - // test assign op - ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); - ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); - - // test sum op - ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); - ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); - - // test assign op - ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); - ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); -} - -// 1. 
normal test -TEST(SortOpLikeDescOrder, NormalTest) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - - auto nodes = SortOpLikeDescOrder(graph); - auto op_descs = prog.Block(0).AllOps(); - for (size_t i = 0; i < nodes.size(); ++i) { - auto node = nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 2. remove some op_desc -TEST(SortOpLikeDescOrder, RemoveOpDesc) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - auto nodes = graph.Nodes(); - auto op_descs = prog.Block(0).AllOps(); - ir::Node* found_node = nullptr; - for (auto node : nodes) { - if (node->IsOp() && node->outputs.back()->Name() == "e") { - found_node = node; - break; - } - } - PADDLE_ENFORCE(found_node != nullptr); - for (auto it = op_descs.begin(); it != op_descs.end();) { - if (IsSameDesc(*it, found_node->Op())) { - it = op_descs.erase(it); - } else { - ++it; - } - } - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - ir::Node* e = find_node_in_graph("e"); - ir::Node* d = find_node_in_graph("d"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - graph.RemoveNode(found_node); - graph.RemoveNode(e); - - // other node keeps the same order - auto remain_nodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < remain_nodes.size(); ++i) { - auto node = remain_nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 3. 
add some op_desc -TEST(SortOpLikeDescOrder, AddOpDesc) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - // cached desc different with real one - // mimic the intermidiete pass modify the programdesc. - std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); - - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - d1->inputs.emplace_back(node); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - op_descs.insert(op_descs.begin() + 4, op); - - auto nodes = SortOpLikeDescOrder(graph); - - for (size_t i = 0; i < nodes.size(); ++i) { - auto node = nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 4. 
add and delete some op_desc -TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); - - // remove sum node - ir::Node* found_node = nullptr; - auto nodes = graph.Nodes(); - for (auto node : nodes) { - if (node->Name() == "sum") { - found_node = node; - break; - } - } - PADDLE_ENFORCE(found_node != nullptr); - for (auto it = op_descs.begin(); it != op_descs.end();) { - if (IsSameDesc(*it, found_node->Op())) { - it = op_descs.erase(it); - } else { - ++it; - } - } - { - ir::Node* d = find_node_in_graph("d"); - ir::Node* c = find_node_in_graph("c"); - ir::Node* e = find_node_in_graph("e"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - std::remove(c->outputs.begin(), c->outputs.end(), found_node); - ir::Node* pending_op = found_node->outputs[0]->outputs[0]; - graph.RemoveNode(e); - graph.RemoveNode(pending_op); - graph.RemoveNode(found_node); - } - - // add node - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - { - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - } - op_descs.insert(op_descs.begin() + 2, op); - - // check the order - auto mynodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < mynodes.size(); ++i) { - auto node = mynodes[i]; - auto 
op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -// 5. add and replace some op_desc inplace. -TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { - auto prog = FillProgramDesc(); - ir::Graph graph(prog); - std::vector op_descs = graph.OriginProgram().Block(0).AllOps(); - - auto find_node_in_graph = [&](std::string s) { - ir::Node* ret = nullptr; - for (auto n : graph.Nodes()) { - if (n->Name() == s) { - ret = n; - break; - } - } - PADDLE_ENFORCE(ret != nullptr); - return ret; - }; - - // add node - auto op = prog.MutableBlock(0)->AppendOp(); - prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); - op->SetType("sum"); - op->SetInput("X", {"b", "c"}); - op->SetOutput("Out", {"d1"}); - { - ir::Node* node = graph.CreateOpNode(op); - ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); - ir::Node* b = find_node_in_graph("b"); - ir::Node* c = find_node_in_graph("c"); - node->outputs.emplace_back(d1); - node->inputs.emplace_back(b); - node->inputs.emplace_back(c); - d1->inputs.emplace_back(node); - b->outputs.emplace_back(node); - c->outputs.emplace_back(node); - } - - op_descs.emplace_back(op); - - // replace op_desc inplace - auto nodes = graph.Nodes(); - ir::Node* found_node = nullptr; - for (auto node : nodes) { - if (node->IsOp() && node->Op() && node->Name() == "assign") { - if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { - found_node = node; - break; - } - } - } - { - ir::Node* d = find_node_in_graph("d"); - ir::Node* e = find_node_in_graph("e"); - std::remove(d->outputs.begin(), d->outputs.end(), found_node); - std::remove(e->inputs.begin(), e->inputs.end(), found_node); - graph.RemoveNode(found_node); - } - op_descs.erase(op_descs.begin() + 3); - - auto replace_op = prog.MutableBlock(0)->AppendOp(); - replace_op->SetType("sum"); - replace_op->SetInput("X", {"d", "d1"}); - replace_op->SetOutput("Out", {"e"}); - { - ir::Node* sum2 = graph.CreateOpNode(replace_op); - ir::Node* e 
= find_node_in_graph("e"); - ir::Node* d = find_node_in_graph("d"); - ir::Node* d1 = find_node_in_graph("d1"); - sum2->inputs.emplace_back(d); - sum2->inputs.emplace_back(d1); - sum2->outputs.emplace_back(e); - e->inputs.emplace_back(sum2); - d->outputs.emplace_back(sum2); - d1->outputs.emplace_back(sum2); - } - - op_descs.emplace_back(replace_op); - // compare op order - auto graph_nodes = SortOpLikeDescOrder(graph); - for (size_t i = 0; i < graph_nodes.size(); ++i) { - auto node = graph_nodes[i]; - auto op_desc = op_descs[i]; - ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); - } -} - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc deleted file mode 100644 index af3fbb2808b0c11a5013800e41f877391a51d368..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.cc +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" - -namespace paddle { -namespace framework { -namespace ir { - -void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const { - CollectSkipVarsSet(graph); - - cfg_.reset(new ControlFlowGraph(*graph)); - cfg_->LiveVariableAnalysis(); - InitSSAGraphNodes(); - - int reuse_id = 0; - for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) { - auto& op = cfg_->Ops()[idx]; - auto* op_desc = op->Op(); - // some op in graph has no op desc - if (op_desc == nullptr) continue; - - for (auto& var : op->outputs) { - if (var->IsVar() && !var->IsCtrlVar() && skip_set_.count(var->Name())) { - VLOG(3) << "Skip set contains variable of " << var->Name() - << "disable reuse on it. skipped"; - continue; - } - if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { - ir::Node* cache = pool_.FindBestFitNode(var); - while (cache != nullptr && var->Name() == cache->Name()) { - VLOG(3) << "The same cache variable is cascade reused. " - << cache->Name() << " is re-filled to the pool after " - << "the reused op is finished. Current op can not " - << "replace it again. Skip this candidate."; - cache = pool_.FindNextBestFitNode(var, cache); - } - - if (cache != nullptr) { - int node_idx_in_pool = pool_.GetNodeIndexInPool(cache); - VLOG(3) << string::Sprintf( - "!!! %s, %s => %s, cache idx %d, pool size %d", - std::to_string(reuse_id++), DebugString(var), DebugString(cache), - node_idx_in_pool, static_cast(pool_.size())); - // NOTE(dzhwinter): update the ProgramDesc/IR Graph - // and the CFG Graph on the fly. - // - // IR Graph define the dependence relationship between nodes. 
- // - // ProgramDesc defines the input/output vars. Its used in - // CreateOp, CreateVar when running happens. - // - // CFG Graph store the liveness information, when reuse happens - // we also need to update the variable liveness. - const std::string var_name = var->Name(); - const std::string cache_name = cache->Name(); - - cfg_->RenameVarInCFGGraph(var_name, cache_name, idx); - RenameVarInGraphDesc(var_name, cache_name, idx); - RenameVarInGraphNode(var_name, cache_name, idx, graph); - pool_.Erase(cache_name); - } - } - } - // fill the pool - for (auto& var : cfg_->Unlived(op)) { - ir::Node* var_node = cfg_->GetNodeByName(var, op); - if (var_node == nullptr || var_node->IsCtrlVar()) continue; - if (NodeCanReused(var_node) && !pool_.Has(var_node)) { - pool_.Insert(var_node); - } - } - } - graph->ResolveHazard(var_nodes_); -} - -void MemoryOptimizePass::CollectSkipVarsSet(ir::Graph* graph) const { - // fill skip_set_ - PADDLE_ENFORCE(graph->Has(kMemOptSkipVars)); - auto& mem_opt_whitelist = graph->Get(kMemOptSkipVars); - for (const auto& var : mem_opt_whitelist) { - skip_set_.emplace(var); - } -} - -void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var, - const std::string& cache_var, - size_t idx) const { - for (size_t i = idx; i < cfg_->Ops().size(); ++i) { - auto* op = cfg_->Ops()[i]; - PADDLE_ENFORCE(op->IsOp() && op->Op()); - auto* op_desc = op->Op(); - op_desc->RenameInput(var, cache_var); - op_desc->RenameOutput(var, cache_var); - if (op_desc->Block() != nullptr) { - op_desc->Block()->RemoveVar(var); - } else { - LOG(WARNING) << "op " << op->Name() << " not know its block." - << "Is the op_desc created without block pointer? 
" - << "Can not find " << var << " in Block(0)"; - } - op_desc->Flush(); - } -} - -void MemoryOptimizePass::InitSSAGraphNodes() const { - std::unordered_map> all_vars; - if (var_nodes_.empty()) { - for (auto* op : cfg_->Ops()) { - for (auto* node : op->inputs) { - if (all_vars[node->Name()].count(node) == 0) { - all_vars[node->Name()].emplace(node); - var_nodes_[node->Name()].emplace_back(node); - } - } - for (auto* node : op->outputs) { - if (all_vars[node->Name()].count(node) == 0) { - all_vars[node->Name()].emplace(node); - var_nodes_[node->Name()].emplace_back(node); - } - } - } - } -} - -void MemoryOptimizePass::RenameVarInGraphNode(const std::string& var, - const std::string& cache_var, - size_t idx, - ir::Graph* graph) const { - // if replace happens, we need to create a newer version cache_var - // but use the same dims/data_type with var. - PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && - var_nodes_[var].at(0)->Var() != nullptr); - std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); - var_desc->SetName(cache_var); - - for (size_t i = idx; i < cfg_->Ops().size(); ++i) { - auto* op = cfg_->Ops()[i]; - - // redirect the input to the latest version of cache_var - for (auto* node : op->inputs) { - if (node->Name() == var) { - ir::Node* cache_node = var_nodes_[cache_var].back(); - - // swap node to cache_node - cache_node->outputs.insert(cache_node->outputs.end(), - node->outputs.begin(), node->outputs.end()); - PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); - auto* prev_op = node->inputs[0]; - std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, - cache_node); - for (auto* next_op : node->outputs) { - std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, - cache_node); - } - - // erase unused node - auto& nodes = var_nodes_.at(var); - nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end()); - graph->RemoveNode(node); - } - } - - // if we need to rename the output, - // always 
create a newer version of cache_var - for (auto* node : op->outputs) { - if (node->Name() == var) { - ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); - var_nodes_[cache_var].emplace_back(cache_node); - - // swap node to cache node - cache_node->outputs.insert(cache_node->outputs.end(), - node->outputs.begin(), node->outputs.end()); - cache_node->inputs.emplace_back(op); - std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); - for (auto* next_op : node->outputs) { - std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, - cache_node); - } - - // erase unused node - auto& nodes = var_nodes_.at(var); - nodes.erase(std::remove(nodes.begin(), nodes.end(), node), nodes.end()); - graph->RemoveNode(node); - } - } - } -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(memory_optimize_pass, paddle::framework::ir::MemoryOptimizePass) - .RequireGraphAttr(paddle::framework::details::kStaleProgramOpDescs); diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h b/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h deleted file mode 100644 index eef289eff138c454631ffbb34d0780b1c14d99dc..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace ir { - -class MemoryOptimizePass : public ir::Pass { - protected: - void ApplyImpl(ir::Graph* graph) const override; - // fill the variable map(var_nodes) by version. - void InitSSAGraphNodes() const; - - private: - // update program descs - void RenameVarInGraphDesc(const std::string& var, - const std::string& cache_var, size_t idx) const; - // update ir nodes - void RenameVarInGraphNode(const std::string& var, - const std::string& cache_var, size_t idx, - ir::Graph* graph) const; - - void SubGraphOptimize(OpDesc* op_desc) const; - // 1. scan op with subblock and collect the output/input vars. - // while, while_grad, conditional_block - // 2. scan distributed ops and collect the output/input vars - // 3. op_role_vars - void CollectSkipVarsSet(ir::Graph* graph) const; - - private: - // Reuse Node Pool, Owned. - mutable OrderedSet pool_; - // controlflow Graph - mutable std::unique_ptr cfg_; - // skip set - mutable std::unordered_set skip_set_; - // var nodes - mutable std::map> var_nodes_; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc deleted file mode 100644 index 040b769f89dd6de6cf3585d1e5f83da8fdb700d3..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/memory_optimize_pass/record_skip_memory_opt_vars_pass.cc +++ /dev/null @@ -1,170 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" -#include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/op_proto_maker.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace framework { -namespace ir { - -class RecordSkipMemoryOptVarsPass : public ir::Pass { - protected: - void ApplyImpl(ir::Graph* graph) const override { - PADDLE_ENFORCE(!graph->Has(kMemOptSkipVars)); - graph->Set(kMemOptSkipVars, new MemOptSkipVars); - auto& skip_vars = graph->Get(kMemOptSkipVars); - - std::vector op_nodes; - for (auto& node : graph->Nodes()) { - PADDLE_ENFORCE_NOT_NULL(node, "The node should not be nullptr."); - if (node->IsOp() && node->Op()) { - op_nodes.emplace_back(node); - } - } - - // Insert kEmptyVarName to avoid optimizing empty variable - skip_vars.insert(framework::kEmptyVarName); - - // NOTE(zcd): Insert OpRoleVars to SkipVarSet to prevent the vars are rename - // in memory optimize pass. 
- InsertOpRoleVarsToSkipVarSet(op_nodes, &skip_vars); - - InsertSkipMemOptOpInOutToSkipVarSet(op_nodes, &skip_vars); - } - - private: - static void InsertOpRoleVarsToSkipVarSet(const std::vector& ops, - MemOptSkipVars* skip_vars) { - for (auto& node : ops) { - try { - auto op_role_vars = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE_EQ(op_role_vars.size() % 2, 0); - for (size_t i = 0; i < op_role_vars.size(); i += 2) { - auto& g_name = op_role_vars[i + 1]; - skip_vars->insert(g_name); - } - } catch (boost::bad_get& e) { - } - } - } - - static void UpdateSkipVarSet( - MemOptSkipVars* skip_vars, - const std::vector>& var_names) { - for (auto& var_name : var_names) { - skip_vars->insert(var_name.begin(), var_name.end()); - } - } - - static std::vector ToGradVarName( - const std::vector& names) { - std::vector ret; - ret.reserve(names.size()); - for (auto& name : names) { - if (name != framework::kEmptyVarName) { - ret.emplace_back(framework::GradVarName(name)); - } - } - return ret; - } - - static void InsertSkipMemOptOpInOutToSkipVarSet( - const std::vector& ops, MemOptSkipVars* skip_vars) { - static std::unordered_set kSkipMemOptOps{ - "send", "recv", "prefetch", "send_barrier", "fetch_barrier"}; - - for (auto& node : ops) { - auto* op_desc = node->Op(); - // Some ops (while, conditional_block, recurrent, etc.) have sub-blocks. - // These ops often use variables from its parent or forward blocks. - // Optimizing in/out of such ops would make these variables cannot - // be found when running sub-block ops. - if (OpHasSubBlock(op_desc)) { - UpdateSkipVarSet(skip_vars, {op_desc->InputArgumentNames(), - op_desc->OutputArgumentNames()}); - } - - // Skip ops that are related to parameter server. - // In distributed mode, trainers and parameter server use same - // variable names to track same variables. 
We cannot change the - // names of these variables, otherwise trainers or parameter - // server would not find them. - if (kSkipMemOptOps.count(op_desc->Type()) > 0) { - UpdateSkipVarSet(skip_vars, {op_desc->InputArgumentNames(), - op_desc->OutputArgumentNames()}); - } - - // FIXME(zjl): some ops use variables that are not from their - // inputs or outputs. We do not have a nice method to solve this - // issue yet. Currently, we should skip these variables when - // memory optimization is enabled. - auto op_type = op_desc->Type(); - if (op_type == "while_grad") { - // In while_grad, framework::GradVarName(Input("X")) is visited - // without being any in/out of while_grad. While_grad uses - // these variable to accumulate gradient of X across time steps. - UpdateSkipVarSet(skip_vars, {ToGradVarName(op_desc->Input("X"))}); - } else if (op_type == "conditional_block_grad") { - // In conditional_block_grad, framework::GradVarName(Input("Input", - // "Cond")) is visited without being any in/out of - // conditional_block_grad. Conditional_block_grad uses these - // variables to accumulate gradient of Input/Cond across time steps. - UpdateSkipVarSet(skip_vars, {ToGradVarName(op_desc->Input("Input")), - ToGradVarName(op_desc->Input("Cond"))}); - } else if (op_type == "recurrent" || op_type == "recurrent_grad") { - // Recurrent and recurrent_grad ops are implemented by a very trickly - // way. Attr("states", "ex_states") is visited without being any - // in/out of op. It is because these variables are from sub blocks, - // not main block. Adding these variables to input would make recurrent - // fail since "states" and "ex_states" cannot be found in main block. - // When memory optimization is enabled, "states", "ex_states" and their - // gradient should be skipped. 
- auto ex_states = - boost::get>(op_desc->GetAttr("ex_states")); - auto states = - boost::get>(op_desc->GetAttr("states")); - if (op_type == "recurrent") { - UpdateSkipVarSet(skip_vars, {ex_states, states}); - } else { - // In recurrent_grad, framework::GradVarName(Input("parameters", - // "input")) is visited without being any in/out of recurrent_grad. - // Recurrent_grad uses these variables to accumulate gradient of - // parameters/input across time steps. - UpdateSkipVarSet( - skip_vars, - {ToGradVarName(op_desc->Input("parameters")), - ToGradVarName(op_desc->Input("inputs")), ex_states, states, - ToGradVarName(ex_states), ToGradVarName(states)}); - } - } - } - } -}; - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(record_skip_memory_opt_vars_pass, - paddle::framework::ir::RecordSkipMemoryOptVarsPass); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc index 3a5333d08d4444e4c3d402c5dc549c40c87e4e99..7de3b7c6054183d9a9cb80e66bee571f29ed68eb 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/sequential_execution_pass.cc @@ -17,7 +17,6 @@ #include #include #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_helper.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/op_proto_maker.h" diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index c1a5774cf49cd3f8e00a4265357a1d8f7f79eced..815042c7419395178b45133b00211646acc82b06 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -252,7 +252,22 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { VLOG(10) << "buffer_shared_inplace_pass 
Applied"; } - if (build_strategy_.memory_optimize_) { + /** + * NOTE(zengjinle): If BuildStrategy.memory_optimize = None in Python, + * set BuildStrategy.memory_optimize according to whether gc is enabled. + * If gc is enabled, BuildStrategy.memory_optimize = False. + * If gc is disabled, BuildStrategy.memory_optimize = True. + * This is because gc+memory_optimize is worse than gc only. + * + * As an option, users can enable BuildStrategy.memory_optimize forcely + * by setting True, and disable it forcely by setting False. + */ + bool is_gc_enabled = (GetEagerDeletionThreshold() >= 0); + if (!build_strategy_.memory_optimize_) { + build_strategy_.memory_optimize_ = !is_gc_enabled; + } + + if (build_strategy_.memory_optimize_.get()) { auto cross_op_memory_reuse_pass = ir::PassRegistry::Instance().Get( "buffer_shared_cross_op_memory_reuse_pass"); cross_op_memory_reuse_pass->SetNotOwned(ir::kMemOptVarInfoMapList, @@ -265,7 +280,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { VLOG(10) << "buffer_shared_cross_op_memory_reuse_pass Applied"; } - if (GetEagerDeletionThreshold() < 0) { + if (!is_gc_enabled) { return graph; } size_t max_memory_size = static_cast(GetEagerDeletionThreshold()); @@ -313,6 +328,9 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { eager_deletion_pass->SetNotOwned(ir::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(graph); VLOG(10) << "EagerDeletionPass Applied"; + LOG(INFO) << "Garbage collection strategy is enabled, when " + << "FLAGS_eager_delete_tensor_gb = " + << (static_cast(GetEagerDeletionThreshold()) / (1 << 30)); } return graph; } diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 633e3259adaaedc92ce2f3420f4d1dbf86387143..71eeaf3b53acf98c9f5e43f9acd6d67d42086005 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -13,7 +13,6 @@ See the License for the specific language governing 
permissions and limitations under the License. */ #include "paddle/fluid/pybind/const_value.h" -#include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimize_pass.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/operator.h" @@ -34,7 +33,6 @@ void BindConstValue(pybind11::module* m) { m->def("kControlDepVarName", [] { return framework::ir::Node::kControlDepVarName; }); m->def("kNewGradSuffix", [] { return framework::kNewGradSuffix; }); - m->def("kMemOptSkipVars", [] { return framework::ir::kMemOptSkipVars; }); auto op_proto_and_checker_maker = m->def_submodule("op_proto_and_checker_maker"); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f763e69809010b89375e98945adde8059e985e6b..fb0882e3533252460e3dd2546e9af8d50f053db6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1548,17 +1548,31 @@ All parameter, weight, gradient are variables in Paddle. )DOC") .def_property( "memory_optimize", - [](const BuildStrategy &self) { return self.memory_optimize_; }, - [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }, - R"DOC(The type is BOOL, memory opitimize aims to save total memory + [](const BuildStrategy &self) -> py::object { + if (self.memory_optimize_) { + return py::cast(self.memory_optimize_.get()); + } else { + return py::cast(nullptr); + } + }, + [](BuildStrategy &self, const py::handle &value) { + auto *py_obj = value.ptr(); + if (py_obj == nullptr || py_obj == Py_None) { + self.memory_optimize_ = boost::none; + } else if (PyBool_Check(py_obj)) { + self.memory_optimize_ = (py_obj == Py_True); + } else { + PADDLE_THROW( + "BuildStrategy.memory_optimize must be None, False or True"); + } + }, + R"DOC(The type is BOOL or None, memory opitimize aims to save total memory consumption, set to True to enable it. 
- Memory Optimize is our experimental feature, some variables - may be reused/removed by optimize strategy. If you need to - fetch some variable values when using this feature, please - set the persistable property of the variables to True. - - Default False)DOC") + Default None. None means framework would choose to use or not use + this strategy automatically. Currently, None means that it is + enabled when GC is disabled, and disabled when GC is enabled. + True means enabling and False means disabling. Default None.)DOC") .def_property( "is_distribution", [](const BuildStrategy &self) { return self.is_distribution_; }, @@ -1578,13 +1592,6 @@ All parameter, weight, gradient are variables in Paddle. "enable_inplace", [](const BuildStrategy &self) { return self.enable_inplace_; }, [](BuildStrategy &self, bool b) { self.enable_inplace_ = b; }) - .def_property("_use_legacy_memory_optimize_strategy", - [](const BuildStrategy &self) { - return self.use_legacy_memory_optimize_strategy_; - }, - [](BuildStrategy &self, bool b) { - self.use_legacy_memory_optimize_strategy_ = b; - }) .def_property( "fuse_all_reduce_ops", [](const BuildStrategy &self) { return self.fuse_all_reduce_ops_; }, diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 35e8ffcda804b3e7c2a74e10440517a8bf6ba5fe..dfe58c7e4d92edd9fdbfa3689305b1ed29211947 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -206,7 +206,7 @@ def __bootstrap__(): 'cudnn_exhaustive_search', 'selected_gpus', 'sync_nccl_allreduce', 'limit_of_tmp_allocation', 'times_excess_than_required_tmp_allocation', - 'enable_inplace_whitelist', 'cudnn_batchnorm_spatial_persistent' + 'cudnn_batchnorm_spatial_persistent' ] core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 790f297fb966bdb923ecca0083c6745a92e34531..bd82ba7f283ae2ce9812c7d90bca7670a3ba99ff 100644 --- 
a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -533,36 +533,6 @@ class Executor(object): return as_numpy(arr) return [arr[i] for i in range(len(arr))] - def _check_fetch_vars_persistable(self, program, fetch_list): - for var in fetch_list: - if isinstance(var, Variable): - persistable = var.persistable - else: - block_num = program.desc.num_blocks() - persistable = None - var_name = cpt.to_bytes(var) - for i in six.moves.range(block_num): - var_desc = program.desc.block(i).find_var(var_name) - if var_desc: - persistable = var_desc.persistable() - break - assert persistable is not None, "Variable {} is not found".format( - var) - - if not persistable: - logging.warn(""" - Detect that build_strategy.memory_optimize = True, but the some variables in the fetch - list is not persistable, you may get wrong fetched value, or an exeception may be thrown - about cannot find variable of the fetch list. - - TO FIX this: - # Sample - conv1 = fluid.layers.conv2d(data, 4, 5, 1, act=None) - # if you need to fetch conv1, then: - conv1.persistable = True - - """) - def run(self, program=None, feed=None, @@ -667,10 +637,6 @@ class Executor(object): scope=scope, return_numpy=return_numpy, use_program_cache=use_program_cache) - else: - if fetch_list and program._is_data_parallel and program._program and \ - program._build_strategy._use_legacy_memory_optimize_strategy: - self._check_fetch_vars_persistable(program._program, fetch_list) program._compile(scope, self.place) if program._is_data_parallel: diff --git a/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py b/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py index 873bd61d40bc3df6448a22cdd00211f7815eb985..90666d4ebb6e6069ff74ec5efb2834d3c384b1bc 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_softmax_with_cross_entropy.py @@ -61,16 +61,13 @@ class 
TestSoftmaxWithXe(unittest.TestCase): build_strategy = fluid.BuildStrategy() build_strategy.enable_inplace = inplace - if inplace: - build_strategy._use_legacy_memory_optimize_strategy = True prog = fluid.CompiledProgram(fluid.default_main_program( )).with_data_parallel( build_strategy=build_strategy, places=place) - if inplace: - fetch_list = [z_d.name, x_d.name] - else: - fetch_list = [z_d.name, s_d.name] + fetch_list = [z_d.name, s_d.name] + + print('Inplace is {}'.format("ON" if inplace else "OFF")) z, s = exe.run(prog, feed={x_d.name: x, diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index e1db0c30bdd9397e78d8c51186910f0f2983ff4d..00a94fa829f4b9695d1dcc727d2035045ee7105e 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -14,6 +14,7 @@ from __future__ import print_function +import logging import six import sys from collections import defaultdict, MutableSet @@ -550,8 +551,14 @@ def memory_optimize(input_program, fluid.memory_optimize(main_prog) """ - sys.stderr.write('memory_optimize is deprecated. ' - 'Use CompiledProgram and Executor\n') + logging.warn( + 'Caution! paddle.fluid.memory_optimize() is deprecated ' + 'and not maintained any more, since it is not stable!\n' + 'Please use the newest and stable memory optimization strategies!\n' + ' 1. Enable garbage collection strategy by exporting environment ' + 'variable FLAGS_eager_delete_tensor_gb=0\n' + ' 2. Set build_strategy.enable_inplace=True (True is the default ' + 'value) when using CompiledProgram or ParallelExecutor.\n') def to_name_str(var): if isinstance(var, Variable):