diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e6f5cb7473cdac95afabef8b133131ad71867f7b..79277a4174b7a14d53465fe27512d2130a7430a3 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -134,7 +134,7 @@ paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', '80641ee6810b1cdc3fd6e14fc89ecc9d')) paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', 'b350b9a30a18e7efd7e1bb740eef6996')) paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e')) -paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '013795af319e2e86d3506741941078ee')) +paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23')) paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', 'de6a906950bae9f3c245cb744d22b94e')) paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '419c3a24a83cc89219a029cf4092788b')) paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '3f536aafba30d793287b52d231baff1b')) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 10aa7a59422f4508dda8d0bcd960583056e25938..72c50518af08b9c1b2f97e6864e5836e806c77fc 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -134,6 +134,11 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, out_layout = out_layout == DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; + auto& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = dynamic_cast( + pool.Get(expected_kernel_type.place_)); + auto& cpu_engine = dev_ctx->GetEngine(); + std::vector in_tz = paddle::framework::vectorize2int(in.dims()); std::vector out_tz = in_tz; @@ -142,25 +147,29 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, "Input tensor type is not supported: %s", in.type()); memory::data_type out_type = in_type; + auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); + auto out_format = + platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); + // output tensor has the same dims as input. Reorder don't change dims out->Resize(in.dims()); - // tempory mem pd fr out , to make reorder - auto out_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(out->dims()), - mkldnn::memory::format::blocked, out_type); - if (in.get_mkldnn_prim_desc() != out_mem_pd) { + if (in_format != out_format) { void* in_data = GetDataFromTensor(in, in_type); auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data); - auto out_memory = memory(out_mem_pd, out_data); + auto in_memory = + memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); + auto out_memory = + memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); platform::Reorder(in_memory, out_memory); } else { out->ShareDataWith(in); } out->set_layout(out_layout); + // reset format since the out tensor will be feed to non-MKLDNN OPkernel + out->set_format(memory::format::format_undef); #endif } diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index f0203edf05635452bf347335066dadc24ecc3138..82872224501709080ff02a13464d58543a0abda8 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -51,31 +51,13 @@ void TransformData(const OpKernelType &expected_kernel_type, #ifdef PADDLE_WITH_MKLDNN // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. No real transform occur + + auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), + ToMKLDNNFormat(lin)); + out.ShareDataWith(input_tensor); - // TODO(jczaja): Remove that once all mkldnn ops - // are modified to work with mkldnn_blocked - auto mkldnn_fmt = [&](int rank) { - switch (rank) { - case 5: - return mkldnn::memory::format::ncdhw; - case 4: - return mkldnn::memory::format::nchw; - case 3: - return mkldnn::memory::format::ncw; - case 2: - return mkldnn::memory::format::nc; - case 1: - return mkldnn::memory::format::x; - default: - return mkldnn::memory::format::blocked; - } - }; - - auto out_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(out.dims()), - mkldnn_fmt(out.dims().size())); - - out.set_mkldnn_prim_desc(out_mem_pd); + out.set_layout(DataLayout::kMKLDNN); + out.set_format(out_format); #endif } else { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 046ec6978a84fa6eba11513860523de5a63a31d8..d4939779a2401c9828e0478f5f5de780907c767e 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -10,7 +10,10 @@ cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framewor cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) + cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper) +cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper) +cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) @@ -104,5 +107,7 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass fuse_elewise_add_act_pass multi_batch_merge_pass - fuse_relu_depthwise_conv_pass - memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass) + fuse_relu_depthwise_conv_pass + memory_optimize_pass lock_free_optimize_pass + alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass + fuse_adam_op_pass fuse_sgd_op_pass) diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.cc b/paddle/fluid/framework/details/all_reduce_deps_pass.cc index 98a74d630cd4f32b064e95ddc6082e2d2ad657e1..d93c84606d9492920ebcf669650ab74fb5b09af5 100644 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.cc +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.cc @@ -42,8 +42,7 @@ VarHandle* GetValidInput(const OpHandleBase* a) { return nullptr; } -std::unique_ptr AllReduceDepsPass::ApplyImpl( - std::unique_ptr graph) const { +void AllReduceDepsPass::ApplyImpl(ir::Graph* graph) const { auto graph_ops = ir::FilterByNodeWrapper(*graph); // get vars order @@ -131,8 +130,6 @@ std::unique_ptr AllReduceDepsPass::ApplyImpl( VLOG(10) << "pre_op:" << pre_op->DebugString() << ", op:" << op->DebugString(); } - - return graph; } } // namespace details diff --git a/paddle/fluid/framework/details/all_reduce_deps_pass.h b/paddle/fluid/framework/details/all_reduce_deps_pass.h index e8b91089816c71bc56ba7dba0105e85d73eb52ad..4ed3736587aa3d45e288e3dc7e6ab3099f935f41 100644 --- a/paddle/fluid/framework/details/all_reduce_deps_pass.h +++ b/paddle/fluid/framework/details/all_reduce_deps_pass.h @@ -24,8 +24,7 @@ namespace details { // TODO(gongwb): overlap allreduce with backward computation. class AllReduceDepsPass : public ir::Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace details diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc index fbc8bbf56b0d1ce75fbb459038c63571d0a16cd3..8e8258ffb124e5008954a455264f5c0bc5cabc37 100644 --- a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc +++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" + DEFINE_uint32(fuse_parameter_memory_size, 0, // 0 KB "fuse_parameter_memory_size is up limited memory size " "of one group parameters' gradient which is the input " @@ -46,8 +47,7 @@ static framework::proto::VarType::Type kDefaultDtype = class AllocContinuousSpaceForGradPass : public ir::Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override { + void ApplyImpl(ir::Graph *graph) const override { ir::Graph &result = *graph; auto &places = Get>(kPlaces); @@ -65,7 +65,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { if (params_grads.size() == 0) { VLOG(10) << "Doesn't find gradients"; - return std::move(graph); + return; } std::unordered_map vars; @@ -106,26 +106,33 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { auto ele_dtype = iter->second->Var()->GetDataType(); if (dtype == kDefaultDtype) { dtype = ele_dtype; - PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype); + PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype, + "The data type should not be bool."); } - PADDLE_ENFORCE_EQ(ele_dtype, dtype); + PADDLE_ENFORCE_EQ(ele_dtype, dtype, + "The data type of input is not consistent."); } - // Create the fused variable name. + // Create a FusedVarsSet to avoid duplicating names for fused_var in other + // pass. if (!result.Has(kFusedVars)) { result.Set(kFusedVars, new FusedVars); } - const std::string prefix(kFusedVarNamePrefix); - // The fused_var_name should be unique. - auto fused_var_name = prefix + "GRAD@" + params_grads[0].second; + // the kFusedGrads is used be fuse_optimizer_op_pass. + result.Set(kFusedGrads, new FusedGrads); + + // the fused_var_name should be unique, so it appends + // params_grads.begin()->second. + auto fused_var_name = std::string(kFusedVarNamePrefix) + "@GRAD@" + + params_grads.begin()->second; + result.Get(kFusedGrads) = fused_var_name; auto &fused_var_set = result.Get(kFusedVars); - PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); + PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0, + "%s is duplicate in FusedVars.", fused_var_name); fused_var_set.insert(fused_var_name); InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars, fused_var_name, params_grads); - - return std::move(graph); } template @@ -298,17 +305,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { return type == proto::VarType::LOD_TENSOR; } - void AppendAllocSpaceForVarsOp(const std::vector ¶ms_name, - const std::vector &grads_name, - const std::string &fused_var_name, - BlockDesc *global_block) const { - auto op_desc = global_block->AppendOp(); - op_desc->SetType("alloc_continuous_space"); - op_desc->SetInput("Input", params_name); - op_desc->SetOutput("Output", grads_name); - op_desc->SetOutput("FusedOutput", {fused_var_name}); - } - void RecordParamsAndGrads(ir::Node *node, ParamsAndGrads *params_grads) const { try { @@ -361,6 +357,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { } } + // Alloc continuous space for vars. std::vector grads_name; std::vector params_name; grads_name.reserve(params_grads.size()); @@ -373,7 +370,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name, program_desc.MutableBlock(0)); - // Run Only Once Programs for (size_t i = 0; i < local_scopes.size(); ++i) { for (auto &op_desc : program_desc.Block(0).AllOps()) { auto op = OpRegistry::CreateOp(*op_desc); @@ -381,6 +377,17 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { } } } + + void AppendAllocSpaceForVarsOp(const std::vector ¶ms_name, + const std::vector &grads_name, + const std::string &fused_var_name, + BlockDesc *global_block) const { + auto op_desc = global_block->AppendOp(); + op_desc->SetType("alloc_continuous_space"); + op_desc->SetInput("Input", params_name); + op_desc->SetOutput("Output", grads_name); + op_desc->SetOutput("FusedOutput", {fused_var_name}); + } }; } // namespace details diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index fdff83b92819b39974f3b2ce0848710f1ee02a41..752c932a215bad53f47f19f143a8008b66617a51 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -27,20 +27,17 @@ void BroadcastOpHandle::RunImpl() { if (places_.size() == 1) return; // The input and output may have dummy vars. - VarHandle *in_var_handle; - { - auto in_var_handles = DynamicCast(inputs_); - PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL, - "The number of input should be one."); - in_var_handle = in_var_handles[0]; - } - + auto in_var_handles = DynamicCast(inputs_); auto out_var_handles = DynamicCast(outputs_); + PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL, + "The number of input should be one."); PADDLE_ENFORCE_EQ( out_var_handles.size(), places_.size(), "The number of output should equal to the number of places."); + VarHandle *in_var_handle = in_var_handles[0]; + WaitInputVarGenerated(); std::vector var_scopes; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 5d9db237538599ec9a6887317b61af73f1113b97..df69b11ec6ae3bb08ba03b749c69eb718525de4d 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include #include #include - #include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" @@ -82,23 +81,43 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("inplace_pass"); } - if (strategy.fuse_elewise_add_act_ops_) { + if (strategy_.fuse_elewise_add_act_ops_) { VLOG(10) << "Add fuse_elewise_add_act_pass"; AppendPass("fuse_elewise_add_act_pass"); } // for single card training, fuse_all_reduce_ops is unnecessary. // alloc_continuous_space_for_grad_pass should be before of MultiDevPass. - if (strategy.fuse_all_reduce_ops_) { + if (strategy_.fuse_all_reduce_ops_) { VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; AppendPass("alloc_continuous_space_for_grad_pass"); } + if (strategy_.fuse_all_optimizer_ops_) { + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce || + strategy_.is_distribution_) { + VLOG(3) + << "Currently, fuse_all_optimizer_ops only works under AllReduce " + "mode."; + strategy_.fuse_all_optimizer_ops_ = false; + } else { + VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; + AppendPass("alloc_continuous_space_for_grad_pass"); + // NOTE: fuse_all_xx_ops will count the number of xx operator first, + // if the number is zero, fuse_all_reduce_ops will do nothing. + // Currently, only one type of optimization algorithm can be fused. + VLOG(10) << "Add fuse_adam_op_pass"; + AppendPass("fuse_adam_op_pass"); + VLOG(10) << "Add fuse_sgd_op_pass"; + AppendPass("fuse_sgd_op_pass"); + } + } + // Add a graph viz pass to record a graph. if (!strategy.debug_graphviz_path_.empty()) { auto viz_pass = AppendPass("graph_viz_pass"); const std::string graph_path = string::Sprintf( - "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); + "%s%s", strategy_.debug_graphviz_path_.c_str(), "_fused_graph"); viz_pass->Set("graph_viz_path", new std::string(graph_path)); } @@ -118,14 +137,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // the de-fact IR, any reuse on Graph is meaningless. // A side-effect of that, memory optimize cannot forsee the fetched vars // , so fetchlist should be set persistable before call the Run interface. - if (strategy.memory_optimize_) { + if (strategy_.memory_optimize_) { VLOG(10) << "Add memory_optimize_pass"; AppendPass("memory_optimize_pass"); } - AppendMultiDevPass(strategy); + AppendMultiDevPass(strategy_); - if (strategy.fuse_all_reduce_ops_) { + if (strategy_.fuse_all_reduce_ops_) { // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator // first, if the number is zero, fuse_all_reduce_ops will do nothing. VLOG(10) << "Add fuse_all_reduce_op_pass"; @@ -151,7 +170,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("all_reduce_deps_pass"); } - if (SeqOnlyAllReduceOps(strategy)) { + if (SeqOnlyAllReduceOps(strategy_)) { VLOG(10) << "Add all_reduce_deps_pass"; AppendPass("all_reduce_deps_pass"); } @@ -165,7 +184,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Convert graph to run on multi-devices. void AppendMultiDevPass(const BuildStrategy &strategy) { ir::Pass *multi_devices_pass = nullptr; - if (strategy_.is_distribution_) { + if (strategy.is_distribution_) { VLOG(10) << "Add dist_multi_devices_pass"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { @@ -204,15 +223,16 @@ bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0; } -std::unique_ptr BuildStrategy::Apply( - std::unique_ptr graph, - const std::vector &places, - const std::string &loss_var_name, const std::vector &local_scopes, - const size_t &nranks, +ir::Graph *BuildStrategy::Apply(ir::Graph *graph, + const std::vector &places, + const std::string &loss_var_name, + const std::vector &local_scopes, + const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { + const bool use_cuda, + platform::NCCLContextMap *nccl_ctxs) const { #else - const bool use_cuda) const { + const bool use_cuda) const { #endif // Create a default one if not finalized by user. CreatePassesFromStrategy(false); @@ -234,17 +254,22 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase(kNCCLCtxs); pass->SetNotOwned(kNCCLCtxs, nctx); #endif - } else if (pass->Type() == "fuse_all_reduce_op_pass") { + } else if (pass->Type() == "alloc_continuous_space_for_grad_pass" || + pass->Type() == "fuse_adam_op_pass" || + pass->Type() == "fuse_sgd_op_pass" || + pass->Type() == "fuse_all_reduce_op_pass") { pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); pass->Erase(kLocalScopes); pass->SetNotOwned>(kLocalScopes, &local_scopes); + if (pass->Type() == "fuse_all_reduce_op_pass") { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; - pass->Erase(kNCCLCtxs); - pass->SetNotOwned(kNCCLCtxs, nctx); + platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; + pass->Erase(kNCCLCtxs); + pass->SetNotOwned(kNCCLCtxs, nctx); #endif + } } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") { pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); @@ -265,7 +290,7 @@ std::unique_ptr BuildStrategy::Apply( } } VLOG(3) << "Start Apply Pass " << pass->Type(); - graph = pass->Apply(std::move(graph)); + graph = pass->Apply(graph); VLOG(3) << "Finish Apply Pass " << pass->Type(); } return graph; @@ -293,4 +318,6 @@ USE_PASS(inplace_pass); USE_PASS(lock_free_optimize_pass); USE_PASS(alloc_continuous_space_for_grad_pass); USE_PASS(graph_to_program_pass); +USE_PASS(fuse_adam_op_pass); +USE_PASS(fuse_sgd_op_pass); USE_PASS(fuse_all_reduce_op_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 4b599fb914dc7c35a0524ea62ba8d458b8dccf8f..85f328b7c40568cc9246fd4ecab34e8e6778439b 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -18,7 +18,6 @@ #include #include #include - #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -76,6 +75,8 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; + bool fuse_all_optimizer_ops_{false}; + bool fuse_all_reduce_ops_{false}; bool fuse_relu_depthwise_conv_{false}; @@ -120,16 +121,15 @@ struct BuildStrategy { // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. - std::unique_ptr Apply(std::unique_ptr graph, - const std::vector &places, - const std::string &loss_var_name, - const std::vector &local_scopes, - const size_t &nranks, + ir::Graph *Apply(ir::Graph *graph, const std::vector &places, + const std::string &loss_var_name, + const std::vector &local_scopes, + const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - const bool use_cuda, - platform::NCCLContextMap *nccl_ctxs) const; + const bool use_cuda, + platform::NCCLContextMap *nccl_ctxs) const; #else - const bool use_cuda) const; + const bool use_cuda) const; #endif // If set true, ParallelExecutor would build the main_program into multiple diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc index a6baa26134cf36ea93dde554f808e73fa0c30b93..622a59b4c2e24c420da00cac2cce82ca365077e8 100644 --- a/paddle/fluid/framework/details/eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -170,12 +170,10 @@ static OpToVarNameSetMap ShrinkGCVars( class EagerDeletionPass : public ir::Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph *graph) const override; }; -std::unique_ptr EagerDeletionPass::ApplyImpl( - std::unique_ptr graph) const { +void EagerDeletionPass::ApplyImpl(ir::Graph *graph) const { auto &ref_cnts = Get>(kRuntimeReferenceCount); PADDLE_ENFORCE(ref_cnts.empty(), @@ -240,7 +238,7 @@ std::unique_ptr EagerDeletionPass::ApplyImpl( auto while_op_eager_deletion_pass = ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass"); - return while_op_eager_deletion_pass->Apply(std::move(graph)); + while_op_eager_deletion_pass->Apply(graph); } } // namespace details diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index d4fbea9d95118666ababde811867e95c657c07de..297ee92fc3c84c2feec9cb85bd8671ce8ad94ed0 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -31,9 +31,10 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( local_scopes_(local_scopes), places_(places), graph_(graph), + fetch_ctxs_(places), pool_(strategy.num_threads_), - prepare_pool_(1), // add one more thread for generate op_deps - fetch_ctxs_(places) { + // add one more thread for generate op_deps + prepare_pool_(1) { for (auto &op : ir::FilterByNodeWrapper(*graph_)) { int dep = static_cast(op->NotReadyInputSize()); op_deps_.emplace(op, dep); diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index 970298950cc8089bc5861fcbf8dc2544934b181f..f6d5160e75cc3f48c5129dae05eec4ec82d83ae5 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -14,7 +14,9 @@ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" @@ -37,6 +39,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { const ir::Graph &Graph() const override; private: + // Note(zcd): the ThreadPool should be placed last so that ThreadPool should + // be destroyed first. ExecutionStrategy strategy_; std::vector local_scopes_; std::vector places_; @@ -45,21 +49,22 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { std::unordered_map op_deps_; std::vector bootstrap_ops_; - ::ThreadPool pool_; - ::ThreadPool prepare_pool_; platform::DeviceContextPool fetch_ctxs_; std::atomic remaining_; + std::future< + std::unique_ptr>>> + atomic_op_deps_; + ExceptionHolder exception_; + + ::ThreadPool pool_; + ::ThreadPool prepare_pool_; + void RunOpAsync(std::unordered_map> *op_deps, OpHandleBase *op, const std::shared_ptr> &complete_q); void PrepareAtomicOpDeps(); - - std::future< - std::unique_ptr>>> - atomic_op_deps_; - ExceptionHolder exception_; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.cc b/paddle/fluid/framework/details/fuse_adam_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ef75e319244e2ccc63dfa3f93f0cd764cf67633 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_adam_op_pass.cc @@ -0,0 +1,199 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fuse_adam_op_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace details { + +const std::string FuseAdamOpPass::GetOpType() const { return "adam"; } + +const std::vector FuseAdamOpPass::GetAuxiliaryVarNames() const { + return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; +} + +void FuseAdamOpPass::FuseOptimizerOps( + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const { + FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph); + FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"), + adam_ops, graph); + FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"), + adam_ops, graph); +} + +void FuseAdamOpPass::FuseAdamOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(adam_ops.size(), static_cast(0)); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. + int op_role = boost::get( + adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + float beta1 = boost::get(adam_ops[0]->Op()->GetAttr("beta1")); + float beta2 = boost::get(adam_ops[0]->Op()->GetAttr("beta2")); + float epsilon = boost::get(adam_ops[0]->Op()->GetAttr("epsilon")); + bool lazy_mode = boost::get(adam_ops[0]->Op()->GetAttr("lazy_mode")); + int64_t min_row_size_to_use_multithread = boost::get( + adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread")); + for (auto &adam_op : adam_ops) { + PADDLE_ENFORCE_EQ(beta1, + boost::get(adam_op->Op()->GetAttr("beta1"))); + PADDLE_ENFORCE_EQ(beta2, + boost::get(adam_op->Op()->GetAttr("beta2"))); + PADDLE_ENFORCE_EQ(epsilon, + boost::get(adam_op->Op()->GetAttr("epsilon"))); + PADDLE_ENFORCE_EQ(lazy_mode, + boost::get(adam_op->Op()->GetAttr("lazy_mode"))); + PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread, + boost::get(adam_op->Op()->GetAttr( + "min_row_size_to_use_multithread"))); + PADDLE_ENFORCE_EQ(op_role, boost::get(adam_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var + // node. + + VLOG(10) << "Insert adam to graph "; + OpDesc adam_desc(adam_ops[0]->Op()->Block()); + adam_desc.SetType("adam"); + adam_desc.SetInput("Param", {fused_vars_name.at("Param")}); + adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); + adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")}); + adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")}); + // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. + adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate")); + adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow")); + adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow")); + + adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); + adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")}); + adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")}); + adam_desc.SetAttr("beta1", beta1); + adam_desc.SetAttr("beta2", beta2); + adam_desc.SetAttr("epsilon", epsilon); + adam_desc.SetAttr("lazy_mode", lazy_mode); + adam_desc.SetAttr("min_row_size_to_use_multithread", + min_row_size_to_use_multithread); + adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto adam_node = graph->CreateOpNode(&adam_desc); + + InserInputAndOutputForOptOps(adam_ops, adam_node); +} + +void FuseAdamOpPass::FuseScaleOps(const std::vector &beta_name, + const std::string &fused_var_name, + const std::vector &adam_ops, + ir::Graph *graph) const { + PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size()); + const std::string scale_op_name = "scale"; + + // Get the scale_ops of dealing the adam's beta var. + std::vector scale_ops; + scale_ops.reserve(beta_name.size()); + for (size_t i = 0; i < adam_ops.size(); ++i) { + auto &beta_1_pow_name = beta_name[i]; + auto beta_pow_iter = std::find_if( + adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(), + [&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool { + return var_node->Var() && var_node->Var()->Name() == beta_1_pow_name; + }); + PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end()); + + auto beta_pow_node = *beta_pow_iter; + auto scale_op_iter = std::find_if( + beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(), + [&scale_op_name](ir::Node *op_node) -> bool { + return op_node->Op() && op_node->Op()->Type() == scale_op_name; + }); + PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end()); + + scale_ops.emplace_back(*scale_op_iter); + } + PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size()); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. + int op_role = boost::get( + scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + float scale = boost::get(scale_ops[0]->Op()->GetAttr("scale")); + float bias = boost::get(scale_ops[0]->Op()->GetAttr("bias")); + bool bias_after_scale = + boost::get(scale_ops[0]->Op()->GetAttr("bias_after_scale")); + for (auto &scale_op : scale_ops) { + PADDLE_ENFORCE_EQ(scale, + boost::get(scale_op->Op()->GetAttr("scale"))); + PADDLE_ENFORCE_EQ(bias, boost::get(scale_op->Op()->GetAttr("bias"))); + PADDLE_ENFORCE_EQ( + bias_after_scale, + boost::get(scale_op->Op()->GetAttr("bias_after_scale"))); + PADDLE_ENFORCE_EQ(op_role, boost::get(scale_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var + // node. + + VLOG(10) << "Insert fused scale to graph."; + OpDesc scale_desc(scale_ops[0]->Op()->Block()); + scale_desc.SetType("scale"); + scale_desc.SetInput("X", {fused_var_name}); + scale_desc.SetOutput("Out", {fused_var_name}); + scale_desc.SetAttr("scale", scale); + scale_desc.SetAttr("bias", bias); + scale_desc.SetAttr("bias_after_scale", bias_after_scale); + scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + auto scale_node = graph->CreateOpNode(&scale_desc); + + for (auto scale_op : scale_ops) { + // set inputs + scale_node->inputs.insert(scale_node->inputs.begin(), + scale_op->inputs.begin(), scale_op->inputs.end()); + for (auto &input : scale_op->inputs) { + std::replace(input->outputs.begin(), input->outputs.end(), scale_op, + scale_node); + } + // set outputs + scale_node->outputs.insert(scale_node->outputs.begin(), + scale_op->outputs.begin(), + scale_op->outputs.end()); + for (auto &output : scale_op->outputs) { + std::replace(output->inputs.begin(), output->inputs.end(), scale_op, + scale_node); + } + } + + // Delete scale_ops + for (auto &scale_op : scale_ops) { + graph->RemoveNode(scale_op); + } +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_adam_op_pass, paddle::framework::details::FuseAdamOpPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.h b/paddle/fluid/framework/details/fuse_adam_op_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..5866c37552e26d9b14fa946e119f20121ecf7cb2 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_adam_op_pass.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +class FuseAdamOpPass : public FuseOptimizerOpPass { + private: + virtual const std::string GetOpType() const; + + virtual const std::vector GetAuxiliaryVarNames() const; + + // Fuse Adam Ops and Scale Ops which are used to update "Beta1Pow", "Beta2Pow" + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const; + + void FuseAdamOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const; + + void FuseScaleOps(const std::vector &aux_var_set, + const std::string &fused_var_name, + const std::vector &adam_ops, + ir::Graph *graph) const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc index f226491c9f5355d0418d672069b19a78ef53c595..31efd78ad3dbed73d7993bac47694c9d6d742343 100644 --- a/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/details/fuse_all_reduce_op_pass.cc @@ -28,8 +28,7 @@ namespace details { class FuseAllReduceOpPass : public ir::Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override { + void ApplyImpl(ir::Graph *graph) const override { ir::Graph &result = *graph; auto &places = Get>(kPlaces); @@ -71,7 +70,7 @@ class FuseAllReduceOpPass : public ir::Pass { VLOG(10) << "Find all_reduce_ops: " << all_reduce_ops.size(); if (all_reduce_ops.size() == 0) { - return std::move(graph); + return; } PADDLE_ENFORCE_EQ(all_reduce_ops.size(), grads.size(), @@ -99,7 +98,6 @@ class FuseAllReduceOpPass : public ir::Pass { group_all_reduce_ops, &result); #endif } - return std::move(graph); } void InsertFusedAllReduce(const std::vector &places, diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..b49f095d428a017dd1a3bed2788a048af9afa6bb --- /dev/null +++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace details { + +void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { + ir::Graph &result = *graph; + + auto &places = Get>(kPlaces); + auto &local_scopes = Get>(kLocalScopes); + + const std::string fuse_op_type = GetOpType(); + const std::vector aux_var_names = GetAuxiliaryVarNames(); + + // Step 1: Get the specified op and auxiliary variables. + std::vector topo_nodes = ir::TopologySortOperations(result); + std::unordered_map> aux_var_set; + std::vector opt_ops; + for (auto &node : topo_nodes) { + GetSpecifiedOpsAndVars(fuse_op_type, aux_var_names, node, &opt_ops, + &aux_var_set); + } + + VLOG(10) << "Find " << fuse_op_type << " operators: " << opt_ops.size(); + if (opt_ops.size() == 0) { + return; + } + + if (result.Has(kFusedOptType)) { + VLOG(10) + << "Currently only support fusing one type optimizer op. Has fused " + << result.Get(kFusedOptType); + return; + } else { + result.Set(kFusedOptType, new FusedOptType); + } + result.Get(kFusedOptType) = fuse_op_type; + + // Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be + // initialized in scopes before execution. + if (!result.Has(kFusedVars)) { + result.Set(kFusedVars, new FusedVars); + } + std::unordered_map fused_vars_name; + fused_vars_name.reserve(aux_var_names.size() + 1); + auto &fused_var_set = result.Get(kFusedVars); + const std::string prefix(kFusedVarNamePrefix); + // NOTE: the fused_var_name should be unique. + for (auto &var_name : aux_var_names) { + auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" + + aux_var_set[var_name][0]; + VLOG(10) << fused_var_name; + fused_vars_name.emplace(var_name, fused_var_name); + PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); + fused_var_set.insert(fused_var_name); + } + + // Step 3: Get the fused Gradient's name + auto ¶ms_grads = result.Get(kParamsAndGrads); + if (!result.Has(kFusedGrads)) { + PADDLE_THROW( + "The alloc_continuous_space_for_grad_pass should be called before this " + "pass."); + } + auto &fused_grad = result.Get(kFusedGrads); + auto &fused_vars = result.Get(kFusedVars); + auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad); + PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad."); + fused_vars_name.emplace("Grad", fused_grad); + + // Step 4: Sort the parameters and auxiliary variables according + // to parameters' name to make variables' name correspond correctly. + PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads."); + PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(), + "The size of params_grads and aux_var_set are not equal."); + SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops); + + // Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g. + // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately. + InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names, + aux_var_set, fused_vars_name); + + // Step 6: Fuse optimizer Ops and Scale Ops + FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result); + + // Step 7: Remove optimizer Ops + for (auto &opt_op : opt_ops) { + graph->RemoveNode(opt_op); + } +} + +void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( + const std::vector &places, + const std::vector &local_scopes, + const std::vector &aux_var_names, + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name) const { + VLOG(10) << "Init FusedVars."; + // Alloc parameters and auxiliary vars in the respective scope. + size_t idx = local_scopes.size(); + for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend(); + ++iter, --idx) { + auto &scope = *iter; + for (auto &var_name : aux_var_names) { + auto fused_var_name = fused_vars_name.at(var_name); + VLOG(10) << "Init " << fused_var_name; + PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr, + "%s has exist in scope[%d]", fused_var_name, idx); + scope->Var(fused_var_name)->GetMutable(); + } + } + + ProgramDesc program_desc; + auto *global_block = program_desc.MutableBlock(0); + for (auto &var_name : aux_var_names) { + AppendAllocContinuousSpace(aux_var_set.at(var_name), + fused_vars_name.at(var_name), true, + global_block); + } + + for (size_t i = 0; i < local_scopes.size(); ++i) { + for (auto &op_desc : global_block->AllOps()) { + auto op = OpRegistry::CreateOp(*op_desc); + op->Run(*local_scopes[i], places[i]); + } + } +} + +void FuseOptimizerOpPass::SortParametersAndAuxVars( + const std::vector> ¶ms_grads, + std::unordered_map> *aux_vars_set, + std::vector *ops) const { + PADDLE_ENFORCE_NE(aux_vars_set->count("Param"), static_cast(0)); + auto ¶m_vec = aux_vars_set->at("Param"); + + std::vector param_sort_idx; + param_sort_idx.reserve(param_vec.size()); + + for (auto &p_g : params_grads) { + auto iter = std::find(param_vec.begin(), param_vec.end(), p_g.first); + PADDLE_ENFORCE(iter != param_vec.end()); + auto idx = std::distance(param_vec.begin(), iter); + param_sort_idx.emplace_back(idx); + } + + for (auto &aux_vars : *aux_vars_set) { + std::vector sorted_vars; + sorted_vars.reserve(aux_vars.second.size()); + for (size_t i = 0; i < aux_vars.second.size(); ++i) { + sorted_vars.emplace_back(aux_vars.second.at(param_sort_idx[i])); + } + std::swap(aux_vars.second, sorted_vars); + + std::stringstream out; + for (auto &var_name : aux_vars.second) { + out << var_name << " "; + } + VLOG(10) << aux_vars.first << ": " << out.str(); + } + + std::vector sorted_ops; + sorted_ops.reserve(ops->size()); + for (size_t i = 0; i < ops->size(); ++i) { + sorted_ops.emplace_back(ops->at(param_sort_idx[i])); + } + std::swap(*ops, sorted_ops); +} + +void FuseOptimizerOpPass::GetSpecifiedOpsAndVars( + const std::string &op_type, const std::vector &aux_vars_name, + ir::Node *node, std::vector *ops, + std::unordered_map> *aux_args_name) + const { + if (node->Op()->Type() != op_type) return; + + for (auto &var_n : aux_vars_name) { + auto arg_names = node->Op()->Input(var_n); + PADDLE_ENFORCE_EQ(arg_names.size(), static_cast(1)); + (*aux_args_name)[var_n].emplace_back(arg_names[0]); + VLOG(10) << var_n << ", " << arg_names[0]; + } + ops->emplace_back(node); +} + +void FuseOptimizerOpPass::AppendAllocContinuousSpace( + const std::vector &args, const std::string &out_arg, + bool copy_data, BlockDesc *global_block) const { + auto op_desc = global_block->AppendOp(); + op_desc->SetType("alloc_continuous_space"); + op_desc->SetInput("Input", args); + op_desc->SetOutput("Output", args); + op_desc->SetOutput("FusedOutput", {out_arg}); + op_desc->SetAttr("copy_data", copy_data); + op_desc->SetAttr("check_name", true); +} + +void FuseOptimizerOpPass::InserInputAndOutputForOptOps( + const std::vector &opt_ops, ir::Node *opt_node) const { + std::unordered_set inputs; + std::unordered_set outputs; + for (auto opt_op : opt_ops) { + // set inputs + inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end()); + for (auto &input : opt_op->inputs) { + replace(input->outputs.begin(), input->outputs.end(), opt_op, opt_node); + } + // set outputs + outputs.insert(opt_op->outputs.begin(), opt_op->outputs.end()); + for (auto &output : opt_op->outputs) { + replace(output->inputs.begin(), output->inputs.end(), opt_op, opt_node); + } + } + opt_node->inputs.insert(opt_node->inputs.begin(), inputs.begin(), + inputs.end()); + opt_node->outputs.insert(opt_node->outputs.begin(), outputs.begin(), + outputs.end()); +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.h b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..0240f1594d7ef9d855eb6e96e8e8a32ee1d957ba --- /dev/null +++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h @@ -0,0 +1,75 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +class FuseOptimizerOpPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override; + + protected: + virtual void SortParametersAndAuxVars( + const std::vector> ¶ms_grads, + std::unordered_map> *aux_var_set, + std::vector *ops) const; + + void InserInputAndOutputForOptOps(const std::vector &opt_ops, + ir::Node *opt_node) const; + + private: + virtual const std::string GetOpType() const = 0; + + virtual const std::vector GetAuxiliaryVarNames() const = 0; + + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const = 0; + + void GetSpecifiedOpsAndVars( + const std::string &op_type, const std::vector &aux_vars_name, + ir::Node *node, std::vector *ops, + std::unordered_map> *aux_args_name) + const; + + void AppendAllocContinuousSpace(const std::vector &args, + const std::string &out_arg, bool copy_data, + BlockDesc *global_block) const; + + void InitFusedVarsAndAllocSpaceForVars( + const std::vector &places, + const std::vector &local_scopes, + const std::vector &aux_var_names, + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name) + const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f91c21e3cc869de1a6d67146eb99f27a2ca5497c --- /dev/null +++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fuse_sgd_op_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace details { + +const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; } + +const std::vector FuseSgdOpPass::GetAuxiliaryVarNames() const { + return {"Param"}; +} + +void FuseSgdOpPass::FuseOptimizerOps( + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name, + const std::vector &sgd_ops, ir::Graph *graph) const { + FuseSgdOps(aux_var_set, fused_vars_name, sgd_ops, graph); +} + +void FuseSgdOpPass::FuseSgdOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &sgd_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast(0)); + + // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var + // node. + + int op_role = boost::get( + sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + VLOG(10) << "Insert sgd to graph "; + // Add fused scale + OpDesc Sgd_desc(sgd_ops[0]->Op()->Block()); + Sgd_desc.SetType("sgd"); + Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")}); + Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); + Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); + + // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. + Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate")); + + // NOTE: multi_devices_pass requires that every op should have a role. + Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto sgd_node = graph->CreateOpNode(&Sgd_desc); + + InserInputAndOutputForOptOps(sgd_ops, sgd_node); +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::details::FuseSgdOpPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.h b/paddle/fluid/framework/details/fuse_sgd_op_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..b3aa6a203b726a5a1540ce533c0305d7f579d4a9 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.h @@ -0,0 +1,50 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +class FuseSgdOpPass : public FuseOptimizerOpPass { + private: + virtual const std::string GetOpType() const; + + virtual const std::vector GetAuxiliaryVarNames() const; + + // Fuse Sgd Ops + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &sgd_ops, ir::Graph *graph) const; + + void FuseSgdOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &sgd_ops, ir::Graph *graph) const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 644cd4e15083519d6c685ae3e6a0737692018a07..a57d670f118f2eb0bdcbeb7ed080729e4f9e4f2b 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -24,6 +24,19 @@ namespace paddle { namespace framework { namespace details { +// Note(zcd): Addresses should be aligned, otherwise, the results may have +// diff. +static size_t Alignment(size_t size, const platform::Place &place) { + // Allow to allocate the minimum chunk size is 4 KB. + size_t alignment = 1 << 12; + if (platform::is_gpu_place(place)) { + // Allow to allocate the minimum chunk size is 256 B. + alignment = 1 << 8; + } + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + typedef std::vector>> GradientAndLoDTensor; @@ -111,10 +124,11 @@ void FusedAllReduceOpHandle::RunImpl() { return grad1.second->data() < grad2.second->data(); }); + size_t size_of_dtype = framework::SizeOfType(dtype); for (size_t k = 1; k < g_tensor.size(); ++k) { const void *cur_address = g_tensor.at(k - 1).second->data(); int64_t len = g_tensor.at(k - 1).second->numel(); - auto offset = len * framework::SizeOfType(dtype); + auto offset = Alignment(len * size_of_dtype, places_[0]); void *infer_next_address = reinterpret_cast( reinterpret_cast(cur_address) + offset); const void *next_address = g_tensor.at(k).second->data(); @@ -228,18 +242,21 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel( const std::vector> &grad_tensor, proto::VarType::Type *dtype, int64_t *numel) const { *numel = 0; + size_t size_of_dtype = 0; for (size_t i = 0; i < grad_tensor.size(); ++i) { - // Get element number - int64_t len = grad_tensor.at(i).second->numel(); - PADDLE_ENFORCE_GT(len, 0); - *numel += len; - // Get dtype auto ele_type = grad_tensor.at(i).second->type(); if (i == 0) { *dtype = ele_type; + size_of_dtype = framework::SizeOfType(ele_type); } PADDLE_ENFORCE_EQ(ele_type, *dtype); + + // Get element number + int64_t len = grad_tensor.at(i).second->numel(); + PADDLE_ENFORCE_GT(len, 0); + // Alignment(len) + *numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype; } } diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc index 88f26b41618e4b74766f2caa00ad29fd912f48f9..afbda33b0662e7831b7ea0d44dc7ae4ff3694b1c 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.cc +++ b/paddle/fluid/framework/details/inplace_op_pass.cc @@ -144,10 +144,9 @@ void InplacePass::InitSSAGraphNodes() const { } } -std::unique_ptr InplacePass::ApplyImpl( - std::unique_ptr graph) const { +void InplacePass::ApplyImpl(ir::Graph* graph) const { var_nodes_.clear(); - view_.Build(graph.get()); + view_.Build(graph); InitSSAGraphNodes(); auto cnt = 0; @@ -155,11 +154,9 @@ std::unique_ptr InplacePass::ApplyImpl( VLOG(4) << "Handle op " << cnt++ << ": " << op->Name(); if (FLAGS_enable_inplace_whitelist && !whitelist_.count(op->Name())) continue; - TryInplaceOpInputOutput(op, graph.get()); + TryInplaceOpInputOutput(op, graph); } // graph->ResolveHazard(var_nodes_); - - return graph; } void InplacePass::InplaceModifyDesc(const std::string& var, diff --git a/paddle/fluid/framework/details/inplace_op_pass.h b/paddle/fluid/framework/details/inplace_op_pass.h index 01964ba8fc43fa86bb99c185fa20b056fddbffee..fbec973ddaa7673601780810cfbbf8c1128af513 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.h +++ b/paddle/fluid/framework/details/inplace_op_pass.h @@ -69,8 +69,7 @@ class InplacePass : public ir::Pass { InplacePass(); protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; void InitSSAGraphNodes() const; diff --git a/paddle/fluid/framework/details/memory_optimize_pass.cc b/paddle/fluid/framework/details/memory_optimize_pass.cc index 80720af32d5670928a6ad2b9efbeadf6452b0273..ddaef206028b16dd10c2beb57ce6bf30103a8d10 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.cc +++ b/paddle/fluid/framework/details/memory_optimize_pass.cc @@ -44,8 +44,7 @@ namespace paddle { namespace framework { namespace details { -std::unique_ptr MemoryOptimizePass::ApplyImpl( - std::unique_ptr graph) const { +void MemoryOptimizePass::ApplyImpl(ir::Graph* graph) const { auto nodes = graph->Nodes(); CollectSkipVarsSet(nodes); @@ -113,7 +112,7 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( cfg_->RenameVarInCFGGraph(var_name, cache_name, idx); RenameVarInGraphDesc(var_name, cache_name, idx); - RenameVarInGraphNode(var_name, cache_name, idx, graph.get()); + RenameVarInGraphNode(var_name, cache_name, idx, graph); pool_.Erase(cache_name); } } @@ -128,8 +127,6 @@ std::unique_ptr MemoryOptimizePass::ApplyImpl( } } graph->ResolveHazard(var_nodes_); - - return graph; } void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { diff --git a/paddle/fluid/framework/details/memory_optimize_pass.h b/paddle/fluid/framework/details/memory_optimize_pass.h index 593ffc10fc99d26b1ee9174ceef081581126e7e8..ce94890b3856fa6bf167b8a08c814f81e422c372 100644 --- a/paddle/fluid/framework/details/memory_optimize_pass.h +++ b/paddle/fluid/framework/details/memory_optimize_pass.h @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -35,8 +36,7 @@ namespace details { class MemoryOptimizePass : public ir::Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; // fill the variable map(var_nodes) by version. void InitSSAGraphNodes() const; diff --git a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc index 67aad9f94f088f4b50e1ce2728d83de98a3c60ad..ae363f96393bddac4c88c7caf0ef6087ea848fb9 100644 --- a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc +++ b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc @@ -34,8 +34,7 @@ static bool IsLockAndRecordEventFreeComputationOpHandle( return true; } -std::unique_ptr ModifyOpLockAndRecordEventPass::ApplyImpl( - std::unique_ptr ir_graph) const { +void ModifyOpLockAndRecordEventPass::ApplyImpl(ir::Graph *ir_graph) const { auto all_ops = ir::FilterByNodeWrapper(*ir_graph); OpGraphView graph_view(all_ops); for (auto &op : all_ops) { @@ -49,7 +48,6 @@ std::unique_ptr ModifyOpLockAndRecordEventPass::ApplyImpl( << compute_op->DebugString(); } } - return ir_graph; } } // namespace details diff --git a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h index b54e1b318be95e1e0abf6830f8c918895df02718..54d52d6240a830dfc66f13c26fb79a896897f980 100644 --- a/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h +++ b/paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h @@ -23,8 +23,7 @@ namespace details { class ModifyOpLockAndRecordEventPass : public ir::Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace details diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc index a4bb1e26d933946b7ca36196d1c0e8a0a4ec54e2..9859b04dec4193812769cc63d4489a9150b973f2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc @@ -23,10 +23,8 @@ namespace details { class SSAGraghBuilderWithChecker : public ir::Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override { - PADDLE_ENFORCE(IsValidGraph(graph.get())); - return graph; + void ApplyImpl(ir::Graph *graph) const override { + PADDLE_ENFORCE(IsValidGraph(graph)); } bool IsValidGraph(const ir::Graph *graph) const { diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 8c61684c9c9643476bf2fe780172c9c34a2e1986..f80a098bfa26f160d6008cdefbad1803a85f9161 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -153,8 +153,7 @@ void MultiDevSSAGraphBuilderBase::Init() const { PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); } -std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( - std::unique_ptr graph) const { +void MultiDevSSAGraphBuilderBase::ApplyImpl(ir::Graph *graph) const { Init(); CheckGraph(*graph); std::vector sorted_ops = SortOperations(*graph); @@ -236,7 +235,6 @@ std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( AddOutputToLeafOps(&result); result.Erase(kGraphOps); - return graph; } void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 8bfd7b9bf8f96c074996486e70184ca56c6f7167..611693fc7c241f0afed39ab86390df69b9cf4797 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -20,7 +20,6 @@ #include #include #include - #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph.h" @@ -34,10 +33,13 @@ namespace framework { class Scope; namespace details { +constexpr char kLossVarName[] = "loss_var_name"; +constexpr char kStrategy[] = "strategy"; +constexpr char kNRanks[] = "nranks"; + class MultiDevSSAGraphBuilderBase : public ir::Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph *graph) const override; virtual void Init() const; diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index e82eb104fa9f461ec370fc4b31551dd1a9214a7c..34c38ea81a9e4832f7e1b63e1e6db4ea27704c34 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -13,7 +13,9 @@ // limitations under the License. #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" +#include #include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h index b06c87a5c185c550818af0bdeacd0070d1d90e4e..6d57d75e8a5541ac39e6dbe231c3f47daaa4206a 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include "paddle/fluid/framework/details/multi_devices_helper.h" @@ -40,13 +41,11 @@ class GraphvizSSAGraphPrinter : public SSAGraphPrinter { class SSAGraghBuilderWithPrinter : public ir::Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override { + void ApplyImpl(ir::Graph* graph) const override { std::unique_ptr fout( new std::ofstream(Get(kGraphvizPath))); PADDLE_ENFORCE(fout->good()); Get("graph_printer").Print(*graph, *fout); - return graph; } }; diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index ab5e099023382c4e28a9613d321ea8dc182d3534..6e6ef074db3450ebbb5567743b908e0aee382c27 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -20,7 +20,6 @@ #include #include #include - #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/var_handle.h" @@ -41,22 +40,25 @@ namespace details { // `std::vector` is the version of varaibles. typedef std::vector>> GraphVars; -const char kGraphVars[] = "vars"; - -// aux variables to represent dependency. Useful to resolve data hazard. -typedef std::unordered_set GraphDepVars; -const char kGraphDepVars[] = "dep_vars"; +constexpr char kGraphVars[] = "vars"; -constexpr char kNCCLCtxs[] = "nccl_ctxs"; - -constexpr char kLossVarName[] = "loss_var_name"; constexpr char kPlaces[] = "places"; constexpr char kLocalScopes[] = "local_scopes"; -constexpr char kStrategy[] = "strategy"; -constexpr char kNRanks[] = "nranks"; +constexpr char kNCCLCtxs[] = "nccl_ctxs"; + +// aux variables to represent dependency. Useful to resolve data hazard. +typedef std::unordered_set GraphDepVars; +constexpr char kGraphDepVars[] = "dep_vars"; typedef std::unordered_set FusedVars; constexpr char kFusedVars[] = "fused_vars"; +constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@"; + +typedef std::string FusedOptType; +constexpr char kFusedOptType[] = "fused_opt_type"; + +typedef std::string FusedGrads; +constexpr char kFusedGrads[] = "fused_gradients"; typedef std::vector> ParamsAndGrads; constexpr char kParamsAndGrads[] = "params_grads"; @@ -65,8 +67,6 @@ typedef std::vector>> GroupGradsAndParams; constexpr char kGroupGradsAndParams[] = "group_grads_params"; -constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@"; - } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 2afac32437dd79a54ef7d1ee2d203a34c1b5f30e..137e0dd7708dcc77c3a927979cfb357249f1fdc9 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -96,7 +96,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( auto seq_allreduce_pass = ir::PassRegistry::Instance().Get("all_reduce_deps_pass"); for (size_t i = 0; i < graphs_.size(); ++i) { - graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i])); + graphs_[i].reset(seq_allreduce_pass->Apply(graphs_[i].release())); } // set the correct size of thread pool to each device. diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 0c3d8d5caec0015c5696223db2e4b75a8d79e5e1..25337872c10f932b6e9ecf4f0a6fb9bed332b11c 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -266,8 +266,7 @@ static bool ShrinkNoNeedBufferVarOpDependency( } } -std::unique_ptr ReferenceCountPass::ApplyImpl( - std::unique_ptr graph) const { +void ReferenceCountPass::ApplyImpl(ir::Graph *graph) const { auto &ref_cnts = Get>(kGlobalReferenceCount); auto &last_live_ops_of_vars = Get>(kLastLiveOpsOfVars); @@ -335,14 +334,13 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( var_name); ref_cnts[i].emplace(var_name, result.size()); last_live_ops_of_vars[i].emplace(var_name, std::move(result)); + break; } // Seldomly, all preceding trying failed. // Just skip this corner case } } - - return graph; } } // namespace details diff --git a/paddle/fluid/framework/details/reference_count_pass.h b/paddle/fluid/framework/details/reference_count_pass.h index bcbef027354ef5a5fcc7da28103a9565982c7631..7bb01ee6161eda944006d8d3d0fe6e9f91befcee 100644 --- a/paddle/fluid/framework/details/reference_count_pass.h +++ b/paddle/fluid/framework/details/reference_count_pass.h @@ -23,8 +23,7 @@ namespace details { class ReferenceCountPass : public ir::Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace details diff --git a/paddle/fluid/framework/details/sequential_execution_pass.cc b/paddle/fluid/framework/details/sequential_execution_pass.cc index 0b53a76e7877891509ea4d0334673ae2a1fcf949..839f8dc43ed8c6f13380732b221520b3bb59b099 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.cc +++ b/paddle/fluid/framework/details/sequential_execution_pass.cc @@ -29,8 +29,7 @@ static bool IsSameOpDesc(OpDesc *op1, OpDesc *op2) { op1->Outputs() == op2->Outputs(); } -std::unique_ptr SequentialExecutionPass::ApplyImpl( - std::unique_ptr graph) const { +void SequentialExecutionPass::ApplyImpl(ir::Graph *graph) const { // FIXME(zjl): Insert dependencies between some distributed ops may cause // the multi_devices_graph_pass fails. So we skip these ops here. // Indeed, maybe we should not insert dependencies between these ops @@ -98,7 +97,6 @@ std::unique_ptr SequentialExecutionPass::ApplyImpl( VLOG(10) << "Add dependencies between " << op_node_list[i - 1]->Name() << " and " << op_node_list[i]->Name(); } - return graph; } } // namespace details diff --git a/paddle/fluid/framework/details/sequential_execution_pass.h b/paddle/fluid/framework/details/sequential_execution_pass.h index ea3034877fcea80de0124df64d8d23028bdcb7b3..7d6a4f4cc55698d80a60333d2e8d528b4a3b1641 100644 --- a/paddle/fluid/framework/details/sequential_execution_pass.h +++ b/paddle/fluid/framework/details/sequential_execution_pass.h @@ -23,8 +23,7 @@ namespace details { class SequentialExecutionPass : public ir::Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace details diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c4254bbadfa17682f437f46f02adc9c884d24304..c00932a7bdb170e63b5fd4d43ccb2072f1a0a9c9 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -24,13 +24,13 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, ir::Graph *graph) : graph_(graph), - pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) - : nullptr), - prepare_pool_(1), local_scopes_(local_scopes), places_(places), fetch_ctxs_(places), - strategy_(strategy) { + strategy_(strategy), + prepare_pool_(1), + pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) + : nullptr) { PrepareOpDeps(); CopyOpDeps(); } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index b9bccba8fa2fa13d99a9a39a5135106101daa903..1fa5196970512ccc4a3dee698f477711be1e7101 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -63,13 +63,20 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { details::OpHandleBase *op); private: + // Note(zcd): the ThreadPool should be placed last so that ThreadPool should + // be destroyed first. ir::Graph *graph_; - std::unique_ptr<::ThreadPool> pool_; - ::ThreadPool prepare_pool_; std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; ExceptionHolder exception_holder_; + std::unique_ptr op_deps_; + std::future> op_deps_futures_; + ExecutionStrategy strategy_; + // use std::list because clear(), push_back, and for_each are O(1) + std::list> run_op_futures_; + ::ThreadPool prepare_pool_; + std::unique_ptr<::ThreadPool> pool_; void InsertPendingOp(std::unordered_map *pending_ops, OpHandleBase *op_instance) const; @@ -88,14 +95,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { void PrepareOpDeps(); void CopyOpDeps(); - - private: - std::future> op_deps_futures_; - - ExecutionStrategy strategy_; - std::unique_ptr op_deps_; - // use std::list because clear(), push_back, and for_each are O(1) - std::list> run_op_futures_; }; } // namespace details diff --git a/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc index fd6b6dd2274d9721b8754e16cd7b4f1ab596380d..8f7c99f12a6338ad99d988d3eda3759e323f64bb 100644 --- a/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc +++ b/paddle/fluid/framework/details/while_op_eager_deletion_pass.cc @@ -23,8 +23,7 @@ namespace details { class WhileOpEagerDeletionPass : public ir::Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override { + void ApplyImpl(ir::Graph *graph) const override { auto all_ops = ir::FilterByNodeWrapper(*graph); // Find all while_op and while_grad_op @@ -50,7 +49,6 @@ class WhileOpEagerDeletionPass : public ir::Pass { operators::PrepareSafeEagerDeletionOnWhileOpAndWhileGradOp( while_ops, while_grad_ops); } - return graph; } }; diff --git a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc index 83b0da0c0118a856e54d744607cee8b421f330a3..39077f6420613e115fff828eefc295769c187833 100644 --- a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc +++ b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc @@ -29,10 +29,9 @@ namespace ir { GET_IR_NODE(elementwise_mul); \ GET_IR_NODE(elementwise_mul_out); -std::unique_ptr AnakinFillconstantElementwisemulFuse::ApplyImpl( - std::unique_ptr graph) const { +void AnakinFillconstantElementwisemulFuse::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse"; - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() @@ -69,12 +68,11 @@ std::unique_ptr AnakinFillconstantElementwisemulFuse::ApplyImpl( IR_NODE_LINK_TO(scale_op, elementwise_mul_out); // Output // Delete the unneeded nodes. - GraphSafeRemoveNodes(graph.get(), + GraphSafeRemoveNodes(graph, {fill_constant, fill_constant_out, elementwise_mul}); }; - gpd(graph.get(), handler); - return graph; + gpd(graph, handler); } } // namespace ir diff --git a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h index fa95143d3adae3e3eeb913af09986fb4a401bd73..14c07c5884ebeda602953704de6db42f16441d6e 100644 --- a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h +++ b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h @@ -26,8 +26,7 @@ class AnakinFillconstantElementwisemulFuse : public FusePassBase { virtual ~AnakinFillconstantElementwisemulFuse() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc index a9897e0bb884c9cc8ee9a288bbef9e067d789cb5..5a82d7927f4cf3ca7e7b27ecdb71eab69e007efb 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/attention_lstm_fuse_pass.h" #include +#include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -253,8 +254,7 @@ void PrepareLSTMBias(const LoDTensor& B_forget, const LoDTensor& B_input, // Parameters -std::unique_ptr AttentionLSTMFusePass::ApplyImpl( - std::unique_ptr graph) const { +void AttentionLSTMFusePass::ApplyImpl(ir::Graph* graph) const { PDPattern external_pattern, subblock_pattern; // Use the following variables to tell whether this model is RNN1. @@ -269,12 +269,11 @@ std::unique_ptr AttentionLSTMFusePass::ApplyImpl( } } if (count < specified_vars.size()) { - return graph; + return; } // Continue to fuse. - FindWhileOp(graph.get()); - return graph; + FindWhileOp(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h index 39b0585d3a6f9b52c9ec4b0a24f8532a3410851a..47ed9f0393fb222e612ed3bce1afbc879edb410d 100644 --- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.h @@ -22,8 +22,7 @@ namespace ir { class AttentionLSTMFusePass : public FusePassBase { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index a7bfb8cf1ee09e78051e2f140c9a7ab4c40db60c..fecc159adef1992a90b6ee88b3b7ffceea116243 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -77,10 +77,9 @@ void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, weights_array_2d.colwise() *= scale_array; } -std::unique_ptr ConvAffineChannelFusePass::ApplyImpl( - std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init(name_scope_, graph.get()); +void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); PADDLE_ENFORCE(scope); @@ -139,7 +138,7 @@ std::unique_ptr ConvAffineChannelFusePass::ApplyImpl( desc.SetAttr("axis", 1); auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. - GraphSafeRemoveNodes(graph.get(), {ac_scale, ac_bias, affine_channel}); + GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); IR_NODE_LINK_TO(conv_out, eltwise_op); IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); @@ -147,16 +146,14 @@ std::unique_ptr ConvAffineChannelFusePass::ApplyImpl( found_conv_ac_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_conv_ac_count); - return graph; } -std::unique_ptr ConvEltwiseAddAffineChannelFusePass::ApplyImpl( - std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init(name_scope_, graph.get()); +void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); PADDLE_ENFORCE(scope); @@ -199,7 +196,7 @@ std::unique_ptr ConvEltwiseAddAffineChannelFusePass::ApplyImpl( eltwise->Op()->SetAttr("axis", 1); eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); - GraphSafeRemoveNodes(graph.get(), + GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel, eltwise_out}); IR_NODE_LINK_TO(eltwise, ac_out); @@ -207,9 +204,8 @@ std::unique_ptr ConvEltwiseAddAffineChannelFusePass::ApplyImpl( found_conv_ac_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_conv_ac_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h index 8c3c8b56c08cc09e66b20d17bf730edec0499f35..d607020a47b8c589775ac763f04e64272dfec4e0 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h @@ -31,8 +31,7 @@ class ConvAffineChannelFusePass : public FusePassBase { virtual ~ConvAffineChannelFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph*) const override; const std::string name_scope_{"conv_affine_channel_fuse"}; }; @@ -41,8 +40,7 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { virtual ~ConvEltwiseAddAffineChannelFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph*) const override; const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; }; diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 04765dd1440331fb37ed2eb05a9ce762eb2b81bc..876a9996456c256f9b5f511ecd792f915b74b0df 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -101,10 +101,9 @@ void recompute_bias_and_weights(const Scope* scope, weights_array_2d.colwise() *= variance_array; } -std::unique_ptr ConvBNFusePass::ApplyImpl( - std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init(name_scope_, graph.get()); +void ConvBNFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); PADDLE_ENFORCE(scope); @@ -187,7 +186,7 @@ std::unique_ptr ConvBNFusePass::ApplyImpl( std::vector({bn_out->Name()})); GraphSafeRemoveNodes( - graph.get(), + graph, {conv_out, bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance}); @@ -203,10 +202,9 @@ std::unique_ptr ConvBNFusePass::ApplyImpl( desc.SetAttr("axis", 1); auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. - GraphSafeRemoveNodes( - graph.get(), - {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out, - bn_variance_out, bn_saved_mean, bn_saved_variance}); + GraphSafeRemoveNodes(graph, {bn_scale, bn_bias, bn_mean, bn_variance, + batch_norm, bn_mean_out, bn_variance_out, + bn_saved_mean, bn_saved_variance}); IR_NODE_LINK_TO(conv_out, eltwise_op); IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); @@ -215,16 +213,14 @@ std::unique_ptr ConvBNFusePass::ApplyImpl( } }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_conv_bn_count); - return graph; } -std::unique_ptr ConvEltwiseAddBNFusePass::ApplyImpl( - std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init(name_scope_, graph.get()); +void ConvEltwiseAddBNFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); PADDLE_ENFORCE(scope); @@ -274,7 +270,7 @@ std::unique_ptr ConvEltwiseAddBNFusePass::ApplyImpl( eltwise->Op()->SetOutput("Out", std::vector({bn_out->Name()})); GraphSafeRemoveNodes( - graph.get(), + graph, {bn_scale, bn_bias, bn_mean, bn_variance, batch_norm, bn_mean_out, bn_variance_out, bn_saved_mean, bn_saved_variance, eltwise_out}); @@ -283,10 +279,9 @@ std::unique_ptr ConvEltwiseAddBNFusePass::ApplyImpl( found_conv_bn_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_conv_bn_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h index cf425a2730904d4ab21c33e66b72db0692cb087c..837a48ed7305f4176fc709ab2cb4edf68aeb9fa1 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.h @@ -31,8 +31,7 @@ class ConvBNFusePass : public FusePassBase { virtual ~ConvBNFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"conv_bn_fuse"}; }; @@ -41,8 +40,7 @@ class ConvEltwiseAddBNFusePass : public FusePassBase { virtual ~ConvEltwiseAddBNFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"conv_eltwiseadd_bn_fuse"}; }; diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc index 6e9905b7ecdba653bb4d8a4aa82234ffba5a9528..99bc5fe8c506bb69c0fefcfb9af6747ea7db38d7 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc @@ -50,10 +50,9 @@ framework::proto::OpDesc PrepareOpDesc( return *desc.Proto(); } -std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( - std::unique_ptr graph) const { +void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "conv_elementwise_add_act_fuse"; - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( @@ -95,7 +94,6 @@ std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( elementwise_add_out}); }; gpd(graph.get(), handler); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index c6121777e8d2c32193b5c170bb0fa3f0337c9bc3..b4d6f683ce747a35aea7b431165911d942bcf092 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -51,10 +51,9 @@ framework::proto::OpDesc PrepareOpDesc( return *desc.Proto(); } -std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( - std::unique_ptr graph) const { +void ConvElementwiseAdd2ActFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "conv_elementwise_add2_act_fuse"; - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( @@ -92,12 +91,10 @@ std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( // Delete the unneeded nodes. GraphSafeRemoveNodes( - graph.get(), - {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1, - elementwise_add_out, elementwise_add_out_1, act_op}); + graph, {conv_op, conv_out, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out, elementwise_add_out_1, act_op}); }; - gpd(graph.get(), handler); - return graph; + gpd(graph, handler); } } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h index 9259a4ac5c89b1a7d1413fb2eaaa5fc6a70348f2..ea9e465d8d765a298215db29c77aa58e727fd15e 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h @@ -25,8 +25,7 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase { virtual ~ConvElementwiseAdd2ActFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc index fe3b4fca79f372d570634a3c182a9ec3cf5522e1..ba0a2fb96458bd70105fa4d97114b609657b62f6 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -48,10 +48,9 @@ framework::proto::OpDesc PrepareOpDesc( return *desc.Proto(); } -std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( - std::unique_ptr graph) const { +void ConvElementwiseAddActFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "conv_elementwise_add_act_fuse"; - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() @@ -88,12 +87,11 @@ std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( IR_NODE_LINK_TO(new_conv_op, act_out); // Output // Delete the unneeded nodes. - GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op, - elementwise_add_out, act_op}); + GraphSafeRemoveNodes(graph, {conv_op, conv_out, elementwise_add_op, + elementwise_add_out, act_op}); }; - gpd(graph.get(), handler); - return graph; + gpd(graph, handler); } } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h index 9c0b50f155821cf2bd815a6fb087e3f6cc513641..8b34c3551d8f9b54f01e52cc0fc896901cd7df99 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h @@ -25,8 +25,7 @@ class ConvElementwiseAddActFusePass : public FusePassBase { virtual ~ConvElementwiseAddActFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc index 476c9dbc353f865916d0065bbce653d7b7204dce..8c491d4f58b4d3a1d93fe075fd0d118feeb6f8c2 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -30,10 +30,9 @@ namespace ir { GET_IR_NODE(elementwise_add_in_y); \ GET_IR_NODE(elementwise_add_out); -std::unique_ptr ConvElementwiseAddFusePass::ApplyImpl( - std::unique_ptr graph) const { +void ConvElementwiseAddFusePass::ApplyImpl(ir::Graph* graph) const { const std::string pattern_name = "conv_elementwise_add_fuse"; - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() @@ -76,11 +75,10 @@ std::unique_ptr ConvElementwiseAddFusePass::ApplyImpl( IR_NODE_LINK_TO(new_conv_op, elementwise_add_out); // Output // Delete the unneeded nodes. - GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op}); + GraphSafeRemoveNodes(graph, {conv_op, conv_out, elementwise_add_op}); }; - gpd(graph.get(), handler); - return graph; + gpd(graph, handler); } } // namespace ir diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h index bf43bd5ce2602a3e240c56f00f66f13b79151002..66a562cdd1948980a6792a53713cac947d72e7d6 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h @@ -25,8 +25,7 @@ class ConvElementwiseAddFusePass : public FusePassBase { virtual ~ConvElementwiseAddFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc index ba11f19c9273650113096be3fa23ca077bbc7dd9..3a6bbe65b369341c2a142dfcb261f5646d782796 100644 --- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc @@ -15,6 +15,8 @@ #include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h" #include #include +#include +#include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/operators/math/blas.h" @@ -201,7 +203,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, // Remove unneeded nodes. // TODO(jczaja): Proper removing of lookup table std::unordered_set marked_nodes( - //{lookup_table, mul, lstm, elementwise_add, fc_bias, W}); + // {lookup_table, mul, lstm, elementwise_add, fc_bias, W}); {mul, lstm, elementwise_add, fc_bias}); GraphSafeRemoveNodes(graph, marked_nodes); } else { @@ -224,15 +226,13 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, return fusion_count; } -std::unique_ptr EmbeddingFCLSTMFusePass::ApplyImpl( - std::unique_ptr graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void EmbeddingFCLSTMFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), - true /*with_fc_bias*/); + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/); AddStatis(fusion_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h index fde2a0a4eecdec9ad5ac58ad8e63c26cce482682..65cb4439727b466506af35df1bed609b18c06ee0 100644 --- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h @@ -32,8 +32,7 @@ class EmbeddingFCLSTMFusePass : public FusePassBase { virtual ~EmbeddingFCLSTMFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"embedding_fc_lstm_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 12b31da010c34a1e87a0ee449ca1cca2c33f113e..ca008763bff8ff89d5dba02e483090f2bec77592 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/fc_fuse_pass.h" #include +#include #include #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/platform/enforce.h" @@ -22,10 +23,9 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr FCFusePass::ApplyImpl( - std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("fc_fuse", graph.get()); +void FCFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("fc_fuse", graph); std::unordered_set nodes2delete; @@ -61,7 +61,7 @@ std::unique_ptr FCFusePass::ApplyImpl( desc.SetAttr("in_num_col_dims", mul->Op()->GetAttr("x_num_col_dims")); desc.SetType("fc"); auto fc_node = g->CreateOpNode(&desc); // OpDesc will be copied. - GraphSafeRemoveNodes(graph.get(), {mul, elementwise_add, mul_out}); + GraphSafeRemoveNodes(graph, {mul, elementwise_add, mul_out}); PADDLE_ENFORCE(subgraph.count(x)); IR_NODE_LINK_TO(subgraph.at(x), fc_node); @@ -72,10 +72,9 @@ std::unique_ptr FCFusePass::ApplyImpl( found_fc_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_fc_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.h b/paddle/fluid/framework/ir/fc_fuse_pass.h index 783a052edcf84c8c437a7b2e25f0d67c0366691e..0a0fcd2da8542b83e6b1239f9d822eb8637b8f5b 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_fuse_pass.h @@ -31,8 +31,7 @@ class FCFusePass : public FusePassBase { virtual ~FCFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc index 4e1e4e27f9ba932b56ecc25e816a2aee9d42362e..affe506910bbefc6244d85ff8c88cb33e05f8fe5 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc @@ -73,7 +73,7 @@ TEST(FCFusePass, basic) { int pre_nodes = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int after_nodes = graph->Nodes().size(); diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index a902b0b50cf27ff84877053aca2ff921cd00b833..5f660c6d366fe094aed84ed2aa2f05adcbebbc43 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/fc_gru_fuse_pass.h" #include +#include #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { @@ -39,7 +40,6 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, // Create New OpDesc auto gru_creater = [&](Node* gru, Node* x, Node* weight_x, Node* weight_h, Node* bias, Node* hidden, Node* fc_bias) { - OpDesc op_desc; op_desc.SetType("fusion_gru"); @@ -155,26 +155,22 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, return fusion_count; } -std::unique_ptr MulGRUFusePass::ApplyImpl( - std::unique_ptr graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void MulGRUFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), - false /*with_fc_bias*/); + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/); AddStatis(fusion_count); - return graph; } -std::unique_ptr FCGRUFusePass::ApplyImpl( - std::unique_ptr graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), - true /*with_fc_bias*/); + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/); AddStatis(fusion_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h index e359a3289440fffbec622488ecf3a7f49e986574..e11cdac7ea95219444c35bb8deef630fe29d3734 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.h @@ -30,8 +30,7 @@ class FCGRUFusePass : public FusePassBase { virtual ~FCGRUFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"fc_gru_fuse"}; }; @@ -42,8 +41,7 @@ class MulGRUFusePass : public FusePassBase { virtual ~MulGRUFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"fc_nobias_gru_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index f5c286486520391906a6cd7545041c8a7df614ea..babeba96149247fda20a1621a580cdcdbc2750d1 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/fc_lstm_fuse_pass.h" #include +#include #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { @@ -157,26 +158,22 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope, return fusion_count; } -std::unique_ptr MulLstmFusePass::ApplyImpl( - std::unique_ptr graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void MulLstmFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), - false /*with_fc_bias*/); + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), false /*with_fc_bias*/); AddStatis(fusion_count); - return graph; } -std::unique_ptr FCLstmFusePass::ApplyImpl( - std::unique_ptr graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(), - true /*with_fc_bias*/); + int fusion_count = + BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/); AddStatis(fusion_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h index 21482615a6efef930b7328594477a51f4aaf28e7..5dea7c91a860f0b9622610f12f195eafb9849555 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.h @@ -32,8 +32,7 @@ class FCLstmFusePass : public FusePassBase { virtual ~FCLstmFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"fc_lstm_fuse"}; }; @@ -43,8 +42,7 @@ class MulLstmFusePass : public FusePassBase { virtual ~MulLstmFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"fc_nobias_lstm_fuse"}; }; diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc index 648acc4a759417240d9a39749b059289182ebb1e..bd49673168377486cd81726ce623e7196270d6a0 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.cc @@ -15,6 +15,8 @@ #include "paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h" #include #include +#include +#include #include #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" @@ -23,29 +25,25 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr FuseElewiseAddActPass::ApplyImpl( - std::unique_ptr graph) const { +void FuseElewiseAddActPass::ApplyImpl(ir::Graph *graph) const { std::unordered_set act_types = {"relu", "scale"}; - graph = FuseActElewiseAdd(std::move(graph), act_types); - graph = FuseElewiseAddAct(std::move(graph), act_types); + graph = FuseActElewiseAdd(graph, act_types); + graph = FuseElewiseAddAct(graph, act_types); // backward { std::unordered_set in_place_act_types = {"relu_grad"}; - graph = FuseElewiseAddActInplaceGrad(std::move(graph), in_place_act_types); + graph = FuseElewiseAddActInplaceGrad(graph, in_place_act_types); } // Remove the removable intermediate_out. - RemoveIntermediateOut(graph.get()); - - return graph; + RemoveIntermediateOut(graph); } // ele_add(x, act(y)) -std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddAct( - std::unique_ptr graph, - const std::unordered_set &act_types) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("elewise_add_act", graph.get()); +ir::Graph *FuseElewiseAddActPass::FuseElewiseAddAct( + ir::Graph *graph, const std::unordered_set &act_types) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("elewise_add_act", graph); GraphPatternDetector gpd; auto *x = gpd.mutable_pattern() @@ -86,18 +84,17 @@ std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddAct( found_elewise_add_act_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_elewise_add_act_count); return graph; } // act(ele_add(x,y)) -std::unique_ptr FuseElewiseAddActPass::FuseActElewiseAdd( - std::unique_ptr graph, - const std::unordered_set &act_types) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("act_elewise_add", graph.get()); +ir::Graph *FuseElewiseAddActPass::FuseActElewiseAdd( + ir::Graph *graph, const std::unordered_set &act_types) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("act_elewise_add", graph); GraphPatternDetector gpd; auto *x = gpd.mutable_pattern() @@ -137,7 +134,7 @@ std::unique_ptr FuseElewiseAddActPass::FuseActElewiseAdd( found_elewise_add_act_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_elewise_add_act_count); return graph; @@ -146,11 +143,10 @@ std::unique_ptr FuseElewiseAddActPass::FuseActElewiseAdd( // the backward of act(ele_add(x,y)) // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"] // ele_add_grad: in["Y", "Out@GRAD"], out["X@GRAD", "Y@GRAD"] -std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( - std::unique_ptr graph, - const std::unordered_set &act_types) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("elewise_add_act_grad", graph.get()); +ir::Graph *FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( + ir::Graph *graph, const std::unordered_set &act_types) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("elewise_add_act_grad", graph); GraphPatternDetector gpd; auto *d_act_out = gpd.mutable_pattern() @@ -217,7 +213,7 @@ std::unique_ptr FuseElewiseAddActPass::FuseElewiseAddActInplaceGrad( found_elewise_add_act_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_elewise_add_act_count); return graph; diff --git a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h index 0fee5274478e8b8db852774077ff5979f0aaba25..dc73f1fda03e130c6876819d91897b497b8b321e 100644 --- a/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h +++ b/paddle/fluid/framework/ir/fuse_elewise_add_act_pass.h @@ -14,6 +14,8 @@ #pragma once #include +#include +#include #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" @@ -32,20 +34,16 @@ class FuseElewiseAddActPass : public FusePassBase { virtual ~FuseElewiseAddActPass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph *graph) const override; - std::unique_ptr FuseElewiseAddAct( - std::unique_ptr graph, - const std::unordered_set &act_types) const; + ir::Graph *FuseElewiseAddAct( + ir::Graph *graph, const std::unordered_set &act_types) const; - std::unique_ptr FuseActElewiseAdd( - std::unique_ptr graph, - const std::unordered_set &act_types) const; + ir::Graph *FuseActElewiseAdd( + ir::Graph *graph, const std::unordered_set &act_types) const; - std::unique_ptr FuseElewiseAddActInplaceGrad( - std::unique_ptr graph, - const std::unordered_set &act_types) const; + ir::Graph *FuseElewiseAddActInplaceGrad( + ir::Graph *graph, const std::unordered_set &act_types) const; /** * Remove the removable intermediate_out. diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc index fe844caed2e757fb080dcee398c8903b929b06e5..c4e6b6e6a52ec77c85c7c6162c4cbd006e47c502 100644 --- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h" #include #include +#include #include #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/enforce.h" @@ -23,20 +24,18 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr FuseReluDepthwiseConvPass::ApplyImpl( - std::unique_ptr graph) const { - graph = FuseReluDepthwiseConv(std::move(graph), true); - graph = FuseReluDepthwiseConv(std::move(graph), false); - return graph; +void FuseReluDepthwiseConvPass::ApplyImpl(ir::Graph *graph) const { + graph = FuseReluDepthwiseConv(graph, true); + graph = FuseReluDepthwiseConv(graph, false); } -std::unique_ptr FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( - std::unique_ptr graph, bool only_forward) const { - PADDLE_ENFORCE(graph.get()); +ir::Graph *FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( + ir::Graph *graph, bool only_forward) const { + PADDLE_ENFORCE(graph); if (only_forward) - FusePassBase::Init("relu_depthwise_conv_only_forward", graph.get()); + FusePassBase::Init("relu_depthwise_conv_only_forward", graph); else - FusePassBase::Init("relu_depthwise_conv", graph.get()); + FusePassBase::Init("relu_depthwise_conv", graph); /* x ---act--> y ---layer-> z +----------+ @@ -144,10 +143,9 @@ std::unique_ptr FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( } count++; }; - gpd(graph.get(), handler); - GraphSafeRemoveNodes(graph.get(), need_removed_nodes); + gpd(graph, handler); + GraphSafeRemoveNodes(graph, need_removed_nodes); AddStatis(count); - return graph; } diff --git a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h index efb49b8300e677f17d9e205800d837b88edfd2e9..d37c153dd2a05ecfc8f0626626bbc3ed2f85968b 100644 --- a/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h +++ b/paddle/fluid/framework/ir/fuse_relu_depthwise_conv_pass.h @@ -32,10 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase { virtual ~FuseReluDepthwiseConvPass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; - std::unique_ptr FuseReluDepthwiseConv( - std::unique_ptr graph, bool only_forward) const; + void ApplyImpl(ir::Graph* graph) const override; + ir::Graph* FuseReluDepthwiseConv(ir::Graph* graph, bool only_forward) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.cc b/paddle/fluid/framework/ir/graph_to_program_pass.cc index 3372dcd181d32d9d36eb590c9a4688d1f4c9357b..b0d056f2c0f8286caadfbfed3b55b19fcef34402 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass.cc @@ -15,7 +15,9 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include +#include #include +#include #include #include "paddle/fluid/framework/ir/graph.h" @@ -26,8 +28,7 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr GraphToProgramPass::ApplyImpl( - std::unique_ptr graph) const { +void GraphToProgramPass::ApplyImpl(ir::Graph* graph) const { // Remove the unneeded variables after memory optimization. std::unordered_set vars2remove; if (graph->Has(kGraphToProgramVarsToRemove)) { @@ -73,7 +74,6 @@ std::unique_ptr GraphToProgramPass::ApplyImpl( } program.CopyFrom(*program_pb); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/graph_to_program_pass.h b/paddle/fluid/framework/ir/graph_to_program_pass.h index 4c36c3a5da13aa9414a55604eb953302e738f014..52c8f4e0fcafcd42647b323a20fee7c7cf167b3a 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass.h +++ b/paddle/fluid/framework/ir/graph_to_program_pass.h @@ -26,7 +26,7 @@ const char kGraphToProgramSortKind[] = "__graph_to_program_sort_kind__"; class GraphToProgramPass : public Pass { protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc index 5d51d9751a28d2b1549096b1984d67b55f913da6..5ee6b8a5f1e4e7415adfac6b51e9d3ae8e3062a9 100644 --- a/paddle/fluid/framework/ir/graph_to_program_pass_test.cc +++ b/paddle/fluid/framework/ir/graph_to_program_pass_test.cc @@ -14,7 +14,9 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph_to_program_pass.h" +#include #include +#include #include #include "gtest/gtest.h" #include "paddle/fluid/framework/program_desc.h" @@ -84,7 +86,7 @@ TEST(GraphToProgramPass, Basic) { ProgramDesc compiled_prog; pass->SetNotOwned("program", &compiled_prog); - pass->Apply(std::move(g)); + pass->Apply(g.get()); std::vector ops = compiled_prog.Block(0).AllOps(); EXPECT_EQ(ops[0]->Type(), "op1"); EXPECT_EQ(ops[1]->Type(), "op2"); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index 87a28a2a66c93db763a148801876eb2fb4c61f66..f4df4cfeba66889f3bf547d989d27aa76587e6be 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include +#include #include - -#include "paddle/fluid/framework/ir/graph_viz_pass.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/inference/analysis/dot.h" #include "paddle/fluid/string/printf.h" @@ -38,8 +38,7 @@ std::string FormatName(const Node* node) { } } // namespace -std::unique_ptr GraphVizPass::ApplyImpl( - std::unique_ptr graph) const { +void GraphVizPass::ApplyImpl(ir::Graph* graph) const { const std::string graph_viz_path = Get(kGraphVizPath); VLOG(3) << "draw IR graph viz to " << graph_viz_path; std::unique_ptr fout(new std::ofstream(graph_viz_path)); @@ -82,7 +81,7 @@ std::unique_ptr GraphVizPass::ApplyImpl( {Dot::Attr("style", "filled,rounded"), Dot::Attr("shape", "box"), Dot::Attr("fillcolor", "yellow")}); - auto marked_nodes = ConsumeMarkedNodes(graph.get()); + auto marked_nodes = ConsumeMarkedNodes(graph); // Create nodes for (const Node* n : graph->Nodes()) { std::string node_id = FormatName(n) + "(" + std::to_string(n->id()) + ")"; @@ -115,8 +114,6 @@ std::unique_ptr GraphVizPass::ApplyImpl( } sout << dot.Build(); - - return graph; } GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes( @@ -135,4 +132,4 @@ GraphVizPass::marked_nodes_t GraphVizPass::ConsumeMarkedNodes( } // namespace paddle REGISTER_PASS(graph_viz_pass, paddle::framework::ir::GraphVizPass) - .RequirePassAttr(paddle::framework::ir::kGraphVizPath); \ No newline at end of file + .RequirePassAttr(paddle::framework::ir::kGraphVizPath); diff --git a/paddle/fluid/framework/ir/graph_viz_pass.h b/paddle/fluid/framework/ir/graph_viz_pass.h index e64916a5bb662e3b00cfe212f0bbbc537c7bc2cc..7091aa6a95bd9ebde10bfbd45c98f8757b9d06c4 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.h +++ b/paddle/fluid/framework/ir/graph_viz_pass.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include #include "paddle/fluid/framework/ir/graph.h" @@ -34,8 +35,7 @@ class GraphVizPass : public Pass { using marked_nodes_t = std::unordered_set; protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; // Tell whether there are any marked nodes in the graph. Consume the // corresponding attribute. diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc index 5bdc0c5faed7131b873edf9b43c847c010b6e3f3..a39901e63bf65f7c314595a5fb2cc31d00959bd5 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.cc @@ -20,9 +20,8 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr IdentityScaleOpCleanPass::ApplyImpl( - std::unique_ptr graph) const { - FusePassBase::Init("identity_scale_op_clean", graph.get()); +void IdentityScaleOpCleanPass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init("identity_scale_op_clean", graph); // pre_op -> scale_in -> scale_op -> scale_out // -> @@ -72,8 +71,7 @@ std::unique_ptr IdentityScaleOpCleanPass::ApplyImpl( IR_NODE_LINK_TO(pre_op_var, scale_out_var); }; - detector(graph.get(), handler); - return graph; + detector(graph, handler); } } // namespace ir diff --git a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h index 6da592561da1e4046acbfd86c04862f69b7a97a8..d66b411257e530fa5188091702b0b309652ffaa4 100644 --- a/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h +++ b/paddle/fluid/framework/ir/identity_scale_op_clean_pass.h @@ -22,8 +22,7 @@ namespace ir { class IdentityScaleOpCleanPass : public FusePassBase { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; private: virtual ~IdentityScaleOpCleanPass() = default; diff --git a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc index 6607c026a748576f38419b275d71217f3eee0c59..d76924116f6d6202557a0d76cfcdadba0a3a6de6 100644 --- a/paddle/fluid/framework/ir/infer_clean_graph_pass.cc +++ b/paddle/fluid/framework/ir/infer_clean_graph_pass.cc @@ -26,9 +26,9 @@ class InferCleanGraphPass : public FusePassBase { virtual ~InferCleanGraphPass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const { - FusePassBase::Init("original_graph", graph.get()); - PADDLE_ENFORCE(graph.get()); + void ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init("original_graph", graph); + PADDLE_ENFORCE(graph); auto is_valid_node = [](Node* x) { return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); @@ -46,11 +46,9 @@ class InferCleanGraphPass : public FusePassBase { } } - GraphSafeRemoveNodes(graph.get(), invalid_nodes); + GraphSafeRemoveNodes(graph, invalid_nodes); AddStatis(valid_op); - - return graph; } void CleanEdges(std::vector* nodes, diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 57cc98e2ca0175848aa62c62c8ad3b20594b3bde..bf6fe999c1e68c35bc2c19fe38646da93bb1e204 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -20,8 +20,7 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr IsTestPass::ApplyImpl( - std::unique_ptr graph) const { +void IsTestPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it " "for activations and pooling."; auto op_list = {"pool2d", "sigmoid", "logsigmoid", @@ -47,7 +46,6 @@ std::unique_ptr IsTestPass::ApplyImpl( } } } - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/is_test_pass.h b/paddle/fluid/framework/ir/is_test_pass.h index 99e76ca4a3de21e350e68e05e0f241937a743b9e..80cedbf9f850f6fe31c9f2898264e19ebf931c72 100644 --- a/paddle/fluid/framework/ir/is_test_pass.h +++ b/paddle/fluid/framework/ir/is_test_pass.h @@ -22,8 +22,7 @@ namespace ir { class IsTestPass : public Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index 9696441a21661db89146c448742a992d1f7df022..3fa543c6221ae6ada8afddcf4563c1174127c221 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -97,7 +97,7 @@ TEST(IsTestPass, basic) { auto pass = PassRegistry::Instance().Get("is_test_pass"); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); for (auto* node : graph->Nodes()) { if (node->IsOp()) { diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc index 92e897ca9ce02ed67f026fd08062842e3bafa098..05d23961a8b180381eef6372f7049bed2b530db7 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.cc +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.cc @@ -32,9 +32,8 @@ const char kSumGradOpName[] = "sum"; // other optimizers later. const char kOptimizerType[] = "sgd"; -std::unique_ptr LockFreeOptimizePass::ApplyImpl( - std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get()); +void LockFreeOptimizePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); // We could collect all weights' name from SGD, where // W1 <- SGD(W0, Grad0) @@ -92,14 +91,14 @@ std::unique_ptr LockFreeOptimizePass::ApplyImpl( // find the forward op related to the backward op ir::Node* forward_op = - FindForwardOpViaBackwardOp(graph.get(), backward_op); + FindForwardOpViaBackwardOp(graph, backward_op); VLOG(3) << "Found forward_op " << forward_op->Name(); PADDLE_ENFORCE(forward_op); Node* new_optimizer_node = CreateNewSGDNode( - graph.get(), forward_op, backward_op, node, opt_node); + graph, forward_op, backward_op, node, opt_node); PADDLE_ENFORCE(new_optimizer_node); } @@ -140,8 +139,6 @@ std::unique_ptr LockFreeOptimizePass::ApplyImpl( } } } - - return graph; } ir::Node* LockFreeOptimizePass::CreateNewSGDNode( diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h index f9157b10d9554092a5da6a6f73ecf7ceac1430dd..d1718857a5d84304c3c02e74c7ca79c24f367f8c 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -60,8 +60,7 @@ class LockFreeOptimizePass : public Pass { virtual ~LockFreeOptimizePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; private: // Create a new sgd node via current optimizer node diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index 5d0b294f6fec5f14dcddb91f8ceffb27fc833d4e..8ef3993b065bcd37dcd571ba5a284cd35cfe052d 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -38,10 +38,9 @@ LoDTensor tensor_apply_eltwise(const LoDTensor& vec_a, const LoDTensor& vec_b, return vec_y; } -std::unique_ptr ConvBiasFusePass::ApplyImpl( - std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init(name_scope_, graph.get()); +void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); auto* scope = param_scope(); PADDLE_ENFORCE(scope); @@ -99,7 +98,7 @@ std::unique_ptr ConvBiasFusePass::ApplyImpl( conv->Op()->SetOutput("Output", std::vector({eltwise_out->Name()})); - GraphSafeRemoveNodes(graph.get(), {eltwise, conv_out}); + GraphSafeRemoveNodes(graph, {eltwise, conv_out}); IR_NODE_LINK_TO(conv, eltwise_out); } else { @@ -123,14 +122,13 @@ std::unique_ptr ConvBiasFusePass::ApplyImpl( IR_NODE_LINK_TO(eltwise_bias, conv_bias_node); IR_NODE_LINK_TO(conv_bias_node, eltwise_out); - GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out}); + GraphSafeRemoveNodes(graph, {conv, eltwise, conv_out}); } found_conv_bias_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_conv_bias_count); - return graph; } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h index 0ef5c177bf98b354bb18fc1d2ec8e5bef4b58951..84106d0655d5578338da3b5993f3d2ec191542fd 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h @@ -29,8 +29,7 @@ class ConvBiasFusePass : public FusePassBase { virtual bool is_conv3d() const { return false; } protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"conv_bias_mkldnn_fuse"}; }; /* diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index 38b7fe52037c1a264e4251b7a54ef7569ee6d765..ff7f9190fdeb1648a7ff2c59a07bad399a03bf3f 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -13,10 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h" +#include #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/platform/place.h" -#include #include "paddle/fluid/framework/op_proto_maker.h" namespace paddle { @@ -103,7 +103,7 @@ void MainTest(bool convWithExistingBias) { int original_nodes_num = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index fb3db81347b102cfa264082b36a2e22ea8c22982..ef7874c1c0b21f7c4ce4a2883e6b8e3ba49bf2f7 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -16,8 +16,8 @@ #include #include #include +#include #include - #include "paddle/fluid/framework/ir/graph_traits.h" namespace paddle { @@ -327,17 +327,15 @@ GraphWithStats ResidualConnectionMKLDNNFusePass::FuseProjectionConv( get_node_from_elementwise_add); } -graph_ptr ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void ResidualConnectionMKLDNNFusePass::ApplyImpl(graph_ptr graph) const { + FusePassBase::Init(name_scope_, graph); auto fused_graph_with_stats = FuseConvAsY( name_scope_, - FuseConvAsX( - name_scope_, - FuseProjectionConv(name_scope_, std::make_pair(graph.get(), 0)))); + FuseConvAsX(name_scope_, + FuseProjectionConv(name_scope_, std::make_pair(graph, 0)))); std::cout << "Fused graph " << fused_graph_with_stats.second << std::endl; AddStatis(fused_graph_with_stats.second); - return graph; } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h index 6629dae425ae85446fe2f6c8c172ca53f5ae8bea..9bf1ae607937f0cae2fd312b0f6c7f7e14bd8fbf 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -27,7 +28,7 @@ namespace paddle { namespace framework { namespace ir { -using graph_ptr = std::unique_ptr; +using graph_ptr = ir::Graph*; using GraphWithStats = std::pair; void CorrectGraphEdges(Graph* graph, Node* from, Node* to); @@ -124,7 +125,7 @@ class ResidualConnectionMKLDNNFusePass : public FusePassBase { virtual ~ResidualConnectionMKLDNNFusePass() {} protected: - std::unique_ptr ApplyImpl(graph_ptr graph) const; + void ApplyImpl(graph_ptr graph) const; const std::string name_scope_{"residual_connection_fuse_pass"}; }; diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc index 433d89d8d3f20b3f87cd94901ebbf79cd99de813..8a13596cd50087475bf12b6cfa5920b82e24de31 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc @@ -148,7 +148,7 @@ void RunPassAndAssert(ProgramDesc* prog, const std::string& from, auto pass = PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); int original_nodes_num = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); EXPECT_TRUE(is_reachable(graph)(from, to)); @@ -258,7 +258,7 @@ TEST(ConvElementwiseAddMKLDNNFusePass, NoFusion) { auto pass = PassRegistry::Instance().Get("conv_elementwise_add_mkldnn_fuse_pass"); int original_nodes_num = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); EXPECT_TRUE(is_reachable(graph)("a", "g")); diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc index 4f4605398a665e63662a64a3a925c32d48f10952..dd0fb456040fcf4e135333f938f8e3bdb18b7bcf 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.cc @@ -21,10 +21,9 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr ConvReLUFusePass::ApplyImpl( - std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get()); +void ConvReLUFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("conv_relu_mkldnn_fuse", graph); GraphPatternDetector gpd; auto* conv_input = gpd.mutable_pattern() @@ -56,7 +55,7 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( OpDesc* desc = conv->Op(); desc->SetOutput("Output", std::vector({relu_out->Name()})); desc->SetAttr("fuse_relu", true); - GraphSafeRemoveNodes(graph.get(), {relu, conv_out}); + GraphSafeRemoveNodes(graph, {relu, conv_out}); PADDLE_ENFORCE(subgraph.count(conv_input)); IR_NODE_LINK_TO(conv, relu_out); @@ -64,10 +63,9 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( found_conv_relu_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_conv_relu_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h index fe585bd7c41bb32ae00462e989ab4c0051fc89a8..2174c22dbf53790015be4c651b6e0c40b8e159fb 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass.h @@ -31,8 +31,7 @@ class ConvReLUFusePass : public FusePassBase { virtual ~ConvReLUFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc index 06d56f6222e4bb9a9969d4ab2d260c97d1ce6c72..67a9957059a501f39f20c1de2ae17cafbe51a53a 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc @@ -88,7 +88,7 @@ TEST(ConvReLUFusePass, basic) { int original_nodes_num = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index b3a8c208916f699dc032496c6d0fa5bf86227903..dff98e523ac45ef79f3e8fd020ecd6cd7035cf92 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -216,19 +216,16 @@ void CPUQuantizePass::QuantizePool(Graph* graph) const { PrettyLogDetail("--- quantized %d pool2d ops", quantize_pool_count); } -std::unique_ptr CPUQuantizePass::ApplyImpl( - std::unique_ptr graph) const { +void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Quantizing the graph."; - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init(name_scope_, graph.get()); + PADDLE_ENFORCE(graph); + FusePassBase::Init(name_scope_, graph); PADDLE_ENFORCE(param_scope()); - QuantizeConv(graph.get(), false /* with_residual_data */); - QuantizeConv(graph.get(), true /* with_residual_data */); - QuantizePool(graph.get()); - - return graph; + QuantizeConv(graph, false /* with_residual_data */); + QuantizeConv(graph, true /* with_residual_data */); + QuantizePool(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index 9873bb04e138a745ac6aa44cf5791651ad897444..a178c4dc363f672fdc7c535954be0c5877a599ac 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -42,8 +42,7 @@ class CPUQuantizePass : public FusePassBase { virtual ~CPUQuantizePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; void QuantizeConv(Graph* graph, bool with_residual_data = false) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 0d0ed989012fced7f639c2bc12a3bafa6edf27f6..8716a412e4d5b96161c5b2e2ac06d6aa0b4e74e1 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -139,7 +139,7 @@ void MainTest(const ProgramDesc& prog, int conv_count, int pool_count, int original_nodes_num = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 511003dce59f91272802766544577e9c473a3a1d..79a8ac68b82fc79ec91c18ec96a04e1e676c8ba0 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -20,8 +20,7 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr CPUQuantizePlacementPass::ApplyImpl( - std::unique_ptr graph) const { +void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Marks operators which are to be quantized."; const auto& excluded_ids_list = Get>("quantize_excluded_op_ids"); @@ -43,7 +42,6 @@ std::unique_ptr CPUQuantizePlacementPass::ApplyImpl( } } } - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h index ef3861b2493b5b82620f8bec6e808c0c921a9680..008a462dc414c04f53315a8f262de15ab8fb7fb5 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.h @@ -25,8 +25,7 @@ namespace ir { */ class CPUQuantizePlacementPass : public Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc index 11d72a56bd66792ff3ed5cc8184f5b242d9cdba5..ba4d281f818bb752570e7b500013f5f58001307c 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass_tester.cc @@ -94,7 +94,7 @@ void MainTest(std::initializer_list quantize_enabled_op_types, pass->Set("quantize_excluded_op_ids", new std::unordered_set(quantize_excluded_op_ids)); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); unsigned use_quantizer_true_count = 0; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 6e74cc7787b73d06b1093ed4e846ab83b1234803..debbbd6440b05c3f8c0db708c8ad5c54e018f725 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -126,16 +126,13 @@ void CPUQuantizeSquashPass::Squash( found_squash_count); } -std::unique_ptr CPUQuantizeSquashPass::ApplyImpl( - std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("cpu_quantize_squash_pass", graph.get()); +void CPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("cpu_quantize_squash_pass", graph); std::unordered_map nodes_keep_counter; - FindNodesToKeep(graph.get(), &nodes_keep_counter); - Squash(graph.get(), &nodes_keep_counter); - - return graph; + FindNodesToKeep(graph, &nodes_keep_counter); + Squash(graph, &nodes_keep_counter); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h index b823a2cef35b2f9994df9c9473246db3d69843e7..e873994c57ea1a6aca4345d96438e8a7c569980b 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h @@ -34,8 +34,7 @@ class CPUQuantizeSquashPass : public FusePassBase { virtual ~CPUQuantizeSquashPass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; /* * For each dequantize's output find the number of operators it is an input to diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index 3cf51d97aa4b8be468b8c2a78dd17aafbbf0e15b..fda337066f4d43f88d0082b5bcebc587f0c7652b 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -125,7 +125,7 @@ void MainTest(const ProgramDesc& prog, int removed_nodes_num) { int original_nodes_num = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc index 7851e8c84bca2e3b05d3b1603eaa4c0ca5909e10..e854559ae7a8765da604c2043e8e4e8cedbbcf88 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.cc @@ -25,10 +25,9 @@ namespace ir { auto* id = subgraph.at(pattern.RetrieveNode(#id)); \ PADDLE_ENFORCE_NOT_NULL(id, "subgraph has no node %s", #id); -std::unique_ptr DepthwiseConvMKLDNNPass::ApplyImpl( - std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("depthwise_conv_mkldnn_pass", graph.get()); +void DepthwiseConvMKLDNNPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE(graph); + FusePassBase::Init("depthwise_conv_mkldnn_pass", graph); GraphPatternDetector gpd; auto* pattern = gpd.mutable_pattern(); @@ -45,9 +44,8 @@ std::unique_ptr DepthwiseConvMKLDNNPass::ApplyImpl( found_depthwise_conv_mkldnn_count++; }; - gpd(graph.get(), handler); + gpd(graph, handler); AddStatis(found_depthwise_conv_mkldnn_count); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h index 8ca6a7325186401c26eb7f9375cf83b7b97cc1c9..ca314afde57bbc5a339b2016a2540309b31f0598 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass.h @@ -25,8 +25,7 @@ class DepthwiseConvMKLDNNPass : public FusePassBase { virtual ~DepthwiseConvMKLDNNPass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc index 1783e3322b1df8125f580f09a12aefe64d246c1a..f2dfbc84a5a5a7feac2514731445eb191bd6f784 100644 --- a/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/depthwise_conv_mkldnn_pass_tester.cc @@ -86,7 +86,7 @@ TEST(DepthwiseConvMKLDNNPass, basic) { counters before{1, 1, 1, 1}; - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); // initialize counters before loop counters after{0, 0, 0, 0}; diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc index ccac65f3b3ad22d0f424ef9de9a7bd506e8ac862..500419e4b7819e576e4e9f2dcc9a01a414519ff8 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.cc @@ -14,13 +14,13 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h" #include +#include namespace paddle { namespace framework { namespace ir { -std::unique_ptr MKLDNNPlacementPass::ApplyImpl( - std::unique_ptr graph) const { +void MKLDNNPlacementPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Applies MKL-DNN placement strategy."; const auto& op_types_list = Get>("mkldnn_enabled_op_types"); @@ -37,7 +37,6 @@ std::unique_ptr MKLDNNPlacementPass::ApplyImpl( } } } - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h index c071d9aed20bd40f5c1076d2dc5d3098a4e65495..ffa62273ece084c6c60855f628b7a921a004ac3e 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass.h @@ -26,8 +26,7 @@ namespace ir { */ class MKLDNNPlacementPass : public Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc index b6ec7e4d68b95125d630ce4a60635eb7b711e820..5885f327e610a5c3d931a00b36066194dac8994a 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_placement_pass_tester.cc @@ -97,7 +97,7 @@ void MainTest(std::initializer_list mkldnn_enabled_op_types, pass->Set("mkldnn_enabled_op_types", new std::unordered_set(mkldnn_enabled_op_types)); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); unsigned use_mkldnn_true_count = 0; diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index 9e77f98e9efb2c770cbce3b988914ea473a96de1..dcc48fb934e7a06f2e85fa34fde335261f551415 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -16,8 +16,9 @@ #include #include +#include +#include #include - #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_proto_maker.h" @@ -68,8 +69,7 @@ VarDesc UpdateGradVarDesc( return *var_desc; } -std::unique_ptr BatchMergePass::ApplyImpl( - std::unique_ptr graph) const { +void BatchMergePass::ApplyImpl(ir::Graph* graph) const { int num_repeats = Get(kNumRepeats); std::vector forward_backward_ops; std::vector optimize_ops; @@ -325,7 +325,6 @@ std::unique_ptr BatchMergePass::ApplyImpl( } result.ResolveHazard(created); - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.h b/paddle/fluid/framework/ir/multi_batch_merge_pass.h index c1e5aef20dbc60c18ed03038818bfd8ab217bf28..a89616683d9c625111272fd8c1de237a5c9dbe8f 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.h +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.h @@ -36,7 +36,7 @@ class BatchMergePass : public Pass { virtual ~BatchMergePass() {} protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const override; + void ApplyImpl(Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 33ccee6aa0a94b8fd8308214d6144ae832d40bab..c0ed0519b1ff6aa5960c20e9af697fd1da74a8b5 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -18,8 +18,8 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { -std::unique_ptr Pass::Apply(std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty."); +Graph* Pass::Apply(Graph* graph) const { + PADDLE_ENFORCE(graph, "graph passed to Pass::Apply() cannot be empty."); for (const std::string& attr : required_pass_attrs_) { PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(), "Required pass atrribute %s not set.", attr); @@ -28,16 +28,16 @@ std::unique_ptr Pass::Apply(std::unique_ptr graph) const { PADDLE_ENFORCE(graph->Has(attr), "Required graph atrribute %s not set.", attr); } - auto* native_graph = graph.get(); - auto applied_graph = ApplyImpl(std::move(graph)); + auto* native_graph = graph; + ApplyImpl(graph); // TODO(panyx0718): Add more verifications. - PADDLE_ENFORCE(!HasCircle(*applied_graph), + PADDLE_ENFORCE(!HasCircle(*graph), "Illegal Pass. Generated graph shouldn't has cycle."); - PADDLE_ENFORCE(applied_graph.get() == native_graph, + PADDLE_ENFORCE(graph == native_graph, "Pass::Apply() cannot delete the passed graph and shouldn't " "return a new graph.(For the need of pybind11)"); applied_ = true; - return applied_graph; + return graph; } PassRegistry& PassRegistry::Instance() { diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 27746ff1453b1b336da8c31497c066c338843b68..6cbe9a8212775512431860591526b52665ec4037 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -16,8 +16,10 @@ limitations under the License. */ #include #include +#include #include - +#include +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" @@ -44,7 +46,7 @@ class Pass { std::string Type() const { return type_; } - std::unique_ptr Apply(std::unique_ptr graph) const; + Graph *Apply(Graph *graph) const; // Get a reference to the attributed previously set. template @@ -98,9 +100,8 @@ class Pass { } protected: - virtual std::unique_ptr ApplyImpl(std::unique_ptr graph) const { + virtual void ApplyImpl(Graph *graph) const { LOG(FATAL) << "Calling virtual Pass not implemented."; - return graph; } private: diff --git a/paddle/fluid/framework/ir/pass_test.cc b/paddle/fluid/framework/ir/pass_test.cc index 6ad7d1df8bdd016b617c820c022ef55f23ba21cd..87e3c96416926cb07550b1eb4d1fd3ec6131c8ec 100644 --- a/paddle/fluid/framework/ir/pass_test.cc +++ b/paddle/fluid/framework/ir/pass_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/pass.h" +#include #include +#include #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph.h" @@ -39,7 +41,7 @@ void BuildCircleGraph(Graph* g) { class TestPass : public Pass { protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const { + void ApplyImpl(ir::Graph* graph) const { graph->Set("copy_test_pass_attr", new int); graph->Set("copy_test_graph_attr", new int); @@ -48,7 +50,6 @@ class TestPass : public Pass { int test_graph_attr = graph->Get("test_graph_attr"); graph->Get("copy_test_graph_attr") = test_graph_attr + 1; - return graph; } }; @@ -58,7 +59,7 @@ TEST(PassTest, TestPassAttrCheck) { std::unique_ptr graph(new Graph(prog)); std::string exception; try { - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); } catch (paddle::platform::EnforceNotMet e) { exception = std::string(e.what()); } @@ -69,7 +70,7 @@ TEST(PassTest, TestPassAttrCheck) { pass->SetNotOwned("test_pass_attr", &val); try { - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); } catch (paddle::platform::EnforceNotMet e) { exception = std::string(e.what()); } @@ -78,14 +79,14 @@ TEST(PassTest, TestPassAttrCheck) { graph.reset(new Graph(prog)); graph->Set("test_graph_attr", new int); graph->Get("test_graph_attr") = 1; - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); ASSERT_EQ(graph->Get("copy_test_pass_attr"), 2); ASSERT_EQ(graph->Get("copy_test_graph_attr"), 2); // Allow apply more than once. graph.reset(new Graph(prog)); graph->Set("test_graph_attr", new int); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); pass = PassRegistry::Instance().Get("test_pass"); pass->SetNotOwned("test_pass_attr", &val); @@ -94,7 +95,7 @@ TEST(PassTest, TestPassAttrCheck) { graph->Set("test_graph_attr", new int); graph->Get("test_graph_attr") = 2; try { - auto tmp = pass->Apply(std::move(graph)); + pass->Apply(graph.release()); } catch (paddle::platform::EnforceNotMet e) { exception = std::string(e.what()); } diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index 84a4ff2de173d86184fcef53b8e55fe17958fb8c..00263b8a34851b6d4cf2aac1456b3b4514356acd 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h" #include // for max #include +#include #include #include "paddle/fluid/framework/lod_tensor.h" @@ -365,17 +366,14 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, return fusion_count; } -std::unique_ptr RepeatedFCReluFusePass::ApplyImpl( - std::unique_ptr graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void RepeatedFCReluFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); int fusion_count = 0; for (int i = MAX_NUM_FC; i > 1; --i) { fusion_count += - BuildFusion(graph.get(), name_scope_ + "/" + std::to_string(i), i); + BuildFusion(graph, name_scope_ + "/" + std::to_string(i), i); } AddStatis(fusion_count); - - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h index ede0bea07ff4130a0f6b3d21d6e34222a5013170..ae777bccebec9f99b4752fe495f96d3da38aac23 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.h @@ -31,8 +31,7 @@ class RepeatedFCReluFusePass : public FusePassBase { virtual ~RepeatedFCReluFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"repeated_fc_relu_fuse"}; }; diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc index 67b29512c4cf3512e4b2b4b5a18ba60a3d9120dc..c7cf9b0dc342bbfaa80b622d7dcd0f6348f78d42 100644 --- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc @@ -20,15 +20,13 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr RuntimeContextCachePass::ApplyImpl( - std::unique_ptr graph) const { +void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Applies Runtime Context Cache strategy."; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { n->Op()->SetAttr(kEnableCacheRuntimeContext, true); } } - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.h b/paddle/fluid/framework/ir/runtime_context_cache_pass.h index a6cf1a9ae5035f185dd3ab52bf0762a6eaf0f6e5..e4783166e0cbde0be9037df5afe3e903a40a2065 100644 --- a/paddle/fluid/framework/ir/runtime_context_cache_pass.h +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h @@ -23,8 +23,7 @@ namespace ir { class RuntimeContextCachePass : public Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc index 012e68036c35ccb27447129e49c407fe1c6f045c..b230c50167136d2616068078ce619e8362c38fde 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.cc @@ -12,13 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h" #include #include - +#include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" -#include "paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h" #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { @@ -178,9 +178,8 @@ PDNode* BuildFCPattern(PDPattern* pattern, PDNode* fc_x) { return fc_out; } -std::unique_ptr SeqConcatFcFusePass::ApplyImpl( - std::unique_ptr graph) const { - FusePassBase::Init("seq_concat_fc_fuse", graph.get()); +void SeqConcatFcFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init("seq_concat_fc_fuse", graph); GraphPatternDetector detector; auto* pattern = detector.mutable_pattern(); auto* concat_out = BuildSeqExpandConcatPattern(pattern); @@ -194,8 +193,8 @@ std::unique_ptr SeqConcatFcFusePass::ApplyImpl( int fuse_count{0}; - detector(graph.get(), [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* graph) { + detector(graph, [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { VLOG(4) << "get one concat pattern"; // fc GET_NODE(fc_w, detector.pattern()); @@ -246,8 +245,6 @@ std::unique_ptr SeqConcatFcFusePass::ApplyImpl( }); AddStatis(fuse_count); - - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h index 06e18f9dc327bf2ffaf8d2ab64edcbddea2eb04c..d68840a554777e64082f7f9e467221bc0948d9dd 100644 --- a/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h +++ b/paddle/fluid/framework/ir/seq_concat_fc_fuse_pass.h @@ -27,8 +27,7 @@ class SeqConcatFcFusePass : public FusePassBase { virtual ~SeqConcatFcFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc index 0a1f65d274708dd208d7783c6273160c4c61738a..3fd368741fb09d41351a97c5e9cf1a5436f350d0 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h" #include +#include #include "paddle/fluid/framework/lod_tensor.h" namespace paddle { @@ -83,14 +84,11 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope) { return fusion_count; } -std::unique_ptr SeqConvEltAddReluFusePass::ApplyImpl( - std::unique_ptr graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void SeqConvEltAddReluFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); - int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope()); + int fusion_count = BuildFusion(graph, name_scope_, param_scope()); AddStatis(fusion_count); - - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h index c36c6b76a238dd21eb0c9308e780761aa9e4e27a..fde9b586c85712b14d285cec49f9e09efad78fc7 100644 --- a/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqconv_eltadd_relu_fuse_pass.h @@ -28,8 +28,7 @@ class SeqConvEltAddReluFusePass : public FusePassBase { virtual ~SeqConvEltAddReluFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"seqconv_eltadd_relu_fuse"}; }; diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc index 63a0c24f2a6b6e1afe3d25210ec6eb3cbaac2f2f..4ac379eb0471ea1a8a72c393dad405be90b2fa33 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h" #include +#include #include #include "paddle/fluid/framework/lod_tensor.h" @@ -194,17 +195,14 @@ static int BuildFusion(Graph* graph, const std::string& name_scope, return fusion_count; } -std::unique_ptr SeqPoolConcatFusePass::ApplyImpl( - std::unique_ptr graph) const { - FusePassBase::Init(name_scope_, graph.get()); +void SeqPoolConcatFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); int fusion_count = 0; for (int i = MAX_CONCAT_INPUTS; i > 0; --i) { fusion_count += - BuildFusion(graph.get(), name_scope_ + "/" + std::to_string(i), i); + BuildFusion(graph, name_scope_ + "/" + std::to_string(i), i); } AddStatis(fusion_count); - - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h index a5db3528da36ad08bb7f4d2765ee78222c569a5c..40a9edc5e642320996f5bd3451479fe347f24081 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass.h @@ -42,8 +42,7 @@ class SeqPoolConcatFusePass : public FusePassBase { virtual ~SeqPoolConcatFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"seqpool_concat_fuse"}; }; diff --git a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc index 35d1d5129bba7043026e5489b806480775473257..d3668038518429ee04b6abba5b1f7f09eea1c9f3 100644 --- a/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/seqpool_concat_fuse_pass_tester.cc @@ -59,7 +59,7 @@ std::unique_ptr GetNumNodesOfBeforeAfter( const std::string& pass_type = "seqpool_concat_fuse_pass") { auto pass = PassRegistry::Instance().Get(pass_type); *before = graph->Nodes().size(); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); *after = graph->Nodes().size(); return graph; } diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc index 84fb8063e6f020d5ada2c6af7a0307360aa1c92c..e1ddc444707148b1b781a922429de13a715f3b60 100644 --- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc +++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc @@ -24,11 +24,11 @@ namespace framework { namespace ir { template -std::unique_ptr SimplifyAnakinDetectionPatternPass::ApplyImpl( - std::unique_ptr graph) const { +void SimplifyAnakinDetectionPatternPass::ApplyImpl( + ir::Graph *graph) const { const std::string pattern_name = "simplify_anakin_detection_pattern_pass" + std::to_string(times); - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; std::vector input_nodes; @@ -207,11 +207,10 @@ std::unique_ptr SimplifyAnakinDetectionPatternPass::ApplyImpl( multiclass_nms_out->inputs.push_back(detection_out_op); // Delete the unneeded nodes. - GraphSafeRemoveNodes(graph.get(), delete_nodes); + GraphSafeRemoveNodes(graph, delete_nodes); }; - gpd(graph.get(), handler); - return graph; + gpd(graph, handler); } template class SimplifyAnakinDetectionPatternPass<1>; diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h index 2338e4c38b253e2110addcca494e1cae5b58beaf..e4a266cbe843ac56a8c0e4fb1e6f166afea6bfac 100644 --- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h +++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h @@ -32,8 +32,7 @@ class SimplifyAnakinDetectionPatternPass : public FusePassBase { virtual ~SimplifyAnakinDetectionPatternPass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc index 78c8cabb10f5b7718375f8052644074869929d04..42f4a91a6f421c28826d62bf30cbd4b2cb73805a 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h" #include +#include #include #include "paddle/fluid/framework/lod_tensor.h" @@ -362,13 +363,10 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { return fusion_count; } -std::unique_ptr SquaredMatSubFusePass::ApplyImpl( - std::unique_ptr graph) const { - FusePassBase::Init(name_scope_, graph.get()); - int fusion_count = BuildFusion(graph.get(), name_scope_); +void SquaredMatSubFusePass::ApplyImpl(ir::Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + int fusion_count = BuildFusion(graph, name_scope_); AddStatis(fusion_count); - - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h index c21ba65c40a8d54c315ab347e5a8a3266a143779..b6165a512acdb9b6e3bdbf49196692ef83edb58f 100644 --- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h +++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.h @@ -31,8 +31,7 @@ class SquaredMatSubFusePass : public FusePassBase { virtual ~SquaredMatSubFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; const std::string name_scope_{"squared_mat_sub_fuse"}; }; diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc index b37003991505140b0d531a4ea2b481c6d4b09d75..f4f924a604a231d1a25e169c4dd13f51eb90f266 100644 --- a/paddle/fluid/framework/ir/sync_batch_norm_pass.cc +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.cc @@ -21,8 +21,7 @@ namespace paddle { namespace framework { namespace ir { -std::unique_ptr SyncBatchNormPass::ApplyImpl( - std::unique_ptr graph) const { +void SyncBatchNormPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Use synchronous batch norm"; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { @@ -35,7 +34,6 @@ std::unique_ptr SyncBatchNormPass::ApplyImpl( } } } - return graph; } } // namespace ir diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass.h b/paddle/fluid/framework/ir/sync_batch_norm_pass.h index 51cce3dca69330071f7d12efef08e2006e8bd7ac..694fae74943060880ef199298064d20c5a526d18 100644 --- a/paddle/fluid/framework/ir/sync_batch_norm_pass.h +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass.h @@ -23,8 +23,7 @@ namespace ir { class SyncBatchNormPass : public Pass { protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc index 9c94c1746a6590df5a43c099b9c4c3678ca6e393..894f96050edd607e1ea7df1c319cfeb3570662e5 100644 --- a/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc +++ b/paddle/fluid/framework/ir/sync_batch_norm_pass_tester.cc @@ -60,7 +60,7 @@ TEST(IsTestPass, basic) { auto pass = PassRegistry::Instance().Get("sync_batch_norm_pass"); - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); for (auto* node : graph->Nodes()) { if (node->IsOp()) { diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc index cab69c408defadad32eba83e47d18f0f82ccc771..61c12d4b6e76bf3021a92aa99953df626b0e45e7 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.cc @@ -26,11 +26,10 @@ namespace framework { namespace ir { template -std::unique_ptr TransposeFlattenConcatFusePass::ApplyImpl( - std::unique_ptr graph) const { +void TransposeFlattenConcatFusePass::ApplyImpl(ir::Graph *graph) const { const std::string pattern_name = "transpose_flatten" + std::to_string(times) + "_concat_fuse"; - FusePassBase::Init(pattern_name, graph.get()); + FusePassBase::Init(pattern_name, graph); GraphPatternDetector gpd; std::vector input_nodes; @@ -117,11 +116,10 @@ std::unique_ptr TransposeFlattenConcatFusePass::ApplyImpl( concat_out->inputs.push_back(new_conv_op); // Delete the unneeded nodes. - GraphSafeRemoveNodes(graph.get(), delete_nodes); + GraphSafeRemoveNodes(graph, delete_nodes); }; - gpd(graph.get(), handler); - return graph; + gpd(graph, handler); } template class TransposeFlattenConcatFusePass<1>; diff --git a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h index a7d18ec86da1c02aef84c25c378691eb8f651015..366d26d800c9899c455a3699f3f73f6e481aa0e0 100644 --- a/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h +++ b/paddle/fluid/framework/ir/transpose_flatten_concat_fuse_pass.h @@ -30,8 +30,7 @@ class TransposeFlattenConcatFusePass : public FusePassBase { virtual ~TransposeFlattenConcatFusePass() {} protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(ir::Graph* graph) const override; }; } // namespace ir diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 6d8ba430bd0b9c9e48b4a80a07feb24b2da7d7b8..a02e53dcf764368601646a900833ac650c5bb31a 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -365,6 +365,9 @@ class ExecutionContext { auto shared_allocation = std::shared_ptr( allocation_ptr, deleter); + PADDLE_ENFORCE( + dynamic_cast(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); PADDLE_ENFORCE_GE(allocation_ptr->size(), framework::product(dim) * sizeof(T)); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 20a8c47d5d85f5962d697b48ec1bdaad74cbe4d7..ab0947c631fe9a409406b3b092972ae6512beae7 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -77,8 +77,7 @@ class ParallelExecutorPrivate { } } - std::unique_ptr PrepareGCAndRefCnts( - std::unique_ptr graph, size_t max_memory_size); + ir::Graph *PrepareGCAndRefCnts(ir::Graph *graph, size_t max_memory_size); inline bool HasGarbageCollectors() const { return !gcs_.empty(); } @@ -118,8 +117,8 @@ class ParallelExecutorPrivate { details::GarbageCollectorMap gcs_; }; -std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( - std::unique_ptr graph, size_t max_memory_size) { +ir::Graph *ParallelExecutorPrivate::PrepareGCAndRefCnts( + ir::Graph *graph, size_t max_memory_size) { for (size_t i = 0; i < places_.size(); ++i) { auto &place = places_[i]; if (gcs_.count(place) > 0) { @@ -161,7 +160,7 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( &global_ref_cnts_); ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, &last_live_ops_of_vars); - graph = ref_cnt_pass->Apply(std::move(graph)); + graph = ref_cnt_pass->Apply(graph); VLOG(10) << "ReferenceCountPass Applied"; auto eager_deletion_pass = @@ -172,10 +171,9 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, &last_live_ops_of_vars); eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); - graph = eager_deletion_pass->Apply(std::move(graph)); + graph = eager_deletion_pass->Apply(graph); VLOG(10) << "EagerDeletionPass Applied"; } - return graph; } @@ -220,13 +218,11 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } } - std::unique_ptr temp_owned_graph(graph); - // FIXME(Yancey1989): parallel graph mode get better performance // in GPU allreduce distributed training. Need an elegant way to // choice the execution strategy. - build_strategy.enable_parallel_graph_ = EnableParallelGraphExecution( - *temp_owned_graph, exec_strategy, build_strategy); + build_strategy.enable_parallel_graph_ = + EnableParallelGraphExecution(*graph, exec_strategy, build_strategy); if (build_strategy.enable_parallel_graph_) VLOG(0) << "The Executor would execute the graph by ParallelGraph " "Execution which can get better performance," @@ -304,27 +300,21 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - - temp_owned_graph = build_strategy.Apply( - std::move(temp_owned_graph), member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, member_->use_cuda_, - member_->nccl_ctxs_.get()); + graph = build_strategy.Apply(graph, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_, member_->nccl_ctxs_.get()); #else - temp_owned_graph = build_strategy.Apply( - std::move(temp_owned_graph), member_->places_, loss_var_name, - member_->local_scopes_, member_->nranks_, member_->use_cuda_); + graph = build_strategy.Apply(graph, member_->places_, loss_var_name, + member_->local_scopes_, member_->nranks_, + member_->use_cuda_); #endif auto max_memory_size = GetEagerDeletionThreshold(); VLOG(10) << "Eager Deletion Threshold " << static_cast(max_memory_size) / (1 << 30); if (max_memory_size >= 0) { - graph = member_ - ->PrepareGCAndRefCnts(std::move(temp_owned_graph), - static_cast(max_memory_size)) - .release(); - } else { - graph = temp_owned_graph.release(); + graph = member_->PrepareGCAndRefCnts(graph, + static_cast(max_memory_size)); } // Step 3. Create vars in each scope. Passes may also create new vars. diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index ef096c2b810187c50fbcde7d93d9e5a2ecd8b0f3..ea7f8c496a9fc3ff78fce06b69fb21e44e5be9ee 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -70,7 +70,7 @@ Tensor& Tensor::ShareDataWith(const Tensor& src) { return *this; } -Tensor Tensor::Slice(int begin_idx, int end_idx) const { +Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const { check_memory_size(); PADDLE_ENFORCE_GE(begin_idx, 0, "The start row index must be greater than 0."); diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 88f5b757a8111f6a7e269ff71054dab425c0de01..0fa76f943ec1417dc712771565f7ff2b263e6365 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/ddim.h" @@ -27,10 +28,6 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_utils.h" -#endif - namespace paddle { namespace framework { @@ -41,34 +38,10 @@ class Tensor { #ifdef PADDLE_WITH_MKLDNN public: - // TODO(jczaja): This is depracted and will be removed - inline mkldnn::memory::format format() const { - if (layout_ == DataLayout::kMKLDNN) { - return static_cast(mem_pd_.desc().data.format); - } else { - return mkldnn::memory::format::format_undef; - } - } + inline mkldnn::memory::format format() const { return format_; } - // TODO(jczaja): This is depracted and will be removed - inline void set_format( - const mkldnn::memory::format fmt, - mkldnn::memory::data_type data_type = mkldnn::memory::f32) { - mem_pd_ = paddle::platform::create_prim_desc_from_format( - paddle::framework::vectorize2int(dims()), fmt, data_type); - layout_ = DataLayout::kMKLDNN; - } - - inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const { - return mem_pd_; - } - - inline void set_mkldnn_prim_desc( - const mkldnn::memory::primitive_desc& mem_pd) { - // Internally MKL-DNN is just copying (increasing reference counter) - // to shared_ptr. So asignment should be quite cheap - mem_pd_ = mem_pd; - layout_ = DataLayout::kMKLDNN; + inline void set_format(const mkldnn::memory::format format) { + format_ = format; } protected: @@ -76,9 +49,12 @@ class Tensor { * @brief the detail format of memory block which have layout as kMKLDNN * * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, - * nChw16c, etc. For a MKLDNN memory block, we store memory descriptor + * nChw16c, etc. For a MKLDNN memory block, layout will be set as + * DataLayout::kMKLDNN meanwhile detail memory format will be kept in + * this field. */ - mutable mkldnn::memory::primitive_desc mem_pd_; + + mkldnn::memory::format format_ = mkldnn::memory::format::format_undef; #endif public: @@ -157,7 +133,7 @@ class Tensor { * @param[in] end_idx The index of the end row(exclusive) to slice. * The index number begins from 0. */ - Tensor Slice(int begin_idx, int end_idx) const; + Tensor Slice(int64_t begin_idx, int64_t end_idx) const; platform::Place place() const { PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 5f21dae60586e926472fc512eca7bcbb55dc8eda..a7f09df4917532e7261cee471c711897c8eb3447 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -44,11 +44,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } -#ifdef PADDLE_WITH_MKLDNN - if (src.layout() == DataLayout::kMKLDNN) { - dst->set_mkldnn_prim_desc(src.get_mkldnn_prim_desc()); - } -#endif memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 7a96ac11d8ef754f38070862a70744947412882b..78e502c670f0eb2480b560964cf31e247990a367 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -140,7 +140,7 @@ std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { if (pass->Type() != "graph_viz_pass") { PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); } - graph = pass->Apply(std::move(graph)); + graph.reset(pass->Apply(graph.release())); } return graph; } @@ -156,7 +156,7 @@ framework::proto::ProgramDesc IRPassManager::AcquireProgram( desc.CopyFrom(*program->Proto()); pass->SetNotOwned("program", &desc); auto *the_graph = graph->release(); - *graph = pass->Apply(std::unique_ptr(the_graph)); + graph->reset(pass->Apply(the_graph)); return *desc.Proto(); } diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc index 12deed2533bba713701849d58f8c5cf3269b85da..9e05aa5c16186d67200c4630619cc53fa241aa1b 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -35,8 +35,8 @@ namespace analysis { using framework::ir::Node; -std::unique_ptr analysis::AnakinSubgraphPass::ApplyImpl( - std::unique_ptr graph) const { +void analysis::AnakinSubgraphPass::ApplyImpl( + framework::ir::Graph *graph) const { framework::ir::FusePassBase::Init("anakin_subgraph_pass", graph.get()); auto teller = [](const framework::ir::Node *node) { @@ -72,8 +72,6 @@ std::unique_ptr analysis::AnakinSubgraphPass::ApplyImpl( framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); graph->Set(framework::ir::kRepetitiveParamAttr, new std::vector(repetitive_params)); - - return graph; } std::string GenerateAnakinEngineKey(const std::set &engine_inputs, diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h index c13b9ecda42336a79187185070104ba9ac4b67bc..e80b8bb612096a1da7cd5835c948085d51fdfe7a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.h @@ -29,8 +29,7 @@ namespace analysis { class AnakinSubgraphPass : public framework::ir::FusePassBase { public: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(framework::ir::Graph *graph) const override; private: void CreateAnakinOp(framework::ir::Node *x, framework::ir::Graph *graph, diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 59399403276b59c143fc3e06a53643e0a85cf559..ef5872c52c6a1b3f3ade40ea43e78e2120fa6643 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -31,16 +31,16 @@ namespace analysis { using framework::ir::Node; -std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( - std::unique_ptr graph) const { - framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); +void analysis::TensorRtSubgraphPass::ApplyImpl( + framework::ir::Graph *graph) const { + framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph); auto teller = [](const framework::ir::Node *node) { if (!node->IsOp() || !node->Op()) return false; return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); }; - SubGraphFuser fuser(graph.get(), teller, + SubGraphFuser fuser(graph, teller, Get("min_subgraph_size") /*min subgraph size*/); fuser(); @@ -52,12 +52,11 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( for (auto *node : graph->Nodes()) { if (node->IsOp() && !Agent(node).subgraph()->empty()) { - CreateTensorRTOp(node, graph.get(), graph_param_names, - &repetitive_params); + CreateTensorRTOp(node, graph, graph_param_names, &repetitive_params); std::unordered_set nodes2remove( Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); - framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); } } @@ -67,11 +66,9 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( nodes2remove.insert(node); } } - framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); graph->Set(framework::ir::kRepetitiveParamAttr, new std::vector(repetitive_params)); - - return graph; } std::string GenerateEngineKey(const std::set &engine_inputs, diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index f043670c5af39c1bdf8d4f00c7294fb53a4c9039..f530a5a0b337666ba6c470fbf63247cc62041d82 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -28,8 +28,7 @@ namespace analysis { class TensorRtSubgraphPass : public framework::ir::FusePassBase { public: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override; + void ApplyImpl(framework::ir::Graph *graph) const override; private: void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph, diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc index 6b3d80fcef0be1527062edbb37ea39cc5d95a168..35df396fe89eb23317b8f086c668396fdb3a4559 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" +#include #include "paddle/fluid/framework/ir/graph_to_program_pass.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/program_desc.h" @@ -37,8 +38,7 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) { framework::ProgramDesc desc; desc.CopyFrom(*argument->main_program().Proto()); pass->SetNotOwned("program", &desc); - auto thegraph = pass->Apply(std::move(graph)); - thegraph.release(); // the argument still own the graph. + pass->Apply(graph.release()); // the argument still own the graph. argument->SetIrAnalyzedProgram( new framework::proto::ProgramDesc(*desc.Proto())); diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 0f6014ae8aa28f090cb51401ee2cb0772bca7a45..ac77c3d2a500816a4eb41ed13f23ee628290f287 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler) -cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator) if (WITH_GPU) @@ -38,20 +37,30 @@ else () set(AllocatorFacadeDeps) endif() -list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator) - cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator) +cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator) cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator) -cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) +cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS + ${AllocatorFacadeDeps} + cpu_allocator + locked_allocator + best_fit_allocator + aligned_allocator + auto_increment_allocator + zero_size_allocator + conditional_allocator + retry_allocator + buffered_allocator + allocator_strategy + legacy_allocator + ) nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade) cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator) -cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade) - cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade) cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade) diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h index b536d4276e3b6236d0748eee588d345dd15c6954..064acd06e71da98802126913e0af843cfbf717e7 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.h +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -94,8 +94,6 @@ class AlignedAllocator : public ThinAlignedAllocator { underlying_allocator_->Allocate(size + kAlignment, attr); return new AlignedAllocation(std::move(raw_allocation), size); } - - void FreeImpl(Allocation* allocation) override { delete allocation; } }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc index 5a5253d911abc722c026730e7e88eb326bb82afd..8fb8a5fb897a736d7515951ba08c633da9a7706c 100644 --- a/paddle/fluid/memory/allocation/allocator.cc +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -27,24 +27,16 @@ bool Allocator::IsAllocThreadSafe() const { return false; } AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) { auto ptr = AllocateImpl(size, attr); - ptr->RegisterDecoratedAllocator(this); + ptr->set_allocator(this); return AllocationPtr(ptr); } -void Allocator::FreeImpl(Allocation* allocation) { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); -} - -void Allocator::Free(Allocation* allocation) { - allocation->PopDecoratedAllocator(); - FreeImpl(allocation); -} +void Allocator::Free(Allocation* allocation) { delete allocation; } const char* BadAlloc::what() const noexcept { return msg_.c_str(); } void AllocationDeleter::operator()(Allocation* allocation) const { - Allocator* allocator = allocation->TopDecoratedAllocator(); + auto* allocator = allocation->allocator(); allocator->Free(allocation); } diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 33b816b90812d7fedc450a67743b5d7d20579302..3465278935f7ce05456e94bb3a7d1ae9f114ff96 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -46,56 +46,13 @@ class Allocator; // NOTE: this is the base class of Allocation. Each allocator can use its own // allocation object. // NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0 - -/** - * Allocation is returned by Allocator::Allocate() method. - * - * An allocator may be decorated by another allocator. For example, we can - * decorate - * a RetryAllocator to any allocator to perform allocation retry when first - * allocation request fails. - * - * Explanations of Allocator design is as follows: - * - * Suppose we have an allocator which is decorated by several allocators: - * - * A(1) <- A(2) <- A(3) <- ... <- A(n) - * - * , and the public allocator is A(1). - * - * The allocation process would be: - * - * A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate() - * - * , and the free process would be: - * - * A(1).Free() -> A(2).Free() -> ... -> A(n).Free() - * - * Therefore, we should record the allocator chain when allocating, so - * that we can free the allocation in the reverse order of allocator chain. - * The field `decorated_allocators_` is used to record this chain. - * - * Another example is that we want to add additional fields in Allocation, - * e.g., something what is done in AlignedAllocator, etc. - * In this case, we should declare a derived class of Allocation, which - * contains an underlying Allocation allocated by the underlying allocator. - * Therefore, `decorated_allocators_` of the new Allocation object would - * be a new chain, differing from the underlying Allocation object. - */ class Allocation { public: Allocation(void* ptr, size_t size, platform::Place place) - : ptr_(ptr), size_(size), place_(place) { - // NOTE(zjl): Since decorated_allocators_ is usually a small vector - // We reserve a small buffer to it to prevent frequent heap allocation - // Not quite sure whether we need something like gtl vector. - decorated_allocators_.reserve(8); - } + : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {} Allocation(const Allocation& o) = delete; Allocation& operator=(const Allocation& o) = delete; - Allocation(Allocation&& o) = delete; - Allocation& operator=(Allocation&& o) = delete; // Returns the holding pointer. // NOTE: For performance consideration, it is better not to make this method @@ -117,31 +74,17 @@ class Allocation { const platform::Place& place() const { return place_; } - virtual ~Allocation(); - - private: - const std::vector& DecoratedAllocators() const { - return decorated_allocators_; - } - - inline void RegisterDecoratedAllocator(Allocator* allocator) { - decorated_allocators_.push_back(allocator); - } + Allocator* allocator() { return allocator_; } - inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); } + void set_allocator(Allocator* allocator) { allocator_ = allocator; } - inline Allocator* TopDecoratedAllocator() { - return decorated_allocators_.back(); - } + virtual ~Allocation(); private: + Allocator* allocator_; void* ptr_; size_t size_; platform::Place place_; - std::vector decorated_allocators_; - - friend class Allocator; - friend class AllocationDeleter; }; using AllocationPtr = std::unique_ptr; @@ -191,12 +134,9 @@ class Allocator { // True if the `Allocate` is thread safe. virtual bool IsAllocThreadSafe() const; - // This function should not be called outside - void Free(Allocation* allocation); - protected: + virtual void Free(Allocation* allocation); virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0; - virtual void FreeImpl(Allocation* allocation); private: friend class AllocationDeleter; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 09328aded58cb0cccd9de0aba399f5c49313042f..a3b73e3ba31c89c2a94955b0fea64df4ab0ffc26 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -49,17 +49,6 @@ namespace paddle { namespace memory { namespace allocation { -static inline std::shared_ptr WrapRetryAllocator( - std::shared_ptr allocator, int64_t retry_time) { - if (retry_time > 0) { - auto* retry_allocator = - new RetryAllocator(std::move(allocator), retry_time); - allocator.reset(retry_allocator); - } - - return allocator; -} - // TODO(yy): Dirty code here. This class should be configurable in runtime. class CPUManagedAllocator : public Allocator { public: @@ -123,10 +112,14 @@ class ChunkedAllocator : public Allocator { std::shared_ptr CreateAllocatorWithChunk() { chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_)); auto* allocation = chunks_.back().get(); - std::shared_ptr allocator(new LockedAllocator( - std::shared_ptr(new BestFitAllocator(allocation)))); + std::unique_ptr allocator(new LockedAllocator( + std::unique_ptr(new BestFitAllocator(allocation)))); - allocator = WrapRetryAllocator(allocator, retry_time_); + if (retry_time_ > 0) { + auto* retry_allocator = + new RetryAllocator(std::move(allocator), retry_time_); + allocator.reset(retry_allocator); + } return std::make_shared>(std::move(allocator)); } @@ -197,23 +190,13 @@ class AllocatorFacadePrivate { ~AllocatorFacadePrivate() = default; AllocatorFacadePrivate() { - auto strategy = GetAllocatorStrategy(); - switch (strategy) { - case AllocatorStrategy::kLegacy: { - InitLegacyAllocator(); - break; - } - case AllocatorStrategy::kNaiveBestFit: { - InitCPUAllocator(); - InitCUDAAllocator(); - InitCUDAPinnedAllocator(); - WrapZeroSizeAllocator(); - break; - } - default: { - PADDLE_THROW("Unsupported allocator strategy: %d", - static_cast(strategy)); - } + if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) { + InitLegacyAllocator(); + } else { + InitCPUAllocator(); + InitCUDAAllocator(); + InitCUDAPinnedAllocator(); + WrapZeroSizeAllocator(); } } @@ -271,7 +254,8 @@ AllocatorFacade& AllocatorFacade::Instance() { std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size, Allocator::Attr attr) { - return std::shared_ptr(Alloc(place, size, attr)); + return std::shared_ptr(Alloc(place, size, attr).release(), + AllocationDeleter()); } AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc index fff94c01e709613603eea7150a08df3c2611dec2..8cebda9005b29b5b3259de0830c42eb10ef90e66 100644 --- a/paddle/fluid/memory/allocation/allocator_strategy.cc +++ b/paddle/fluid/memory/allocation/allocator_strategy.cc @@ -19,22 +19,16 @@ DEFINE_string( allocator_strategy, "legacy", "The allocation strategy. Legacy means the original allocator of Fluid." - "naive_best_fit means the experimental best fit allocator. " - "allocator. Enum in [legacy, naive_best_fit]."); + "New means the experimental allocators of Fluid. in [legacy, new]"); namespace paddle { namespace memory { namespace allocation { static AllocatorStrategy GetStrategyFromFlag() { - if (FLAGS_allocator_strategy == "legacy") { - return AllocatorStrategy::kLegacy; - } else if (FLAGS_allocator_strategy == "naive_best_fit") { - return AllocatorStrategy::kNaiveBestFit; - } else { - PADDLE_THROW("Unsupported allocator strategy: %s", - FLAGS_allocator_strategy); - } + return FLAGS_allocator_strategy == "legacy" + ? AllocatorStrategy::kLegacy + : AllocatorStrategy::kNaiveBestFit; } AllocatorStrategy GetAllocatorStrategy() { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index d87dd9a4b6df288065389a335a9ddb4047dd096a..e3d6c2f511ef083ef9ecc1fe8df96051b2b85cc2 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const { } return num; } -void BestFitAllocator::FreeImpl(Allocation* allocation) { +void BestFitAllocator::Free(Allocation* allocation) { auto* bf_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(bf_allocation, "The input allocation is not BestFitAllocation."); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index c137438c0c35a575d366a1dfdf950262f711defa..4f10f2b53e8543d4197097f1cae8de765bceeb0f 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -119,7 +119,7 @@ class BestFitAllocator : public Allocator { void InsertFreeNode(const ListIt& it); protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc index e04c0aa34b1cd6200806cc2a012161e3478eca0b..fc75abc9dfee6c9df5bc87faa493002cc1fe6298 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator.cc @@ -22,11 +22,11 @@ namespace paddle { namespace memory { namespace allocation { -BufferedAllocator::BufferedAllocator(std::shared_ptr allocator) +BufferedAllocator::BufferedAllocator(std::unique_ptr &&allocator) : underlying_allocator_(std::move(allocator)) { PADDLE_ENFORCE_NOT_NULL( underlying_allocator_, - "Underlying allocator of BufferedAllocator must not be null"); + "Underlying allocator of BufferedAllocator must be unmanaged"); if (underlying_allocator_->IsAllocThreadSafe()) { mtx_.reset(new std::mutex()); } @@ -41,19 +41,19 @@ void BufferedAllocator::FreeCache(size_t size) { while (!allocations_.empty()) { // free the largest auto it = --allocations_.end(); cur += it->second->size(); - underlying_allocator_->Free(it->second.release()); + delete it->second.release(); allocations_.erase(it); if (cur >= size) return; } } -bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; } - -void BufferedAllocator::FreeImpl(Allocation *allocation) { +bool BufferedAllocator::IsAllocThreadSafe() const { + return this->underlying_allocator_->IsAllocThreadSafe(); +} +void BufferedAllocator::Free(Allocation *allocation) { platform::LockGuardPtr guard(mtx_); allocations_.emplace(allocation->size(), AllocationPtr(allocation)); } - Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { { platform::LockGuardPtr guard(mtx_); @@ -61,15 +61,17 @@ Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { if (it != allocations_.end() && it->first < size * 2) { AllocationPtr result(std::move(it->second)); allocations_.erase(it); - return result.release(); + return new AllocationWithUnderlying(std::move(result)); } } try { - return underlying_allocator_->Allocate(size, attr).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); } catch (BadAlloc &) { FreeCache(size); - return underlying_allocator_->Allocate(size, attr).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); } } diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h index c728395705842d29a7b2a8441a7048a7e4bf5e6b..d44a3f85beba712b1e735ba14008689bce7d0d64 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.h +++ b/paddle/fluid/memory/allocation/buffered_allocator.h @@ -31,7 +31,7 @@ namespace allocation { // underlying_allocator_ class BufferedAllocator : public Allocator { public: - explicit BufferedAllocator(std::shared_ptr allocator); + explicit BufferedAllocator(std::unique_ptr &&allocator); ~BufferedAllocator(); @@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator { void FreeCache(size_t size); protected: - void FreeImpl(Allocation *allocation) override; + void Free(Allocation *allocation) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::shared_ptr underlying_allocator_; + std::unique_ptr underlying_allocator_; std::multimap allocations_; std::unique_ptr mtx_; }; diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc index 854a117b0e7532962d5e0c95fd947527ac3b307a..c8bd5292ca0f6c3e7ebdc7f5908523b0b7c8ba3a 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/memory/allocation/buffered_allocator.h" #include +#include #include #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" @@ -65,7 +66,7 @@ class StubAllocator : public Allocator { size_t GetFreeCount() const { return destruct_count_; } protected: - void FreeImpl(Allocation *allocation) override { + void Free(Allocation *allocation) override { auto *alloc = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(alloc); if (alloc->ptr()) delete[] static_cast(alloc->ptr()); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index 90c49c87a677aa38bce35774b3a7bb698e6f43e7..cc81a6f7b8b1950b07b6fb1571b53d9b5ddb1b9f 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -20,27 +20,25 @@ namespace paddle { namespace memory { namespace allocation { +CPUAllocation::CPUAllocation(void *ptr, size_t size) + : Allocation(ptr, size, platform::CPUPlace()) {} + bool CPUAllocator::IsAllocThreadSafe() const { return true; } -void CPUAllocator::FreeImpl(Allocation *allocation) { - void *p = allocation->ptr(); -#ifdef _WIN32 - _aligned_free(p); -#else - free(p); -#endif +void CPUAllocator::Free(Allocation *allocation) { + PADDLE_ENFORCE_NOT_NULL(dynamic_cast(allocation)); + free(allocation->ptr()); delete allocation; } Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { - void *p; -#ifdef _WIN32 - p = _aligned_malloc(size, kAlignment); -#else - PADDLE_ENFORCE_EQ(posix_memalign(&p, kAlignment, size), 0, "Alloc %ld error!", - size); -#endif - return new Allocation(p, size, platform::CPUPlace()); + void *ptr; + auto status = posix_memalign(&ptr, kAlignment, size); + if (UNLIKELY(status) != 0) { + throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. Errno is %d", + size, status)); + } + return new CPUAllocation(ptr, size); } } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 3eb1416b0efa9327f2052e1f128359bc93f94986..26d3643f4edff1f2d71b1c761e915a6dacb485ad 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -31,13 +31,19 @@ namespace allocation { // // NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import // an open-sourced allocator into Paddle. +class CPUAllocator; +class CPUAllocation : public Allocation { + public: + CPUAllocation(void* ptr, size_t size); +}; + class CPUAllocator : public Allocator { public: - constexpr static size_t kAlignment = 4096UL; + constexpr static size_t kAlignment = 64u; bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 895a24a6a2a6b8e399ec2ace48136d1ef16c62f6..430bf0be98e08787ac4412a8b6e0fcc310ffe2b4 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -23,14 +23,15 @@ namespace paddle { namespace memory { namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } -void CUDAAllocator::FreeImpl(Allocation* allocation) { +void CUDAAllocator::Free(Allocation* allocation) { platform::CUDADeviceGuard guard(place_.device); - PADDLE_ENFORCE_EQ(boost::get(allocation->place()), + auto* cuda_allocation = dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(cuda_allocation); + PADDLE_ENFORCE_EQ(boost::get(cuda_allocation->place()), place_); PADDLE_ENFORCE(cudaFree(allocation->ptr())); delete allocation; } - Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { platform::CUDADeviceGuard guard(place_.device); void* ptr; @@ -40,9 +41,8 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device, status, cudaGetErrorString(status))); } - return new Allocation(ptr, size, platform::Place(place_)); + return new CUDAAllocation(ptr, size, platform::Place(place_)); } - } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 580a2d1df1d5997a27180740393741ec8973bf18..63726f5820b1c81565117c7a9bf798c17c9681f6 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -20,6 +20,13 @@ namespace paddle { namespace memory { namespace allocation { +// CUDA System allocator and allocation. +// Just a flag type. +class CUDAAllocation : public Allocation { + public: + using Allocation::Allocation; +}; + class CUDAAllocator : public Allocator { public: explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {} @@ -28,7 +35,7 @@ class CUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 0dc2de37467b7e7d23c88b4a255c14795db4c275..514ac7883ad2effdf3518be8afe3f448a5ac10b2 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -134,22 +134,26 @@ size_t Used(const platform::CPUPlace &place) { } #ifdef PADDLE_WITH_CUDA -class GPUBuddyAllocatorList { - public: - GPUBuddyAllocatorList() - : allocators_(platform::GetCUDADeviceCount()), - flags_(platform::GetCUDADeviceCount()) { - allocation::GPUMemMonitor.Initialize(allocators_.size()); - } +BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { + static std::once_flag init_flag; + static detail::BuddyAllocator **a_arr = nullptr; + static std::vector devices; + + std::call_once(init_flag, [gpu_id]() { + devices = platform::GetSelectedDevices(); + int gpu_num = devices.size(); - BuddyAllocator *Get(size_t dev_id) { - PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id); - std::call_once(flags_[dev_id], [this, dev_id] { + allocation::GPUMemMonitor.Initialize(devices.size()); + + a_arr = new BuddyAllocator *[gpu_num]; + for (size_t i = 0; i < devices.size(); ++i) { + int dev_id = devices[i]; + a_arr[i] = nullptr; platform::SetDeviceId(dev_id); - allocators_[dev_id] = new BuddyAllocator( - std::unique_ptr( - new detail::GPUAllocator(dev_id)), - platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + a_arr[i] = new BuddyAllocator(std::unique_ptr( + new detail::GPUAllocator(dev_id)), + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " @@ -163,19 +167,13 @@ class GPUBuddyAllocatorList { << FLAGS_initial_gpu_memory_in_mb << ". Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; - }); - return allocators_[dev_id]; - } - - private: - std::vector allocators_; - std::vector flags_; -}; + } + }); -BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { - static GPUBuddyAllocatorList allocators; platform::SetDeviceId(gpu_id); - return allocators.Get(gpu_id); + auto pos = std::distance(devices.begin(), + std::find(devices.begin(), devices.end(), gpu_id)); + return a_arr[pos]; } #endif @@ -194,7 +192,7 @@ void *Alloc(const platform::CUDAPlace &place, #ifdef PADDLE_WITH_CUDA auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *ptr = buddy_allocator->Alloc(size); - if (ptr == nullptr && size > 0) { + if (ptr == nullptr) { int cur_dev = platform::GetCurrentDeviceId(); platform::SetDeviceId(place.device); size_t avail, total; @@ -349,7 +347,7 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { return tmp_alloc; } -void LegacyAllocator::FreeImpl(Allocation *allocation) { +void LegacyAllocator::Free(Allocation *allocation) { boost::apply_visitor( legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place()); diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h index 27cd42ea35012f07ae7db79c46d767138ddaafff..d9bdae153da6439598f76f5cac226897e6e0c596 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.h +++ b/paddle/fluid/memory/allocation/legacy_allocator.h @@ -73,7 +73,7 @@ class LegacyAllocator : public Allocator { protected: Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; - void FreeImpl(Allocation *allocation) override; + void Free(Allocation *allocation) override; private: platform::Place place_; diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc index c43099cc88f839ad92d36774d49aafd7192f916f..62d768c580607f32db8c49eb3d62f0f32c9dbeeb 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.cc +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -17,7 +17,6 @@ #include #include "paddle/fluid/memory/allocation/allocation_with_underlying.h" #include "paddle/fluid/platform/lock_guard_ptr.h" - namespace paddle { namespace memory { namespace allocation { @@ -25,24 +24,26 @@ namespace allocation { bool LockedAllocator::IsAllocThreadSafe() const { return true; } LockedAllocator::LockedAllocator( - std::shared_ptr underlying_allocator) + std::unique_ptr &&underlying_allocator) : underlying_allocator_(std::move(underlying_allocator)) { PADDLE_ENFORCE_NOT_NULL(underlying_allocator_); if (!underlying_allocator_->IsAllocThreadSafe()) { mtx_.reset(new std::mutex()); } } - -void LockedAllocator::FreeImpl(Allocation *allocation) { - platform::LockGuardPtr guard(mtx_); - underlying_allocator_->Free(allocation); +void LockedAllocator::Free(Allocation *allocation) { + { + platform::LockGuardPtr guard(mtx_); + reinterpret_cast(allocation) + ->allocation_.reset(); // Destroy inner allocation + } + delete allocation; } - Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { platform::LockGuardPtr guard(mtx_); - return underlying_allocator_->Allocate(size, attr).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); } - } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h index b735ccef101417b3f880eb6dcdd9964cffbe875c..4967b9bb8d3ad101cff4657b0a45b49b76e2deb2 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.h +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -24,15 +24,15 @@ namespace allocation { // A allocator to make underlying allocator thread safe. class LockedAllocator : public Allocator { public: - explicit LockedAllocator(std::shared_ptr underlying_allocator); + explicit LockedAllocator(std::unique_ptr &&underlying_allocator); bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; + void Free(Allocation *allocation) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::shared_ptr underlying_allocator_; + std::unique_ptr underlying_allocator_; std::unique_ptr mtx_; }; diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc deleted file mode 100644 index 3334589a4beb407447cf89c173f6128654bb245a..0000000000000000000000000000000000000000 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include -#include -#include "paddle/fluid/memory/allocation/allocator_facade.h" - -#ifdef PADDLE_WITH_CUDA -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_double(fraction_of_cuda_pinned_memory_to_use); -DECLARE_int64(gpu_allocator_retry_time); -#endif - -DECLARE_string(allocator_strategy); - -namespace paddle { -namespace memory { -namespace allocation { - -TEST(allocator, allocator) { -#ifdef PADDLE_WITH_CUDA - FLAGS_fraction_of_gpu_memory_to_use = 0.01; - FLAGS_gpu_allocator_retry_time = 500; - FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; -#endif - - FLAGS_allocator_strategy = "naive_best_fit"; - - auto &instance = AllocatorFacade::Instance(); - platform::Place place; - size_t size = 1024; - - { - place = platform::CPUPlace(); - size = 1024; - auto cpu_allocation = instance.Alloc(place, size); - ASSERT_NE(cpu_allocation, nullptr); - ASSERT_NE(cpu_allocation->ptr(), nullptr); - ASSERT_EQ(cpu_allocation->place(), place); - ASSERT_EQ(cpu_allocation->size(), size); - } - -#ifdef PADDLE_WITH_CUDA - { - place = platform::CUDAPlace(0); - size = 1024; - auto gpu_allocation = instance.Alloc(place, size); - ASSERT_NE(gpu_allocation, nullptr); - ASSERT_NE(gpu_allocation->ptr(), nullptr); - ASSERT_EQ(gpu_allocation->place(), place); - ASSERT_GE(gpu_allocation->size(), size); - } - - { - // Allocate 2GB gpu memory - place = platform::CUDAPlace(0); - size = 2 * static_cast(1 << 30); - auto gpu_allocation = instance.Alloc(place, size); - ASSERT_NE(gpu_allocation, nullptr); - ASSERT_NE(gpu_allocation->ptr(), nullptr); - ASSERT_EQ(gpu_allocation->place(), place); - ASSERT_GE(gpu_allocation->size(), size); - } - - { - place = platform::CUDAPinnedPlace(); - size = (1 << 20); - auto cuda_pinned_allocation = - instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20); - ASSERT_NE(cuda_pinned_allocation, nullptr); - ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr); - ASSERT_EQ(cuda_pinned_allocation->place(), place); - ASSERT_GE(cuda_pinned_allocation->size(), size); - } -#endif -} - -} // namespace allocation -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 5a3d817211750d3e19e65344d1eab5a96800c674..de81d12cca6ca280289371abdec225c9e2b8f4d0 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -20,15 +20,20 @@ namespace paddle { namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } -void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void CPUPinnedAllocator::Free(Allocation *allocation) { + PADDLE_ENFORCE_NOT_NULL(dynamic_cast(allocation)); PADDLE_ENFORCE(cudaFreeHost(allocation->ptr())); delete allocation; } Allocation *CPUPinnedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { + // PADDLE_ENFORCE_EQ( + // attr, kCrossDevice, + // "CPUPinnedAllocator should be used for Cross-Device Communication"); + void *ptr; PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); - return new Allocation(ptr, size, platform::CUDAPinnedPlace()); + return new CPUPinnedAllocation(ptr, size); } } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index deeb55a8fb0396a312286f5c2692114e9e4afc8d..42d0938f2afbb1efca8bfdd7035bc0eada30f06b 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -20,12 +20,18 @@ namespace memory { namespace allocation { // Allocator uses `cudaHostAlloc` +class CPUPinnedAllocation : public Allocation { + public: + CPUPinnedAllocation(void *ptr, size_t size) + : Allocation(ptr, size, platform::CUDAPinnedPlace()) {} +}; + class CPUPinnedAllocator : public Allocator { public: bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; + void Free(Allocation *allocation) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; }; diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 7e888988f9602e362d73f64c1b45552e84e3349c..981705051b449e6a35c2dcce9138dc2efae52920 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -18,15 +18,25 @@ namespace paddle { namespace memory { namespace allocation { -void RetryAllocator::FreeImpl(Allocation* allocation) { +bool RetryAllocator::IsAllocThreadSafe() const { + return underlying_allocator_->IsAllocThreadSafe(); +} + +void RetryAllocator::Free(Allocation* allocation) { // Delete underlying allocation first. - underlying_allocator_->Free(allocation); - cv_.notify_all(); + reinterpret_cast(allocation)->allocation_.reset(); + { + // notify all waited allocators, they can try to allocate memory after free. + std::lock_guard lock(mutex_); + cv_.notify_all(); + } + delete allocation; } Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { auto alloc_func = [&, this]() { - return underlying_allocator_->Allocate(size, attr).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); }; // In fact, we can unify the code of allocation success and failure // But it would add lock even when allocation success at the first time diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 379f576d6e1ed8f256a0233b203423a487ee73e4..6ab8ca8fbec0077b2c95cf727731ca0095716197 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -25,25 +25,32 @@ namespace paddle { namespace memory { namespace allocation { +class RetryAllocator; + class RetryAllocator : public Allocator { public: - RetryAllocator(std::shared_ptr allocator, size_t retry_ms) + RetryAllocator(std::unique_ptr&& allocator, size_t retry_ms) : underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) { + EnforceCheck(); + } + + bool IsAllocThreadSafe() const override; + + private: + void EnforceCheck() { PADDLE_ENFORCE_NOT_NULL( - underlying_allocator_, - "UnderlyingAllocator of RetryAllocator must not be null"); + underlying_allocator_.get(), + "UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator"); PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(), "UnderlyingAllocator of RetryAllocator must be thread-safe"); } - bool IsAllocThreadSafe() const override { return true; } - protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::shared_ptr underlying_allocator_; + std::unique_ptr underlying_allocator_; std::chrono::milliseconds retry_time_; std::mutex mutex_; std::condition_variable cv_; @@ -51,6 +58,8 @@ class RetryAllocator : public Allocator { // For debug, We can add an atomic integer to record how many memory sizes are // waited to allocate // std::atomic waited_allocate_size_{0}; + + friend class RetryAllocation; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.cc b/paddle/fluid/memory/allocation/zero_size_allocator.cc index 39743bcb10c700c9a8446b9040c8a8707d57ec7d..cb2df1a029815478bbc9d3b09425f3ef145c5fb3 100644 --- a/paddle/fluid/memory/allocation/zero_size_allocator.cc +++ b/paddle/fluid/memory/allocation/zero_size_allocator.cc @@ -24,20 +24,11 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const { Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { if (size == 0) { - return new Allocation(nullptr, 0, place_); + return new ZeroSizeAllocation(place_); } else { return underlying_allocator_->Allocate(size, attr).release(); } } - -void ZeroSizeAllocator::FreeImpl(Allocation *allocation) { - if (allocation->size() == 0) { - delete allocation; - } else { - underlying_allocator_->Free(allocation); - } -} - } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.h b/paddle/fluid/memory/allocation/zero_size_allocator.h index 08a7a06dbf290b55994a407fe478f792b0c0964a..0f01dfcdf5b1179c52d8c0204b655cab10770d95 100644 --- a/paddle/fluid/memory/allocation/zero_size_allocator.h +++ b/paddle/fluid/memory/allocation/zero_size_allocator.h @@ -24,6 +24,12 @@ namespace allocation { // The allocator handles the request's size is zero. Allocator will always // return an allocation even the request size is zero. However, the // allocation.ptr() is nullptr +class ZeroSizeAllocation : public Allocation { + public: + explicit ZeroSizeAllocation(const platform::Place& p) + : Allocation(nullptr, 0, p) {} +}; + class ZeroSizeAllocator : public Allocator { public: ZeroSizeAllocator(std::shared_ptr underlying_allocator, @@ -34,7 +40,6 @@ class ZeroSizeAllocator : public Allocator { protected: Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; - void FreeImpl(Allocation* allocation) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/operators/alloc_continuous_space_op.cc b/paddle/fluid/operators/alloc_continuous_space_op.cc index df0e9911cf7186e952cfd7fbf7f43889e9098c84..d4bdecff62c016a31011266a0f066076d85fcdef 100644 --- a/paddle/fluid/operators/alloc_continuous_space_op.cc +++ b/paddle/fluid/operators/alloc_continuous_space_op.cc @@ -65,7 +65,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { // Get numel and dtype size_t numel = 0; auto dtype = kDefaultDtype; - GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype); + GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype, + context.GetPlace()); // Alloc the continuous space auto fused_tensor = context.Output("FusedOutput"); @@ -74,14 +75,18 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { // Init the continuous space auto out_tensors = context.MultiOutput("Output"); - int64_t offset = 0; + size_t offset = 0; + size_t size_of_dtype = framework::SizeOfType(dtype); if (context.Attr("copy_data")) { for (size_t i = 0; i < in_var_names.size(); ++i) { - int64_t len = out_tensors[i]->numel(); - auto sub_tensor = fused_tensor->Slice(offset, offset + len); - offset += len; - framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx, + size_t len = static_cast(in_tensors[i]->numel()); + auto sub_tensor = fused_tensor->Slice( + static_cast(offset), static_cast(offset + len)); + framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor); + + offset += + Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype; } } else if (context.Attr("set_constant")) { math::SetConstant set_constant; @@ -92,11 +97,13 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { // Make the outputs point to the continuous space. offset = 0; for (size_t i = 0; i < out_tensors.size(); ++i) { - int64_t len = out_tensors[i]->numel(); + size_t len = static_cast(out_tensors[i]->numel()); auto dim = out_tensors[i]->dims(); out_tensors[i] - ->ShareDataWith(fused_tensor->Slice(offset, offset + len)) + ->ShareDataWith(fused_tensor->Slice( + static_cast(offset), static_cast(offset + len))) .Resize(dim); + len = Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype; offset += len; VLOG(10) << "alloc_space_for_vars: output(" << out_var_names[i] << ") ,dim:(" << dim << ")" @@ -104,12 +111,28 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { } } + private: + // Note(zcd): Addresses should be aligned, otherwise, the results may have + // diff. + size_t Alignment(size_t size, const platform::Place &place) const { + // Allow to allocate the minimum chunk size is 4 KB. + size_t alignment = 1 << 12; + if (platform::is_gpu_place(place)) { + // Allow to allocate the minimum chunk size is 256 B. + alignment = 1 << 8; + } + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); + } + void GetMemSizeAndDtype( const std::vector &lod_tensors, const std::vector var_names, size_t *numel, - framework::proto::VarType::Type *dtype) const { + framework::proto::VarType::Type *dtype, + const platform::Place &place) const { PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size()); *numel = 0; + size_t size_of_dtype = 0; for (size_t i = 0; i < var_names.size(); ++i) { PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.", var_names[i]); @@ -119,6 +142,7 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.", var_names[i], kDefaultDtype); *dtype = p_dtype; + size_of_dtype = framework::SizeOfType(p_dtype); } PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal."); @@ -126,7 +150,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { PADDLE_ENFORCE_GT(size, 0); VLOG(10) << "alloc_space_for_vars: input(" << var_names[i] << ") ,dim:(" << lod_tensors[i]->dims() << ")"; - *numel += size; + *numel += Alignment(static_cast(size) * size_of_dtype, place) / + size_of_dtype; } } }; diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index f349c51d8a99aaab43a15580a8904d4e4fd0d9b7..b2dbaecfcfd67cc679d02e22d4e89cfedeeba80c 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/bpr_loss_op.h" +#include namespace paddle { namespace operators { @@ -127,6 +128,23 @@ neural networks>(https://arxiv.org/abs/1511.06939) )DOC"); } }; + +class BprLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("bpr_loss_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; } // namespace operators } // namespace paddle @@ -134,7 +152,7 @@ namespace ops = paddle::operators; using CPUCtx = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::BprLossGradDescMaker); REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp); REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel, ops::BprLossOpKernel); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index a97828e6fe9cf3ed963da3c784a975f61ecec4a5..5b84221cfa5902d01540a06c6bc61fe9eac986f0 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -568,13 +569,31 @@ class ROIPerspectiveTransformOpMaker } }; +class ROIPerspectiveTransformGradDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("roi_perspective_transform_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(roi_perspective_transform, ops::ROIPerspectiveTransformOp, ops::ROIPerspectiveTransformOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ROIPerspectiveTransformGradDescMaker); REGISTER_OPERATOR(roi_perspective_transform_grad, ops::ROIPerspectiveTransformGradOp); REGISTER_OP_CPU_KERNEL(roi_perspective_transform, diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 7aaa607f1585c98fe2dd816e8d66e5c6fd171e80..6a6741d8fc54d22addca91b75dfabf5950c1a35a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -77,7 +77,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { } else { functor.RunMidWise(n, pre, post); } - z->set_mkldnn_prim_desc(x->get_mkldnn_prim_desc()); + z->set_layout(DataLayout::kMKLDNN); + z->set_format(x->format()); } else { PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && x->format() != memory::format::format_undef, @@ -115,8 +116,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd); // create mkldnn memory for dst - auto dst_mem_pd = sum_pd.dst_primitive_desc(); - memory dst_memory = memory(dst_mem_pd, z_data); + memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data); std::vector inputs; inputs.push_back(srcs[0]); @@ -129,7 +129,9 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { pipeline.push_back(sum_prim); stream(stream::kind::eager).submit(pipeline).wait(); - z->set_mkldnn_prim_desc(dst_mem_pd); + z->set_layout(DataLayout::kMKLDNN); + z->set_format( + (memory::format)dst_memory.get_primitive_desc().desc().data.format); } } }; @@ -150,19 +152,24 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { auto* out = dout; auto *x = dout, *y = dout; + auto set_mkldnn_format = [](Tensor* in, const Tensor* out) { + in->set_layout(DataLayout::kMKLDNN); + in->set_format(out->format()); + }; + if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) { if (dx->dims() == dy->dims()) { auto blas = math::GetBlas(ctx); if (dx) { blas.VCOPY(dout->numel(), dout->data(), dx->mutable_data(ctx.GetPlace())); - dx->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc()); + set_mkldnn_format(dx, dout); } if (dy) { blas.VCOPY(dout->numel(), dout->data(), dy->mutable_data(ctx.GetPlace())); - dy->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc()); + set_mkldnn_format(dy, dout); } } } else { diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc index 4a974281481c8bc02589b428098475d73b8a0ba5..98ebe1fdf4bb3308b2f07a073072031e79e14146 100644 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc @@ -65,11 +65,17 @@ by input arguments. } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + GaussianRandomBatchSizeLikeNoNeedBufferVarsInference, "Input"); + } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT( +REGISTER_OPERATOR( gaussian_random_batch_size_like, paddle::operators::GaussianRandomBatchSizeLikeOp, - paddle::operators::GaussianRandomBatchSizeLikeOpMaker); + paddle::operators::GaussianRandomBatchSizeLikeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::GaussianRandomBatchSizeLikeNoNeedBufferVarsInference); + // Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 8efd43928aac994c7630a213f6724e8f50abc7e0..44fd95edef253b814a166f724ca67fcafe979b99 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/im2sequence_op.h" +#include #include #include @@ -146,12 +147,28 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel { } }; +class Im2SequenceGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("im2sequence_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::Im2SequenceGradDescMaker); REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp); REGISTER_OP_CPU_KERNEL( im2sequence, diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 10d01af982d01800bdd2d5d59761cfb09e2a8139..edee8c08d070742d54f761083592466658a445c9 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -10,6 +10,7 @@ limitations under the License. */ #include "paddle/fluid/operators/interpolate_op.h" +#include #include #include #include "paddle/fluid/framework/op_registry.h" @@ -194,21 +195,46 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } +}; + +class InterpolateGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType(ForwardOp().Type() + "_grad"); + op->SetInput("X", Input("X")); + if (ForwardOp().Inputs().count("OutSize") > 0) { + op->SetInput("OutSize", Input("OutSize")); + } + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(InterpolateGradNoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad); + ops::InterpolateGradDescMaker); +REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad, + ops::InterpolateGradNoNeedBufferVarsInference); REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad); + ops::InterpolateGradDescMaker); +REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad, + ops::InterpolateGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel, ops::InterpolateKernel, ops::InterpolateKernel); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index bc115090acb473ac3175999ca96c5e00c0aeaeae..2696d0bef9e322fce1251984c9e0f5b7429eeea8 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/l1_norm_op.h" +#include namespace paddle { namespace operators { @@ -62,12 +63,28 @@ $$Out = \sum{|X|}$$ } }; +class L1NormGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("l1_norm_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::L1NormGradDescMaker); REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp); REGISTER_OP_CPU_KERNEL( l1_norm, ops::L1NormKernel); diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index da59bd53bce010d0d6ad2ab14acaffb9cc2f99e6..6d0af573184b10a783f9c5802d1db3630eb55538 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/label_smooth_op.h" +#include #include namespace paddle { @@ -105,10 +106,23 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) shouldn't be null."); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); + } +}; + +class LabelSmoothGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("label_smooth_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; } }; @@ -117,7 +131,7 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::LabelSmoothGradDescMaker); REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp); REGISTER_OP_CPU_KERNEL( label_smooth, diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index e17b6cb59898524d793f3cc78a09232f5b664617..fa09cb61e64aacd2aebf1ecf9826a15f9dcef877 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/linear_chain_crf_op.h" +#include namespace paddle { namespace operators { @@ -250,14 +251,46 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { } }; +class LinearChainCRFGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("linear_chain_crf_grad"); + op->SetAttrMap(Attrs()); + + op->SetInput("Emission", Input("Emission")); + op->SetInput("Transition", Input("Transition")); + op->SetInput("Label", Input("Label")); + + op->SetInput("Alpha", Output("Alpha")); + op->SetInput("EmissionExps", Output("EmissionExps")); + op->SetInput("TransitionExps", Output("TransitionExps")); + + op->SetInput(framework::GradVarName("LogLikelihood"), + OutputGrad("LogLikelihood")); + + op->SetOutput(framework::GradVarName("Emission"), InputGrad("Emission")); + op->SetOutput(framework::GradVarName("Transition"), + InputGrad("Transition")); + + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + LinearChainCRFGradNoNeedBufferVarsInference, "Transition", "Emission"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp, - ops::LinearChainCRFOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp); + ops::LinearChainCRFOpMaker, ops::LinearChainCRFGradDescMaker); +REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp, + ops::LinearChainCRFGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( linear_chain_crf, ops::LinearChainCRFOpKernel, diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index ef1fb83aa6e34c14637b6e761fd7d2dbadee36b8..e8850a1e582dc5c0a9ad64d26ba9b824349ee4e3 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/log_loss_op.h" +#include namespace paddle { namespace operators { @@ -100,12 +101,29 @@ class LogLossGradOp : public framework::OperatorWithKernel { } }; +class LogLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("log_loss_grad"); + op->SetInput("Predicted", Input("Predicted")); + op->SetInput("Labels", Input("Labels")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + op->SetOutput(framework::GradVarName("Predicted"), InputGrad("Predicted")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::LogLossGradDescMaker); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); REGISTER_OP_CPU_KERNEL( log_loss, ops::LogLossKernel); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 4a199d681f328318401e3aec9457d59b959a9e0c..52e4e8be28746d42ebbda9a5148a9495d0d80c6a 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lstm_op.h" +#include #include namespace paddle { @@ -264,12 +265,51 @@ class LSTMGradOp : public framework::OperatorWithKernel { } }; +class LSTMGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("lstm_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("Input", Input("Input")); + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + + if (ForwardOp().Inputs().count("H0") > 0) { + op->SetInput("H0", Input("H0")); + op->SetOutput(framework::GradVarName("H0"), InputGrad("H0")); + } + + if (ForwardOp().Inputs().count("C0") > 0) { + op->SetInput("C0", Input("C0")); + op->SetOutput(framework::GradVarName("C0"), InputGrad("C0")); + } + + op->SetInput("Weight", Input("Weight")); + op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight")); + + op->SetInput("Bias", Input("Bias")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + + op->SetInput("Cell", Output("Cell")); + + op->SetInput("Hidden", Output("Hidden")); + op->SetInput(framework::GradVarName("Hidden"), OutputGrad("Hidden")); + + op->SetInput("BatchGate", Output("BatchGate")); + op->SetInput("BatchCellPreAct", Output("BatchCellPreAct")); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::LSTMGradOpDescMaker); REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp); REGISTER_OP_CPU_KERNEL( lstm, ops::LSTMKernel, diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc index b643ba9d7fa61d758e871ebe7a463c22e937fa2c..fca3532551730a39bda7cfad60151de97ef881de 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/margin_rank_loss_op.h" +#include namespace paddle { namespace operators { @@ -94,8 +95,6 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("Activated"), @@ -106,13 +105,31 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel { } }; +class MarginRankLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("margin_rank_loss_grad"); + op->SetInput("Activated", Output("Activated")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Label", Input("Label")); + op->SetOutput(framework::GradVarName("X1"), InputGrad("X1")); + op->SetOutput(framework::GradVarName("X2"), InputGrad("X2")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp, ops::MarginRankLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::MarginRankLossGradDescMaker); REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp); REGISTER_OP_CPU_KERNEL( margin_rank_loss, diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 35b6d7b5e3b16ced845a9dca619539d7753c55e6..2b2f8450768b9885381f10b19631a6a200c7f703 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/mean_op.h" +#include #include +#include + namespace paddle { namespace operators { @@ -61,7 +64,8 @@ class MeanGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto input_data_type = ctx.Input("X")->type(); + auto input_data_type = + ctx.Input(framework::GradVarName("Out"))->type(); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -81,13 +85,16 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(MeanGradNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType, ops::MeanGradMaker); -REGISTER_OPERATOR(mean_grad, ops::MeanGradOp); +REGISTER_OPERATOR(mean_grad, ops::MeanGradOp, + ops::MeanGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( mean, ops::MeanKernel, ops::MeanKernel); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 43559940d925e6fff29f0c5c66ec1a3dc717aaf4..5b7505f3c4acdef94fead04efd00b47825274117 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -96,7 +96,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx, std::vector src_tz = framework::vectorize2int(x->dims()); - auto src_format = x->format(); + auto src_format = + src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format(); const std::string key = gethash(src_tz, algorithm); const std::string key_src_data = @@ -126,8 +127,10 @@ void eltwise_forward(const framework::ExecutionContext &ctx, if (p_fwd == nullptr) { // create mkldnn memory for input X + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), src_format); auto src_memory = std::shared_ptr( - new memory(x->get_mkldnn_prim_desc(), to_void_cast(x_data))); + new memory({src_md, mkldnn_engine}, to_void_cast(x_data))); // save src_memory to be referred in backward path dev_ctx.SetBlob(key_src_mem, src_memory); @@ -174,7 +177,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx, pipeline.push_back(*p_fwd); stream(stream::kind::eager).submit(pipeline).wait(); - y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc()); + y->set_layout(DataLayout::kMKLDNN); + y->set_format(GetMKLDNNFormat(*dst_memory)); } template @@ -192,6 +196,9 @@ void eltwise_grad(const framework::ExecutionContext &ctx, std::vector diff_dst_tz = framework::vectorize2int(diff_y->dims()); + auto diff_y_format = + diff_dst_tz.size() == 2 ? mkldnn::memory::format::nc : diff_y->format(); + const std::string key = gethash(diff_dst_tz, algorithm); const std::string key_src_data = key + ctx.op().Input("Out") + "@eltwise_fwd_src_data"; @@ -203,8 +210,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx, key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem"; const std::string key_fwd_pd = key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd"; - const std::string key_with_layouts = key + std::to_string(*p_src_layout) + - "-" + std::to_string(diff_y->format()); + const std::string key_with_layouts = + key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format); const std::string key_diff_src_mem = key_with_layouts + "@eltwise_diff_src_mem"; const std::string key_diff_dst_mem = @@ -227,8 +234,10 @@ void eltwise_grad(const framework::ExecutionContext &ctx, if (p_grad == nullptr) { // create mkldnn memory for input diff_y + auto diff_dst_md = platform::MKLDNNMemDesc( + diff_dst_tz, platform::MKLDNNGetDataType(), diff_y_format); auto diff_dst_memory = std::shared_ptr( - new memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data))); + new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data))); dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory); // retrieve eltwise primitive desc from device context @@ -272,7 +281,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx, pipeline.push_back(*p_grad); stream(stream::kind::eager).submit(pipeline).wait(); - diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format(GetMKLDNNFormat(*diff_src_memory)); } template diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index 04e45d4853907bb7d6b5ce362892a2183fd4b60e..bddca232e6c8a2a7fde998877006e37ee6d3d0dc 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -206,14 +206,17 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor + mkldnn::memory::format input_format = + platform::MKLDNNFormatForSize(src_tz.size(), x->format()); // keys for backward pass const std::string key = BatchNormMKLDNNHandler::GetHash( - src_tz, epsilon, flags, global_stats, x->format(), + src_tz, epsilon, flags, global_stats, input_format, ctx.op().Output("SavedMean")); const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; - auto user_src_md = x->get_mkldnn_prim_desc().desc(); + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input_format); // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; @@ -227,8 +230,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine, key); - auto src_memory = handler.AcquireSrcMemory(x->get_mkldnn_prim_desc(), - to_void_cast(x_data)); + auto src_memory = + handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data)); // crate mkldnn memory for weights(scale/shift) auto scaleshift_memory = @@ -262,7 +265,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { variance_memory, false); } - y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc()); + y->set_layout(DataLayout::kMKLDNN); + y->set_format(platform::GetMKLDNNFormat(*dst_memory)); std::vector pipeline; pipeline.push_back(*batch_norm_p); @@ -332,6 +336,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { using bn_bwd_types = bn_type_traits; + mkldnn::memory::format dst_format = + platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); + mkldnn::memory::format input_format = platform::MKLDNNFormatForSize(src_tz.size(), x->format()); @@ -339,14 +346,14 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { // keys from forward pass const std::string key = BatchNormMKLDNNHandler::GetHash( - src_tz, epsilon, flags, false, x->format(), + src_tz, epsilon, flags, false, input_format, ctx.op().Input("SavedMean")); const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; // keys for primitives reuse const std::string key_with_hash = key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false, - x->format()); + input_format); const std::string key_batch_norm_bwd_p = key_with_hash + "@batch_norm_bwd_p"; const std::string key_batch_norm_src_mem_p = @@ -366,8 +373,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { primitive reorder_diff_dst; bool is_diff_dst_reordered = false; - auto user_diff_dst_memory = - memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data)); + auto user_diff_dst_memory = memory( + {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, + to_void_cast(diff_y_data)); // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; @@ -451,7 +459,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory); // set layout/format of output tensors - diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format); } else { // primitives already exist UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data)); @@ -476,7 +487,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { } // set layout/format of output tensors - diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format); } // execute optional reorder and batch_norm backward primitive diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 97387af92ffbd123ae6e795f17ef2273dadeab9d..50fe2e6e4c5a5e3e0ed1d9a9827e75094454c2fc 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -210,7 +210,8 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { stream(stream::kind::eager).submit({*concat_p}).wait(); - output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc()); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetDstMemFormat(*concat_pd)); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 8d96ae7e4215c2488564322e1dda46a81b46a665..5e4d79f1c35af42f662711ae9d8bfc650bab2b4f 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -96,8 +96,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; auto* output = ctx.Output("Output"); - PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN); - PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN); + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, @@ -144,19 +148,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector pipeline; - // For convolution with groups we need to recreate primitive descriptor - // as Paddle tensor is not having group dims while mkldnn treats - // group as another dimensions - mkldnn::memory::primitive_desc user_weights_mpd = - filter->get_mkldnn_prim_desc(); - if (g > 1) { - mkldnn::memory::format weights_format = - GetWeightsFormat(filter->format(), g, is_conv3d); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), weights_format); - user_weights_mpd = - mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine); - } + auto src_format = input->format(); + mkldnn::memory::format weights_format = + GetWeightsFormat(filter->format(), g, is_conv3d); + + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), src_format); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), weights_format); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -166,7 +165,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - mkldnn::memory::format weights_format = mkldnn::memory::format::any; + weights_format = mkldnn::memory::format::any; // Check the format for user's special output if (chosen_memory_format != mkldnn::memory::format::any) { if (is_conv3d) { @@ -206,10 +205,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); // create mkldnn memory from input tensors (data/weights) - auto user_src_memory_p = handler.AcquireSrcMemory( - input->get_mkldnn_prim_desc(), to_void_cast(input_data)); + auto user_src_memory_p = + handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); auto user_weights_memory_p = handler.AcquireWeightsMemory( - user_weights_mpd, to_void_cast(filter_data)); + user_weights_md, to_void_cast(filter_data)); // create reorder primitive if the input format is not the preferred one auto src_memory_p = @@ -282,7 +281,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*conv_p); stream(stream::kind::eager).submit(pipeline).wait(); - output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc()); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); } void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr("is_test"); @@ -948,8 +948,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // push primitive to stream and wait until it's executed pipeline.push_back(*conv_bwd_weights_p); - auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc(); - filter_grad->set_mkldnn_prim_desc(filter_grad_mpd); + filter_grad->set_layout(DataLayout::kMKLDNN); + filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p)); } if (input_grad) { @@ -972,7 +972,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*conv_bwd_data_p); - input_grad->set_mkldnn_prim_desc(diff_src_memory_p->get_primitive_desc()); + input_grad->set_layout(DataLayout::kMKLDNN); + input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); } stream(stream::kind::eager).submit(pipeline).wait(); } diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 79a0c5c7683d677daeb4feea10deab86407f944c..317d4cebe26b81ff03c212e6328233d5152ed1b4 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -221,7 +221,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*conv_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc()); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } private: diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index d01e8dbf4ce0c92bb81fc76df68d5424f9da0717..76b00b396c1349eff5db1059268e7cf280a8fc64 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -42,12 +42,8 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { // The format of output is set as the mkldnn's format // TODO(@mozga-intel) The format of matrix sets inside the another layers. - // TODO(jczaja): Remove this hack after checking performance on block layout - - auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(tensor->dims()), - mkldnn::memory::format::oihw); - tensor->set_mkldnn_prim_desc(tensor_mem_pd); + tensor->set_layout(DataLayout::kMKLDNN); + tensor->set_format(mkldnn::memory::format::oihw); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 4ff27ab12280b56abdf72056fe69ec713f2f2f46..097ba01d401dbc7969e30f576cac2567c874ed99 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -81,7 +81,10 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto e_mid = framework::EigenTensor::From(*mid); e_mid = e_mid.constant(k); - auto src_md = x->get_mkldnn_prim_desc().desc(); + auto dims = paddle::framework::vectorize2int(x->dims()); + + auto src_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, x->format()); auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, mkldnn::lrn_across_channels, @@ -91,7 +94,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { beta, k}; - auto src_memory_pd = x->get_mkldnn_prim_desc(); + auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; if (!is_test) { const std::string key = ctx.op().Output("Out"); @@ -108,15 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory->set_data_handle( static_cast(const_cast(input_data))); - auto dst_memory_pd = forward_pd->dst_primitive_desc(); - auto dst_memory = - mkldnn::memory(dst_memory_pd, static_cast(output_data)); + auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(), + static_cast(output_data)); auto workspace_memory = insert_to_context( key_workspace_memory, dev_ctx, forward_pd->workspace_primitive_desc()); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); - out->set_mkldnn_prim_desc(dst_memory_pd); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } else { auto forward_pd = mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; @@ -124,12 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory_pd, static_cast(const_cast(input_data))}; auto workspace_memory = mkldnn::memory{forward_pd.workspace_primitive_desc()}; - auto dst_memory_pd = forward_pd.dst_primitive_desc(); auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(), static_cast(output_data)); run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); - out->set_mkldnn_prim_desc(dst_memory_pd); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } } }; diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 0ce552219458859e147ba207c94270bf84a1fe75..dc1176f0848b93dd6872f676c3a71dab4f3455fd 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -158,14 +158,6 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { auto softmax_p = handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p); - // We cannot use softmax_dst_memory_p to get prim desc as - // it contains flattened dims (2D) while output tensor can - // have 2,3,4+ dims - auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(output->dims()), - mkldnn::memory::format::blocked); - output->set_mkldnn_prim_desc(output_mem_pd); - std::vector pipeline{ *(static_cast(softmax_p.get()))}; stream(stream::kind::eager).submit(pipeline).wait(); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index aef5b7d4311adfedb3db157f17506c3a2c76fbf6..6f64157b64e2f6247db8b49dc94cd10bfb6e861f 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -106,12 +106,12 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { memory::desc(dst_tz, memory::data_type::f32, memory::format::any); auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd); - auto dst_mem_pd = sum_pd.dst_primitive_desc(); + std::shared_ptr dst_mem; if (in_place) { - dst_mem.reset(new memory(dst_mem_pd)); + dst_mem.reset(new memory(sum_pd.dst_primitive_desc())); } else { - dst_mem.reset(new memory(dst_mem_pd, output_data)); + dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data)); } std::vector inputs; for (size_t i = 0; i < srcs_mem.size(); ++i) { @@ -136,7 +136,8 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { if (in_place) pipeline.push_back(reorder_prim); stream(stream::kind::eager).submit(pipeline).wait(); - output->set_mkldnn_prim_desc(dst_mem_pd); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(output_format); } else { // Fallback to naive version // TODO(@mozga-intel) Add MKLDNN SelectedRows & LoDTensorArray support SumKernel reference_kernel; diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index 4debc7ca5ec90d6cc781d10e817e9ed8650f12aa..95cee806ac451235a8fb03567e6057e10aa56427 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn_engine, key); auto transpose_src_memory_p = handler.AcquireSrcMemory( - input->get_mkldnn_prim_desc(), platform::to_void_cast(input_data)); + input->format(), platform::to_void_cast(input_data)); auto transpose_dst_memory_p = handler.AcquireDstMemory(output, ctx.GetPlace()); auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, @@ -62,14 +62,8 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*transpose_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - // Transpose did change logical dimensions of Tensor, but reorder does not. - // Reorder does change only physical layout eg. format , strides - // so we need to create new primitive descriptor with changed logical layout - // so it match output shape - auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(output->dims()), - mkldnn::memory::format::blocked); - output->set_mkldnn_prim_desc(output_mem_pd); + output->set_layout(DataLayout::kNCHW); + output->set_format(mkldnn::memory::format::format_undef); } }; @@ -134,9 +128,8 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, mkldnn_engine, key); - auto transpose_src_memory_p = - handler.AcquireSrcMemory(out_grad->get_mkldnn_prim_desc(), - platform::to_void_cast(out_grad_data)); + auto transpose_src_memory_p = handler.AcquireSrcMemory( + out_grad->format(), platform::to_void_cast(out_grad_data)); auto transpose_dst_memory_p = handler.AcquireDstMemory(x_grad, ctx.GetPlace()); auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, @@ -145,15 +138,6 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector pipeline; pipeline.push_back(*transpose_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - - // Transpose did change logical dimensions of Tensor, but reorder does not. - // Reorder does change only physical layout eg. format , strides - // so we need to create new primitive descriptor with changed logical layout - // so it match output shape - auto x_grad_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(x_grad->dims()), - mkldnn::memory::format::blocked); - x_grad->set_mkldnn_prim_desc(x_grad_mem_pd); } }; diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 1801f2915e09b5ac6ee1ee27726e66d26c9c6a8f..7cb213e89958e017c62d7cded261570307d3e64b 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/multiplex_op.h" +#include +#include namespace paddle { namespace operators { @@ -111,28 +113,47 @@ class MultiplexGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(!ctx->Inputs("X").empty(), "Input(X) should not be null."); - PADDLE_ENFORCE(!ctx->Outputs(framework::GradVarName("X")).empty(), - "Output(X@Grad) should not be null."); + auto& dxs = ctx->Outputs(framework::GradVarName("X")); + PADDLE_ENFORCE(!dxs.empty(), "Output(X@Grad) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); - ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputsDim(framework::GradVarName("X"), + std::vector(dxs.size(), dout_dim)); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.MultiInput("X")[0]->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); + } +}; + +class MultiplexGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("multiplex_grad"); + op->SetInput("Ids", Input("Ids")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + op->SetAttrMap(Attrs()); + return op; } }; } // namespace operators } // namespace paddle + namespace ops = paddle::operators; REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::MultiplexGradDescMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OP_CPU_KERNEL( multiplex, diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu index 2f8a602f3c5c0a7c262235f99943ce336e20a7b4..1ef54ecc732f3d2098ed51d955f8feed4cb1a821 100644 --- a/paddle/fluid/operators/multiplex_op.cu +++ b/paddle/fluid/operators/multiplex_op.cu @@ -53,20 +53,25 @@ class MultiplexGradGPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto ins = ctx.MultiInput("X"); auto* ids = ctx.Input("Ids"); auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); + + size_t idx = -1UL; for (size_t i = 0; i < d_ins.size(); i++) { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); t.device(*ctx.template device_context().eigen_device()) = t.constant(static_cast(0)); + + idx = i; } } - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; + if (idx == -1UL) return; + + auto rows = d_ins[idx]->dims()[0]; + auto cols = d_ins[idx]->numel() / rows; // copy index to cpu Tensor index_t_cpu; TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h index 87de000971941c39ee84e1bca46e2cd18e262fd8..44d6cc84a6493a326257d96f19b43c83c62f7b31 100644 --- a/paddle/fluid/operators/multiplex_op.h +++ b/paddle/fluid/operators/multiplex_op.h @@ -52,20 +52,25 @@ class MultiplexGradCPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const { auto* d_out = ctx.Input(framework::GradVarName("Out")); auto* ids = ctx.Input("Ids"); - auto ins = ctx.MultiInput("X"); auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); + + size_t idx = -1UL; for (size_t i = 0; i < d_ins.size(); i++) { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); t.device(*ctx.template device_context().eigen_device()) = t.constant(static_cast(0)); + + idx = i; } } - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; + if (idx == -1UL) return; + + auto rows = d_ins[idx]->dims()[0]; + auto cols = d_ins[idx]->numel() / rows; auto* index = ids->data(); platform::CPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index d4b631a6f5bf9332f4ed1d1a4bda529fbb6ada0a..c28106d31273cb54e3974d186296644272d2014c 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/pad_op.h" +#include namespace paddle { namespace operators { @@ -29,7 +30,7 @@ class PadOp : public framework::OperatorWithKernel { "Output(Out) of PadOp should not be null."); auto x_dim = ctx->GetInputDim("X"); - auto paddings = ctx->Attrs().Get>("paddings"); + auto& paddings = ctx->Attrs().Get>("paddings"); PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()), "Size of paddings should be equal to 2 * dimension size " "of input tensor."); @@ -99,13 +100,20 @@ class PadOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); - auto x_dims = ctx->GetInputDim("X"); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto& paddings = ctx->Attrs().Get>("paddings"); + for (int i = 0; i < dout_dims.size(); ++i) { + dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]); + } + auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto& paddings = ctx->Attrs().Get>("paddings"); + for (int i = 0; i < dout_dims.size(); ++i) { + dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]); + } + ctx->SetOutputDim(x_grad_name, dout_dims); } } }; @@ -117,7 +125,6 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker { protected: std::unique_ptr Apply() const override { auto* bind = new framework::OpDesc(); - bind->SetInput("X", Input("X")); bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index 78989582b7a0da5b7ff326cea1606df9993bed4c..dce9108eb17d76cfdf1c1b2313d975fd9fbdf9a7 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/psroi_pool_op.h" +#include namespace paddle { namespace operators { @@ -154,12 +155,29 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel { } }; +class PSROIPoolGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("psroi_pool_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::PSROIPoolGradDescMaker); REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); REGISTER_OP_CPU_KERNEL( psroi_pool, diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 313cf01541dd88a0f4f8bf54fe4436984c2cbcf8..45daa6b955639e3695211c1032869c743ede9b2c 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/rank_loss_op.h" +#include #include namespace paddle { @@ -116,6 +117,25 @@ class RankLossGradOp : public framework::OperatorWithKernel { } }; +class RankLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("rank_loss_grad"); + op->SetInput("Label", Input("Label")); + op->SetInput("Left", Input("Left")); + op->SetInput("Right", Input("Right")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("Left"), InputGrad("Left")); + op->SetOutput(framework::GradVarName("Right"), InputGrad("Right")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 6857b5ed9dbccb06a71063c3da9045e1f79ef6f6..7bb10ce063109dbd8520430d2b32ac9370ef8d25 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/roi_align_op.h" +#include namespace paddle { namespace operators { @@ -147,12 +148,29 @@ Thus avoid the misaligned problem. } }; +class ROIAlignGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("roi_align_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ROIAlignGradDescMaker); REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp); REGISTER_OP_CPU_KERNEL( roi_align, diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index e46d92d6fc3a9830535a8bb07824b26b92a5dbde..cfac7e09e123c43204454adacb87a7c3c158690e 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/roi_pool_op.h" +#include namespace paddle { namespace operators { @@ -158,12 +159,30 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn } }; +class ROIPoolGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("roi_pool_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput("Argmax", Output("Argmax")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ROIPoolGradDescMaker); REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); REGISTER_OP_CPU_KERNEL( roi_pool, diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index ad418d51bcdb0e9e7959961bdf344a80f85c3f17..8e0e3bd6054018852b242d1dba5c250394ed81ce 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/scatter_op.h" +#include #include "paddle/fluid/framework/ddim.h" namespace paddle { @@ -63,14 +64,16 @@ class ScatterGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("Updates"), ctx->GetInputDim("Updates")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -95,12 +98,34 @@ $$ } }; +class ScatterGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("scatter_grad"); + op->SetInput("Ids", Input("Ids")); + op->SetInput("Updates", Input("Updates")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Updates"), InputGrad("Updates")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ScatterGradNoNeedBufferVarsInference, + "Updates"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp); + ops::ScatterGradDescMaker); +REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp, + ops::ScatterGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel); REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 9349912e090f2ad3248923c87b50c8d72b0d84d1..26355e58615454c8e9aea1d6a5405368e6006e87 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/shuffle_channel_op.h" +#include namespace paddle { namespace operators { @@ -91,13 +92,28 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { } }; +class ShuffleChannelGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("shuffle_channel_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp, - ops::ShuffleChannelOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ShuffleChannelOpMaker, ops::ShuffleChannelGradDescMaker); REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp); diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc index 357d055756523cd83bf0e4b30719155b32c65974..04f659a465a345653d251cbe6703309c804fe614 100644 --- a/paddle/fluid/operators/spectral_norm_op.cc +++ b/paddle/fluid/operators/spectral_norm_op.cc @@ -10,6 +10,9 @@ limitations under the License. */ #include "paddle/fluid/operators/spectral_norm_op.h" + +#include + #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -156,6 +159,28 @@ class SpectralNormOpMaker : public framework::OpProtoAndCheckerMaker { } }; +class SpectralNormGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("spectral_norm_grad"); + + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Weight", Input("Weight")); + op->SetInput("U", Input("U")); + op->SetInput("V", Input("V")); + + op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight")); + + op->SetAttrMap(Attrs()); + + return op; + } +}; + class SpectralNormOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -185,7 +210,7 @@ class SpectralNormOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(spectral_norm, ops::SpectralNormOp, ops::SpectralNormOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::SpectralNormGradOpDescMaker); REGISTER_OPERATOR(spectral_norm_grad, ops::SpectralNormOpGrad); REGISTER_OP_CPU_KERNEL( spectral_norm, diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 4fa6774f028bef901f6e11f2d3dafe52a10a548e..ecaad4ec070fe60a522839e0718c424a441dec0b 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include #include "paddle/fluid/framework/data_layout_transform.h" @@ -39,45 +40,6 @@ class MKLDNNHandler { return this->AcquireMemory(md, ptr, "@user_src_mem_p"); } - // TODO(jczaja): extract common part and make AcquireMemory - std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::primitive_desc& mpd, void* ptr) { - auto local_key = key_ + "@user_src_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - " find mem primitive in device context"); - if (mem_p == nullptr) { - mem_p = std::make_shared(mpd, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. So we check consistency - is_reusing_ = true; - } - return mem_p; - } - - std::shared_ptr AcquireWeightsMemory( - const mkldnn::memory::primitive_desc& mpd, void* ptr) { - auto local_key = key_ + "@user_weights_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - " find mem primitive in device context"); - if (mem_p == nullptr) { - mem_p = std::make_shared(mpd, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. So we check consistency - is_reusing_ = true; - } - return mem_p; - } - std::shared_ptr AcquireWeightsMemory( const mkldnn::memory::desc& md, void* ptr, user_function custom_func = {}) { @@ -315,7 +277,37 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), dims_(dims), - axis_(axis) {} + axis_(axis), + logical_axis_(dims.size(), 0) {} + + std::shared_ptr AcquireSrcMemory( + const mkldnn::memory::format& fmt, void* ptr) { + auto local_key = key_ + "@user_src_mem_p"; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + " find mem primitive in device context"); + if (mem_p == nullptr) { + // Make memory descriptor using input format, unless it + // cannot be trusted (nchw) then make up memory fmt manually + for (size_t i = 0; i < logical_axis_.size(); ++i) { + logical_axis_[i] = i; + } + auto src_md = fmt != mkldnn::memory::format::nchw + ? platform::MKLDNNMemDesc( + dims_, platform::MKLDNNGetDataType(), fmt) + : Axis2MemoryDesc(dims_, logical_axis_); + mem_p = std::make_shared( + mkldnn::memory::primitive_desc{src_md, engine_}, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } std::shared_ptr AcquireDstMemory(framework::Tensor* output, platform::Place place) { @@ -400,6 +392,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { private: std::vector dims_; std::vector axis_; + std::vector logical_axis_; }; template diff --git a/paddle/fluid/platform/mkldnn_utils.h b/paddle/fluid/platform/mkldnn_utils.h deleted file mode 100644 index 8c511f97d12cfe299ad5629eff1871e8d156c850..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/mkldnn_utils.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include - -namespace paddle { -namespace platform { - -inline mkldnn::memory::primitive_desc create_prim_desc_from_dims( - const std::vector& ltz, mkldnn::memory::format fmt, - mkldnn::memory::data_type data_type = mkldnn::memory::data_type::f32) { - mkldnn_memory_desc_t mem_fmt; - - mem_fmt.primitive_kind = mkldnn_memory; - mem_fmt.ndims = ltz.size(); - for (unsigned int i = 0; i < ltz.size(); ++i) { - mem_fmt.dims[i] = ltz[i]; // logical dimensions (nchw format, - // regardless physical layout) - } - mem_fmt.data_type = static_cast(data_type); - mem_fmt.format = static_cast(fmt); - - unsigned int total_stride = 1; - for (int i = ltz.size() - 1; i >= 0; --i) { - mem_fmt.layout_desc.blocking.padding_dims[i] = - ltz[i]; // logical dimensions (nchw format, regardless physical - // layout) - mem_fmt.layout_desc.blocking.block_dims[i] = 1; - mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset - mem_fmt.layout_desc.blocking.strides[0][i] = total_stride; - mem_fmt.layout_desc.blocking.strides[1][i] = 1; - total_stride *= ltz[i]; - } - mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset - - auto& pool = platform::DeviceContextPool::Instance(); - auto place = paddle::platform::CPUPlace(); - auto* dev_ctx = dynamic_cast(pool.Get(place)); - auto& cpu_engine = dev_ctx->GetEngine(); - return mkldnn::memory::primitive_desc(mem_fmt, cpu_engine); -} - -inline mkldnn::memory::primitive_desc create_prim_desc_from_format( - const std::vector& ltz, const mkldnn::memory::format format, - const mkldnn::memory::data_type data_type) { - auto md = mkldnn::memory::desc({ltz}, data_type, format); - auto& pool = platform::DeviceContextPool::Instance(); - auto place = paddle::platform::CPUPlace(); - auto dev_ctx = dynamic_cast(pool.Get(place)); - PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device"); - auto& cpu_engine = dev_ctx->GetEngine(); - return mkldnn::memory::primitive_desc(md, cpu_engine); -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc index ddde7baf4cf3b44ac5d8a22fcc98acef50294575..d489ed5368ed95a1a0a8b0d6759310501cd49fcd 100644 --- a/paddle/fluid/platform/temporary_allocator.cc +++ b/paddle/fluid/platform/temporary_allocator.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include +#include #include "paddle/fluid/memory/allocation/allocator_facade.h" DEFINE_int64(limit_of_tmp_allocation, -1, @@ -30,31 +31,38 @@ namespace paddle { namespace platform { namespace alloc = memory::allocation; +TemporaryAllocation::TemporaryAllocation( + alloc::AllocationPtr &&underlying_allocation) + : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), + underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)) {} + TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { - temp_mem_map_.reset(new std::multimap()); + temp_mem_map_.reset(new std::multimap()); } bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } void TemporaryAllocator::Release(const std::function &callback) { - std::unique_ptr> t_allocations; + std::unique_ptr> t_allocations; { std::unique_lock lock(mtx_); callback(); t_allocations.swap(temp_mem_map_); - temp_mem_map_.reset(new std::multimap()); + temp_mem_map_.reset(new std::multimap()); wait_delete_mem_ = 0; } - alloc::AllocationDeleter deleter; for (auto tmp : *t_allocations) { VLOG(10) << "Delete temporary allocation " << tmp.second->ptr() << " size: " << tmp.second->size(); - deleter(tmp.second); + delete tmp.second; } } -void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { +void TemporaryAllocator::Free(alloc::Allocation *allocation) { + auto *temp_allocation = dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(temp_allocation); if (platform::is_gpu_place(temp_allocation->place())) { PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_), "The place should be the same."); @@ -78,7 +86,7 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { } VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr() << " size: " << temp_allocation->size(); - alloc::AllocationDeleter()(temp_allocation); + delete temp_allocation; } size_t TemporaryAllocator::TemporaryAllocationQueueSize() { @@ -113,9 +121,11 @@ alloc::Allocation *TemporaryAllocator::AllocateImpl( } // If not find the the available allocation, get allocation from // AllocatorFacadeInstance. - auto temp_mem = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); + auto raw_allocation = + alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); + auto temp_mem = new TemporaryAllocation(std::move(raw_allocation)); VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size; - return temp_mem.release(); + return temp_mem; } } // namespace platform diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h index 912d45eaf17fe8c05840995275dd3e2e688b38ef..f8a43b889d58d5e027aac8e08324cf51b7d82913 100644 --- a/paddle/fluid/platform/temporary_allocator.h +++ b/paddle/fluid/platform/temporary_allocator.h @@ -23,6 +23,14 @@ namespace paddle { namespace platform { +class TemporaryAllocation : public memory::allocation::Allocation { + public: + explicit TemporaryAllocation( + memory::allocation::AllocationPtr &&underlying_allocation); + + memory::allocation::AllocationPtr underlying_allocation_; +}; + /*! \brief the TemporaryAllocator is used to alloc the temporary allocation * which used by CUDA's async operation. * @@ -49,7 +57,7 @@ class TemporaryAllocator : public memory::allocation::Allocator { void SetCallback(const std::function &callback); protected: - void FreeImpl(memory::allocation::Allocation *allocation) override; + void Free(memory::allocation::Allocation *allocation) override; memory::allocation::Allocation *AllocateImpl( size_t size, memory::allocation::Allocator::Attr attr) override; @@ -58,8 +66,8 @@ class TemporaryAllocator : public memory::allocation::Allocator { platform::Place place_; // When the allocation is not held by any variable, it should be placed // to temp_mem_map immediately. - std::unique_ptr> - temp_mem_map_{nullptr}; + std::unique_ptr> temp_mem_map_{ + nullptr}; std::mutex mtx_; size_t wait_delete_mem_{0}; std::function callback_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3b0939ef820918e1b5099551b1d90f9e882d084b..fa978f1c99b144708c660b537142fb56354c9e6b 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -324,7 +324,6 @@ PYBIND11_MODULE(core, m) { [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { self.mutable_data(place); }) - .def("_clear", &Tensor::clear) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) @@ -1046,9 +1045,7 @@ All parameter, weight, gradient are variables in Paddle. int val) { self.Set(name, new int(val)); }) .def("type", &ir::Pass::Type) .def("apply", [](ir::Pass &self, std::shared_ptr graph) { - std::unique_ptr origin_graph(graph.get()); - auto optim_graph = self.Apply(std::move(origin_graph)); - optim_graph.release(); + self.Apply(graph.get()); }); py::class_> pb( @@ -1285,6 +1282,15 @@ All parameter, weight, gradient are variables in Paddle. it will save GPU memory and may make the execution faster. This options is only available in GPU devices. Default False)DOC") + .def_property("fuse_all_optimizer_ops", + [](const BuildStrategy &self) { + return self.fuse_all_optimizer_ops_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), + "BuildStrategy is finlaized."); + self.fuse_all_optimizer_ops_ = b; + }) .def_property( "sync_batch_norm", [](const BuildStrategy &self) { return self.sync_batch_norm_; }, diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 66b768665b6d0b97b4ca1470020132bfc9576bbb..16bb3771f2e9bcc07028ef2039fed8691f9aab97 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -105,12 +105,14 @@ void Printf(const char* fmt, const Args&... args) { Fprintf(std::cout, fmt, args...); } -inline std::string HumanReadableSize(double f_size) { +template +std::string HumanReadableSize(T size) { size_t i = 0; + double f_size = static_cast(size); double orig = f_size; const std::vector units( {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}); - while (f_size >= 1024) { + while (f_size > 1024) { f_size /= 1024; i++; } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 18f01ca1374a24cec3bf882d347596dd38f4fd21..24c8a6934fe355b2de388e7f90b6e40d4871f0d8 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -34,7 +34,7 @@ from . import io from . import evaluator from . import initializer from . import layers -from . import imperative +from . import dygraph from . import contrib from . import nets from . import optimizer @@ -71,7 +71,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'initializer', 'layers', 'contrib', - 'imperative', + 'dygraph', 'transpiler', 'nets', 'optimizer', diff --git a/python/paddle/fluid/contrib/model_stat.py b/python/paddle/fluid/contrib/model_stat.py new file mode 100644 index 0000000000000000000000000000000000000000..0d974c8d9685840c79de17f297fcba00b01a6c35 --- /dev/null +++ b/python/paddle/fluid/contrib/model_stat.py @@ -0,0 +1,194 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Example: + >>from paddle.fluid.contrib.model_stat import summary + >>main_program = ... + >>summary(main_program) + +-----+------------+----------------+----------------+---------+------------+ + | No. | TYPE | INPUT | OUTPUT | PARAMs | FLOPs | + +-----+------------+----------------+----------------+---------+------------+ + | 0 | conv2d | (3, 200, 200) | (64, 100, 100) | 9408 | 188160000 | + | 1 | batch_norm | (64, 100, 100) | (64, 100, 100) | 256 | 640000 | + | 2 | relu | (64, 100, 100) | (64, 100, 100) | 0 | 640000 | + | 3 | pool2d | (64, 100, 100) | (64, 50, 50) | 0 | 1440000 | + ... + | 176 | conv2d | (512, 7, 7) | (512, 7, 7) | 2359296 | 231211008 | + | 177 | relu | (512, 7, 7) | (512, 7, 7) | 0 | 25088 | + | 178 | conv2d | (512, 7, 7) | (2048, 7, 7) | 1048576 | 102760448 | + | 179 | relu | (2048, 7, 7) | (2048, 7, 7) | 0 | 100352 | + | 180 | pool2d | (2048, 7, 7) | (2048, 1, 1) | 0 | 100352 | + +-----+------------+----------------+----------------+---------+------------+ + Total PARAMs: 48017344(0.0480G) + Total FLOPs: 11692747751(11.69G) +''' +from collections import OrderedDict +from prettytable import PrettyTable + + +def summary(main_prog): + ''' + It can summary model's PARAMS, FLOPs until now. + It support common operator like conv, fc, pool, relu, sigmoid, bn etc. + Args: + main_prog: main program + Returns: + print summary on terminal + ''' + collected_ops_list = [] + for one_b in main_prog.blocks: + block_vars = one_b.vars + for one_op in one_b.ops: + op_info = OrderedDict() + spf_res = _summary_model(block_vars, one_op) + if spf_res is None: + continue + # TODO: get the operator name + op_info['type'] = one_op.type + op_info['input_shape'] = spf_res[0][1:] + op_info['out_shape'] = spf_res[1][1:] + op_info['PARAMs'] = spf_res[2] + op_info['FLOPs'] = spf_res[3] + collected_ops_list.append(op_info) + + summary_table, total = _format_summary(collected_ops_list) + _print_summary(summary_table, total) + + +def _summary_model(block_vars, one_op): + ''' + Compute operator's params and flops. + Args: + block_vars: all vars of one block + one_op: one operator to count + Returns: + in_data_shape: one operator's input data shape + out_data_shape: one operator's output data shape + params: one operator's PARAMs + flops: : one operator's FLOPs + ''' + if one_op.type in ['conv2d', 'depthwise_conv2d']: + k_arg_shape = block_vars[one_op.input("Filter")[0]].shape + in_data_shape = block_vars[one_op.input("Input")[0]].shape + out_data_shape = block_vars[one_op.output("Output")[0]].shape + c_out, c_in, k_h, k_w = k_arg_shape + _, c_out_, h_out, w_out = out_data_shape + assert c_out == c_out_, 'shape error!' + k_groups = one_op.attr("groups") + kernel_ops = k_h * k_w * (c_in / k_groups) + bias_ops = 0 if one_op.input("Bias") == [] else 1 + params = c_out * (kernel_ops + bias_ops) + flops = h_out * w_out * c_out * (kernel_ops + bias_ops) + # base nvidia paper, include mul and add + flops = 2 * flops + + elif one_op.type == 'pool2d': + in_data_shape = block_vars[one_op.input("X")[0]].shape + out_data_shape = block_vars[one_op.output("Out")[0]].shape + _, c_out, h_out, w_out = out_data_shape + k_size = one_op.attr("ksize") + params = 0 + flops = h_out * w_out * c_out * (k_size[0] * k_size[1]) + + elif one_op.type == 'mul': + k_arg_shape = block_vars[one_op.input("Y")[0]].shape + in_data_shape = block_vars[one_op.input("X")[0]].shape + out_data_shape = block_vars[one_op.output("Out")[0]].shape + # TODO: fc has mul ops + # add attr to mul op, tell us whether it belongs to 'fc' + # this's not the best way + if 'fc' not in one_op.output("Out")[0]: + return None + k_in, k_out = k_arg_shape + # bias in sum op + params = k_in * k_out + 1 + flops = k_in * k_out + + elif one_op.type in ['sigmoid', 'tanh', 'relu', 'leaky_relu', 'prelu']: + in_data_shape = block_vars[one_op.input("X")[0]].shape + out_data_shape = block_vars[one_op.output("Out")[0]].shape + params = 0 + if one_op.type == 'prelu': + params = 1 + flops = 1 + for one_dim in in_data_shape: + flops *= one_dim + + elif one_op.type == 'batch_norm': + in_data_shape = block_vars[one_op.input("X")[0]].shape + out_data_shape = block_vars[one_op.output("Y")[0]].shape + _, c_in, h_out, w_out = in_data_shape + # gamma, beta + params = c_in * 2 + # compute mean and std + flops = h_out * w_out * c_in * 2 + + else: + return None + + return in_data_shape, out_data_shape, params, flops + + +def _format_summary(collected_ops_list): + ''' + Format summary report. + Args: + collected_ops_list: the collected operator with summary + Returns: + summary_table: summary report format + total: sum param and flops + ''' + summary_table = PrettyTable( + ["No.", "TYPE", "INPUT", "OUTPUT", "PARAMs", "FLOPs"]) + summary_table.align = 'r' + + total = {} + total_params = [] + total_flops = [] + for i, one_op in enumerate(collected_ops_list): + # notice the order + table_row = [ + i, + one_op['type'], + one_op['input_shape'], + one_op['out_shape'], + int(one_op['PARAMs']), + int(one_op['FLOPs']), + ] + summary_table.add_row(table_row) + total_params.append(int(one_op['PARAMs'])) + total_flops.append(int(one_op['FLOPs'])) + + total['params'] = total_params + total['flops'] = total_flops + + return summary_table, total + + +def _print_summary(summary_table, total): + ''' + Print all the summary on terminal. + Args: + summary_table: summary report format + total: sum param and flops + ''' + parmas = total['params'] + flops = total['flops'] + print(summary_table) + print('Total PARAMs: {}({:.4f}M)'.format( + sum(parmas), sum(parmas) / (10**6))) + print('Total FLOPs: {}({:.2f}G)'.format(sum(flops), sum(flops) / 10**9)) + print( + "Notice: \n now supported ops include [Conv, DepthwiseConv, FC(mul), BatchNorm, Pool, Activation(sigmoid, tanh, relu, leaky_relu, prelu)]" + ) diff --git a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py index 7388ecd3b096fc05d1420b904f2d65d805c3fc53..e7f5f0d6a2185521549abe7af7b6be2b0b7d90fb 100644 --- a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py +++ b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py @@ -204,6 +204,10 @@ class GraphWrapper(object): """ super(GraphWrapper, self).__init__() self.program = Program() if program is None else program + self.persistables = {} + for var in self.program.list_vars(): + if var.persistable: + self.persistables[var.name] = var self.compiled_graph = None self.in_nodes = OrderedDict(in_nodes) self.out_nodes = OrderedDict(out_nodes) @@ -467,7 +471,12 @@ class GraphWrapper(object): path(str): The path to save the persistables. exe(framework.Executor): The executor used to save the persistables. """ - io.save_persistables(exe.exe, path, main_program=self.program) + # update persistables from program + for var in self.program.list_vars(): + if var.persistable and var.name not in self.persistables: + self.persistables[var.name] = var + + io.save_vars(exe.exe, path, vars=self.persistables.values()) def load_persistables(self, path, exe): """ @@ -481,7 +490,7 @@ class GraphWrapper(object): return os.path.exists(os.path.join(path, var.name)) io.load_vars( - exe.exe, path, main_program=self.program, predicate=if_exist) + exe.exe, path, vars=self.persistables.values(), predicate=if_exist) def update_param_shape(self, scope): """ diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index ab3bd8bd182c7e933c58e2ba2f3548f2d001cbdb..3809e327943832571a1bde6a53a0a6e7fbd13bdd 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -26,6 +26,17 @@ __all__ = [ ] +def _init_var_node(var_node, value, scope, place): + assert isinstance(value, + np.ndarray), 'The type of value should be numpy array.' + assert scope is not None, \ + 'The scope cannot be set None.' + assert place is not None, \ + 'The place cannot be set None.' + tensor = scope.var(var_node.name()).get_tensor() + tensor.set(value, place) + + class QuantizationTransformPass(object): def __init__(self, scope=None, @@ -88,14 +99,14 @@ class QuantizationTransformPass(object): assert activation_quantize_type != 'channel_wise_abs_max', "The activation quantization type does not support 'channel_wise_abs_max'." if activation_quantize_type not in quant_type: raise ValueError( - "Unknown activation_quantize_type : '%s'. It can only be ", - "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", - str(activation_quantize_type)) + "Unknown activation_quantize_type : '%s'. It can only be " + "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'." % + (str(activation_quantize_type))) if weight_quantize_type not in quant_type: raise ValueError( - "Unknown weight_quantize_type: '%s'. It can only be ", - "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", - str(weight_quantize_type)) + "Unknown weight_quantize_type: '%s'. It can only be " + "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'." + % (str(weight_quantize_type))) self._activation_quantize_type = activation_quantize_type self._weight_quantize_type = weight_quantize_type @@ -121,8 +132,6 @@ class QuantizationTransformPass(object): """ assert isinstance(graph, IrGraph), 'graph must be the instance of IrGraph.' - #sequential_execution = core.get_pass('sequential_execution_pass') - #sequential_execution.apply(graph.graph) self._is_test = graph.is_test() # marked the variable which has been dequantized. dequantized_vars = collections.OrderedDict() @@ -203,9 +212,12 @@ class QuantizationTransformPass(object): var_type=core.VarDesc.VarType.LOD_TENSOR, shape=[1], var_dtype=core.VarDesc.VarType.INT64) - self._init_var_node( - global_step_in, np.zeros( - [1], dtype='int64')) + _init_var_node( + global_step_in, + np.zeros( + [1], dtype='int64'), + self._scope, + self._place) global_step_out = graph.create_var_node_from_desc( global_step_in.var()) # The attribute of `op_role` is needed by ParallelExecutor. @@ -284,7 +296,12 @@ class QuantizationTransformPass(object): var_dtype=var_node.dtype()) data_type = 'float64' if var_node.dtype( ) == core.VarDesc.VarType.FP64 else 'float32' - self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type)) + _init_var_node( + scale_in_node, + np.array( + [0.001], dtype=data_type), + self._scope, + self._place) scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) inputs = {'X': var_node, 'InScale': scale_in_node} @@ -299,9 +316,13 @@ class QuantizationTransformPass(object): var_dtype=var_node.dtype()) data_type = 'float64' if var_node.dtype( ) == core.VarDesc.VarType.FP64 else 'float32' - self._init_var_node( - scales_node, np.zeros( - [self._window_size], dtype=data_type)) + _init_var_node( + scales_node, + np.zeros( + [self._window_size], dtype=data_type), + self._scope, + self._place) + inputs['Iter'] = self._global_step outputs['OutScales'] = scales_node attrs = { @@ -343,7 +364,12 @@ class QuantizationTransformPass(object): var_dtype=var_node.dtype()) data_type = 'float64' if var_node.dtype( ) == core.VarDesc.VarType.FP64 else 'float32' - self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type)) + _init_var_node( + scale_in_node, + np.array( + [0.001], dtype=data_type), + self._scope, + self._place) scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) ins = {'X': var_node, 'InScale': scale_in_node} @@ -356,13 +382,23 @@ class QuantizationTransformPass(object): shape=[1]) data_type = 'float64' if var_node.dtype( ) == core.VarDesc.VarType.FP64 else 'float32' - self._init_var_node(scale_in_node, np.ones([1], dtype=data_type)) + _init_var_node( + scale_in_node, + np.ones( + [1], dtype=data_type), + self._scope, + self._place) accum_in_node = graph.create_persistable_node( name=unique_name.generate('accum'), var_type=core.VarDesc.VarType.LOD_TENSOR, var_dtype=var_node.dtype(), shape=[1]) - self._init_var_node(accum_in_node, np.ones([1], dtype=data_type)) + _init_var_node( + accum_in_node, + np.ones( + [1], dtype=data_type), + self._scope, + self._place) state_out_node = graph.create_var_node_from_desc(state_in_node.var( )) accum_out_node = graph.create_var_node_from_desc(accum_in_node.var( @@ -482,16 +518,6 @@ class QuantizationTransformPass(object): graph.link_to(dequant_op_node, dequant_var_node) return dequant_var_node - def _init_var_node(self, var_node, value): - assert isinstance( - value, np.ndarray), 'The type of value should be numpy array.' - assert self._scope is not None, \ - 'The scope cannot be set None when activation_quantize_type equals to range_abs_max.' - assert self._place is not None, \ - 'The place cannot be set None when activation_quantize_type equals to range_abs_max.' - tensor = self._scope.var(var_node.name()).get_tensor() - tensor.set(value, self._place) - def _quantized_var_name(self, var_name): """ Return quantized variable name for the input `var_name`. @@ -594,8 +620,8 @@ class QuantizationFreezePass(object): self._weight_bits) self._restore_var(input_arg_name, quantized_param_v) else: - scale_v = self._to_node(op_node.outputs, - op_node.output('OutScale')[0]) + scale_v = graph._find_node_by_name( + op_node.outputs, op_node.output('OutScale')[0]) self._var_scale_map[input_arg_name] = scale_v ops = graph.all_op_nodes() @@ -627,8 +653,8 @@ class QuantizationFreezePass(object): return graph def _remove_fake_quant_and_dequant_op(self, graph, op_node): - k = self._to_node(op_node.outputs, op_node.output('Out')[0]) - v = self._to_node(op_node.inputs, op_node.input('X')[0]) + k = graph._find_node_by_name(op_node.outputs, op_node.output('Out')[0]) + v = graph._find_node_by_name(op_node.inputs, op_node.input('X')[0]) if v.node not in self._op_input_rename_map: self._op_input_rename_map[k.node] = v else: @@ -663,8 +689,8 @@ class QuantizationFreezePass(object): raise ValueError("Only support one output, but op %s has" " more than one output." % (op_node.name())) - output_var_node = self._to_node(op_node.outputs, - op_node.output_arg_names()[0]) + output_var_node = graph._find_node_by_name( + op_node.outputs, op_node.output_arg_names()[0]) weight_scale_node = graph.create_persistable_node( name=unique_name.generate('channel_scale'), var_type=core.VarDesc.VarType.LOD_TENSOR, @@ -672,7 +698,9 @@ class QuantizationFreezePass(object): var_dtype=output_var_node.dtype()) data_type = 'float64' if output_var_node.dtype( ) == core.VarDesc.VarType.FP64 else 'float32' - self._init_var_node(weight_scale_node, channel_scale.astype(data_type)) + _init_var_node(weight_scale_node, + channel_scale.astype(data_type), self._scope, + self._place) dequant_var_node = graph.create_var_node( name=self._dequantized_var_name(output_var_node.name()), var_type=output_var_node.type(), @@ -724,8 +752,8 @@ class QuantizationFreezePass(object): raise ValueError("Only support one output, but op %s has" " more than one output." % (op_node.name())) - output_var_node = self._to_node(op_node.outputs, - op_node.output_arg_names()[0]) + output_var_node = graph._find_node_by_name( + op_node.outputs, op_node.output_arg_names()[0]) dequant_var_node = graph.create_var_node( name=self._dequantized_var_name(output_var_node.name()), var_type=output_var_node.type(), @@ -746,24 +774,6 @@ class QuantizationFreezePass(object): self._op_output_rename_map[output_var_node.node] = dequant_var_node return dequant_var_node - def _init_var_node(self, var_node, value): - assert isinstance( - value, np.ndarray), 'The type of value should be numpy array.' - assert self._scope is not None, \ - 'The scope cannot be set None when activation_quantize_type equals to range_abs_max.' - assert self._place is not None, \ - 'The place cannot be set None when activation_quantize_type equals to range_abs_max.' - tensor = self._scope.var(var_node.name()).get_tensor() - tensor.set(value, self._place) - - def _to_node(self, nodes, node_name): - target_node = None - for n in nodes: - if n.name() == node_name: - target_node = n - assert target_node is not None, "Cannot find the target node in the giving set." - return target_node - def _load_var(self, name): return np.array(self._scope.find_var(name).get_tensor()) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py index 6812b4c633d5b55d84fff935b696297f30b18c6b..a22b6da020510838dc82fe7af87ab62db6e874ef 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py @@ -20,7 +20,7 @@ from .... import io from .... import core from ....compiler import CompiledProgram from ....compiler import BuildStrategy -from ....framework import IrGraph +from ....framework import IrGraph, Variable, Program from ..core.strategy import Strategy from .quantization_pass import * @@ -45,13 +45,14 @@ class QuantizationStrategy(Strategy): activation_bits=8, weight_bits=8, activation_quantize_type='abs_max', + weight_quantize_type='abs_max', save_in_nodes=None, save_out_nodes=None): """ Args: start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0 end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0 - float_model_save_path(str): The path to save model with float weights. + float_model_save_path(str): The path to save model with float weights. None means it doesn't save float model. defalut: None. mobile_model_save_path(str): The path to save model for paddle-mobile execution. None means it doesn't save mobile model. defalut: None. @@ -66,9 +67,11 @@ class QuantizationStrategy(Strategy): dynamically each step in both training and testing period. If use 'range_abs_max', a static quantization scale will be calculated during training and used in inference. - save_in_nodes(list): A list of variable names used to prune graph + weight_quantize_type (str): quantization type for weights, support 'abs_max' and 'channel_wise_abs_max'. + The 'range_abs_max' usually is not used for weight, since weights are fixed once the model is well trained. + save_in_nodes(list): A list of variable names used to prune graph for saving inference model. - save_out_nodes(list): A list of variable names used to prune graph + save_out_nodes(list): A list of variable names used to prune graph for saving inference model. """ @@ -81,43 +84,80 @@ class QuantizationStrategy(Strategy): self.activation_bits = activation_bits self.weight_bits = weight_bits self.activation_quantize_type = activation_quantize_type + self.weight_quantize_type = weight_quantize_type self.save_out_nodes = save_out_nodes self.save_in_nodes = save_in_nodes + def on_compression_begin(self, context): + """ + Restore graph when the compressoin task is inited from checkpoint. + """ + # It is inited from checkpoint and has missed start epoch. + if context.epoch_id != 0 and context.epoch_id > self.start_epoch: + _logger.info("Restore quantization task from checkpoint") + self._modify_graph_for_quantization(context) + _logger.info("Finish restoring quantization task from checkpoint") + + def _modify_graph_for_quantization(self, context): + """ + Insert fake_quantize_op and fake_dequantize_op before trainging and testing. + """ + train_ir_graph = IrGraph( + core.Graph(context.optimize_graph.program.clone().desc), + for_test=False) + test_ir_graph = IrGraph( + core.Graph(context.eval_graph.program.clone().desc), for_test=True) + transform_pass = QuantizationTransformPass( + scope=context.scope, + place=context.place, + weight_bits=self.weight_bits, + activation_bits=self.activation_bits, + activation_quantize_type=self.activation_quantize_type, + weight_quantize_type=self.weight_quantize_type) + transform_pass.apply(train_ir_graph) + transform_pass.apply(test_ir_graph) + # Put persistables created by transform_pass into context.optimize_graph.persistables + # for saving checkpoint. + program_persistables = set() + for var in context.optimize_graph.program.list_vars(): + if var.persistable: + program_persistables.add(var.name) + + program = Program() + for var_node in train_ir_graph.all_persistable_nodes(): + if var_node.name() not in program_persistables: + var_desc = var_node.var() + var = program.global_block().create_var( + name=var_node.name(), + shape=var_desc.shape(), + dtype=var_desc.dtype(), + type=var_desc.type(), + lod_level=var_desc.lod_level()) + context.optimize_graph.persistables[var.name] = var + + build_strategy = BuildStrategy() + build_strategy.enable_inplace = False + build_strategy.memory_optimize = False + # for quantization training + context.optimize_graph.compiled_graph = CompiledProgram( + train_ir_graph.graph).with_data_parallel( + loss_name=context.optimize_graph.out_nodes['loss'], + build_strategy=build_strategy) + # for evaluation. And program compiled from ir graph must be with data parallel. + context.eval_graph.compiled_graph = CompiledProgram( + test_ir_graph.graph).with_data_parallel( + build_strategy=build_strategy) + # for saving inference model after training + context.put('quantization_test_ir_graph_backup', test_ir_graph) + def on_epoch_begin(self, context): """ Insert fake_quantize_op and fake_dequantize_op before trainging and testing. """ - super(QuantizationStrategy, self).on_compression_begin(context) + super(QuantizationStrategy, self).on_epoch_begin(context) if self.start_epoch == context.epoch_id: _logger.info('QuantizationStrategy::on_epoch_begin') - train_ir_graph = IrGraph( - core.Graph(context.optimize_graph.program.desc), for_test=False) - test_ir_graph = IrGraph( - core.Graph(context.eval_graph.program.desc), for_test=True) - transform_pass = QuantizationTransformPass( - scope=context.scope, - place=context.place, - weight_bits=self.weight_bits, - activation_bits=self.activation_bits, - activation_quantize_type=self.activation_quantize_type) - transform_pass.apply(train_ir_graph) - transform_pass.apply(test_ir_graph) - - build_strategy = BuildStrategy() - build_strategy.enable_inplace = False - build_strategy.memory_optimize = False - # for quantization training - context.optimize_graph.compiled_graph = CompiledProgram( - train_ir_graph.graph).with_data_parallel( - loss_name=context.optimize_graph.out_nodes['loss'], - build_strategy=build_strategy) - # for evaluation. And program compiled from ir graph must be with data parallel. - context.eval_graph.compiled_graph = CompiledProgram( - test_ir_graph.graph).with_data_parallel( - build_strategy=build_strategy) - # for saving inference model after training - context.put('quantization_test_ir_graph_backup', test_ir_graph) + self._modify_graph_for_quantization(context) _logger.info('Finish QuantizationStrategy::on_epoch_begin') def on_epoch_end(self, context): @@ -134,7 +174,8 @@ class QuantizationStrategy(Strategy): scope=context.scope, place=context.place, weight_bits=self.weight_bits, - activation_bits=self.activation_bits) + activation_bits=self.activation_bits, + weight_quantize_type=self.weight_quantize_type) freeze_pass.apply(test_ir_graph) # for other strategies @@ -152,7 +193,7 @@ class QuantizationStrategy(Strategy): ] if self.save_in_nodes == None: - in_vars = list(context.eval_graph.out_nodes.values()) + in_vars = list(context.eval_graph.in_nodes.values()) else: in_vars = self.save_in_nodes diff --git a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml index f29eb53f88d22d87b61f82279b676af5ec1ef497..a3a5a724fbfcac41ed4ab286caac184c2fe104ad 100644 --- a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml +++ b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml @@ -35,6 +35,8 @@ strategies: start_epoch: 0 end_epoch: 0 float_model_save_path: './output/float' + mobile_model_save_path: './output/mobile' + int8_model_save_path: './output/int8' weight_bits: 8 activation_bits: 8 weight_quantize_type: 'abs_max' diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py index c7feca0b82606cdba9a05fb6de821aa6d347d4e6..e896f8bb423a642bada043e3e578033d3bfdea90 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py @@ -256,8 +256,6 @@ class TestQuantizationFreezePass(unittest.TestCase): place=place, activation_quantize_type=activation_quant_type, weight_quantize_type=weight_quant_type) - #transform_pass = QuantizationTransformPass( - # scope=scope, place=place, activation_quantize_type=activation_quant_type) transform_pass.apply(main_graph) transform_pass.apply(test_graph) dev_name = '_gpu_' if use_cuda else '_cpu_' @@ -315,7 +313,6 @@ class TestQuantizationFreezePass(unittest.TestCase): # Freeze graph for inference, but the weight of fc/conv is still float type. freeze_pass = QuantizationFreezePass( scope=scope, place=place, weight_quantize_type=weight_quant_type) - #freeze_pass = QuantizationFreezePass(scope=scope, place=place) freeze_pass.apply(test_graph) if not for_ci: marked_nodes = set() diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/dygraph/__init__.py similarity index 100% rename from python/paddle/fluid/imperative/__init__.py rename to python/paddle/fluid/dygraph/__init__.py diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/dygraph/base.py similarity index 88% rename from python/paddle/fluid/imperative/base.py rename to python/paddle/fluid/dygraph/base.py index 097cd2be35b01aced30486b874f202381c4d9962..d55dbbb9c72cb887e169849c3a3e32a13c202a7b 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -22,7 +22,7 @@ __all__ = ['enabled', 'guard', 'to_variable'] def enabled(): - return framework._in_imperative_mode() + return framework._in_dygraph_mode() @signature_safe_contextmanager @@ -39,14 +39,14 @@ def guard(place=None): with framework.program_guard(train, startup): with framework.unique_name.guard(): - with framework._imperative_guard(tracer): - with framework._imperative_place_guard(place): + with framework._dygraph_guard(tracer): + with framework._dygraph_place_guard(place): yield def to_variable(value, block=None, name=None): if isinstance(value, np.ndarray): - assert enabled(), "to_variable could only be called in imperative mode" + assert enabled(), "to_variable could only be called in dygraph mode" if not block: block = framework.default_main_program().current_block() diff --git a/python/paddle/fluid/imperative/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py similarity index 93% rename from python/paddle/fluid/imperative/checkpoint.py rename to python/paddle/fluid/dygraph/checkpoint.py index 37c43f29d2ae9214058238e4f834dbbcd9e42df1..f992ae0576c81ed98a3e9f7a446b0c2e808622ea 100644 --- a/python/paddle/fluid/imperative/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -68,7 +68,7 @@ def save_persistables(vardict, dirname, filename=None): dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, init_cell) param_path = "./my_paddle_model" - fluid.imperative.checkpoint.save_persistables(ptb_model.state_dict(), dirname=param_path, + fluid.dygraph.save_persistables(ptb_model.state_dict(), dirname=param_path, layer=ptb_model) """ if isinstance(vardict, collections.OrderedDict): @@ -97,17 +97,17 @@ def load_persistables(vardict, dirname, filename=None): Examples: .. code-block:: python - my_layer = layer(fluid.imperative.Layer) + my_layer = layer(fluid.dygraph.Layer) param_path = "./my_paddle_model" - param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.parameters(), param_path) + param_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path) param_1 = param_dict['PtbModel_0.w_1'] or: - my_layer = layer(fluid.imperative.Layer) + my_layer = layer(fluid.dygraph.Layer) param_path = "./my_paddle_model" filename = "model.file" - param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.state_dict(), param_path, + param_dict = fluid.dygraph.load_persistables(my_layer.state_dict(), param_path, filename=filename) param_1 = param_dict['PtbModel_0.w_1'] diff --git a/python/paddle/fluid/imperative/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py similarity index 99% rename from python/paddle/fluid/imperative/layer_object_helper.py rename to python/paddle/fluid/dygraph/layer_object_helper.py index 3d4426e8cdfe79a6fa2d6452e7cb3ab0a458c0bc..c56652e103ce93bf5459b30b66c7b1f04e7c14d0 100644 --- a/python/paddle/fluid/imperative/layer_object_helper.py +++ b/python/paddle/fluid/dygraph/layer_object_helper.py @@ -16,7 +16,7 @@ from __future__ import print_function import copy import six -from ..framework import Parameter, _in_imperative_mode +from ..framework import Parameter, _in_dygraph_mode from ..param_attr import ParamAttr from .. import core from six.moves import zip diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/dygraph/layers.py similarity index 99% rename from python/paddle/fluid/imperative/layers.py rename to python/paddle/fluid/dygraph/layers.py index e64667f7f467d0d0a3c07d14ce22c3f231e82eb6..014ee41f4c5aa280fb5b366d8f1704290cc067d4 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -283,7 +283,7 @@ class PyLayer(core.PyLayer): @classmethod def __call__(cls, *inputs): - tracer = framework._imperative_tracer() + tracer = framework._dygraph_tracer() block = framework.default_main_program().current_block() ivar_inputs = [x._ivar for x in inputs] diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/dygraph/nn.py similarity index 99% rename from python/paddle/fluid/imperative/nn.py rename to python/paddle/fluid/dygraph/nn.py index 9856276b20b7affb548847d359463451bb238518..8925381119272d7462562c0952d3e157f78f25af 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -133,7 +133,7 @@ class Conv2D(layers.Layer): outputs={'Out': [pre_act]}, attrs={'axis': 1}) - # Currently, we don't support inplace in imperative mode + # Currently, we don't support inplace in dygraph mode return self._helper.append_activation(pre_act, act=self._act) @@ -265,7 +265,7 @@ class FC(layers.Layer): attrs={'axis': self._num_flatten_dims}) else: pre_activation = pre_bias - # Currently, we don't support inplace in imperative mode + # Currently, we don't support inplace in dygraph mode return self._helper.append_activation(pre_activation, act=self._act) @@ -387,7 +387,7 @@ class BatchNorm(layers.Layer): "use_global_stats": self._use_global_stats }) - # Currently, we don't support inplace in imperative mode + # Currently, we don't support inplace in dygraph mode return self._helper.append_activation(batch_norm_out, self._act) @@ -426,7 +426,7 @@ class Embedding(layers.Layer): dict_size = len(dataset.ids) input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') - embedding = fluid.imperative.Embedding(size=[dict_size, 16]) + embedding = fluid.dygraph.Embedding(size=[dict_size, 16]) fc = embedding(input) """ diff --git a/python/paddle/fluid/imperative/profiler.py b/python/paddle/fluid/dygraph/profiler.py similarity index 100% rename from python/paddle/fluid/imperative/profiler.py rename to python/paddle/fluid/dygraph/profiler.py diff --git a/python/paddle/fluid/imperative/tracer.py b/python/paddle/fluid/dygraph/tracer.py similarity index 95% rename from python/paddle/fluid/imperative/tracer.py rename to python/paddle/fluid/dygraph/tracer.py index 28c8586813410f7349da7943a966eaa9cc3816d2..94e212b139b2b375aa9f5252d396e90235ba33c1 100644 --- a/python/paddle/fluid/imperative/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -24,12 +24,12 @@ __all__ = ['Tracer'] def release_op(op): - del framework._imperative_tracer()._ops[op._trace_id] + del framework._dygraph_tracer()._ops[op._trace_id] class Tracer(core.Tracer): """ - Python wrapper of imperative tracer + Python wrapper of dygraph tracer """ def __init__(self, block): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4a5301b436e4eb4c634749e8be736fe43c958348..a49fafa97da45adc25ba7de6d2e5ff19f1a87fc4 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -75,20 +75,20 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() -_imperative_tracer_ = None -_imperative_current_expected_place_ = None +_dygraph_tracer_ = None +_dygraph_current_expected_place_ = None -def _in_imperative_mode(): - return _imperative_tracer_ is not None +def _in_dygraph_mode(): + return _dygraph_tracer_ is not None -def _imperative_tracer(): - return _imperative_tracer_ +def _dygraph_tracer(): + return _dygraph_tracer_ def _current_expected_place(): - return _imperative_current_expected_place_ + return _dygraph_current_expected_place_ def _cpu_num(): @@ -396,7 +396,7 @@ class Variable(object): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if _in_imperative_mode(): + if _in_dygraph_mode(): # record vars in tracer rather than blocks self._ivar = kwargs.get("ivar", None) if not self._ivar: @@ -406,7 +406,7 @@ class Variable(object): _current_expected_place(), stop_gradient, True if persistable else False) if persistable: - _imperative_tracer().trace_var(name, self) + _dygraph_tracer().trace_var(name, self) else: self.error_clip = error_clip @@ -515,8 +515,8 @@ class Variable(object): Returns: str: The debug string. """ - if _in_imperative_mode(): - # TODO(panyx0718): add more imperative debug info. + if _in_dygraph_mode(): + # TODO(panyx0718): add more dygraph debug info. return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype, self.shape) @@ -548,42 +548,42 @@ class Variable(object): @property def _stop_gradient(self): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.stop_gradient else: return self.stop_gradient @_stop_gradient.setter def _stop_gradient(self, s): - if _in_imperative_mode(): + if _in_dygraph_mode(): self._ivar.stop_gradient = s else: self.stop_gradient = s @property def persistable(self): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.persistable else: return self.desc.persistable() @persistable.setter def persistable(self, p): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.persistable else: self.desc.set_persistable(p) @property def name(self): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.name else: return cpt.to_text(self.desc.name()) @name.setter def name(self, new_name): - if _in_imperative_mode(): + if _in_dygraph_mode(): self._ivar.name = new_name else: self.desc.set_name(new_name) @@ -591,26 +591,26 @@ class Variable(object): @property def shape(self): # convert to tuple, make it as same as numpy API. - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.shape else: return tuple(self.desc.shape()) @property def dtype(self): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.dtype else: return self.desc.dtype() @property def lod_level(self): - # TODO(minqiyang): Support lod_level in imperative mode + # TODO(minqiyang): Support lod_level in dygraph mode return self.desc.lod_level() @property def type(self): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.dtype else: return self.desc.type() @@ -918,7 +918,7 @@ class Operator(object): inputs=None, outputs=None, attrs=None): - if _in_imperative_mode(): + if _in_dygraph_mode(): if type is None: raise ValueError( "`type` to initialized an Operator can not be None.") @@ -1037,7 +1037,7 @@ class Operator(object): for arg in out_args: out_arg_names.append(cpt.to_text(arg.name)) # TODO(minqiyang): could we remove variable's op in static mode? - if not _in_imperative_mode(): + if not _in_dygraph_mode(): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) @@ -1083,7 +1083,7 @@ class Operator(object): @property def type(self): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self.iop.type else: return self.desc.type() @@ -1626,7 +1626,7 @@ class Block(object): Returns: Operator: the append Operator. """ - if _in_imperative_mode(): + if _in_dygraph_mode(): op = Operator( block=self, desc=None, @@ -1638,9 +1638,8 @@ class Block(object): # record ops in tracer rather than blocks # # TODO(minqiyang): add op stop_gradient support in static mode too. - # currently, we only support stop_gradient in imperative mode. - _imperative_tracer().trace_op(op, - kwargs.get("stop_gradient", False)) + # currently, we only support stop_gradient in dygraph mode. + _dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False)) else: op_desc = self.desc.append_op() op = Operator( @@ -1699,7 +1698,7 @@ class Block(object): return self.ops[start:end] def _prepend_op(self, *args, **kwargs): - if _in_imperative_mode(): + if _in_dygraph_mode(): op = Operator( self, None, @@ -1707,8 +1706,7 @@ class Block(object): inputs=kwargs.get("inputs", None), outputs=kwargs.get("outputs", None), attrs=kwargs.get("attrs", None)) - _imperative_tracer().trace_op(op, - kwargs.get("stop_gradient", False)) + _dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False)) else: op_desc = self.desc._prepend_op() op = Operator( @@ -2347,40 +2345,6 @@ class IrGraph(object): """ return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()} - def _find_var_node(self, key): - """ - Get a variable node by the `key` from this graph. The key - can be a node name or a node id. - - WARNS: - There are some nodes may have the same name. So, be - cautious about using this method when you find the - target var node by its name. - - Args: - key(str|int): The str type denotes that the target variable node's name. - And the int type denotes that the target variable node's id. - - Raises: - ValueError: If this graph doesn't have a variable with the giving name or id. - - Returns: - IrVarNode: the variable node with the giving name or id. - """ - target_var_node = None - var_nodes = self.all_var_nodes() - if isinstance(key, six.string_types): - for var_node in var_nodes: - if var_node.name() == key: - target_var_node = var_node - elif isinstance(key, int): - for var_node in var_nodes: - if var_node.id() == key: - target_var_node = var_node - if target_var_node is None: - raise ValueError("var_node %s not in this graph" % key) - return target_var_node - def create_persistable_node(self, name, var_type, shape, var_dtype): """ Create a persistable variable node in the graph. In IrGraph, @@ -2525,14 +2489,6 @@ class IrGraph(object): core.graph_safe_remove_nodes(self.graph, original_nodes) def resolve_hazard(self): - def _to_node(nodes, node_name): - target_node = None - for n in nodes: - if n.name() == node_name: - target_node = n - assert target_node is not None, "Cannot find the target node in the giving set." - return target_node - ordered_nodes = core.topology_sort(self.graph) var_nodes = dict() for node in ordered_nodes: @@ -2540,16 +2496,17 @@ class IrGraph(object): for each_var_name in node.op().input_arg_names(): if each_var_name not in var_nodes: var_nodes[each_var_name] = [ - _to_node(node.inputs, each_var_name) + self._find_node_by_name(node.inputs, each_var_name) ] for each_var_name in node.op().output_arg_names(): if each_var_name not in var_nodes: var_nodes[each_var_name] = [ - _to_node(node.outputs, each_var_name) + self._find_node_by_name(node.outputs, each_var_name) ] else: var_nodes[each_var_name].append( - _to_node(node.outputs, each_var_name)) + self._find_node_by_name(node.outputs, + each_var_name)) self.graph.resolve_hazard(var_nodes) def has_circle(self): @@ -2662,6 +2619,17 @@ class IrGraph(object): program = Program._construct_from_desc(desc) return program + def _find_node_by_name(self, nodes, node_name): + """ + Find a node in the giving nodes set by the name. + """ + target_node = None + for n in nodes: + if n.name() == node_name: + target_node = n + assert target_node is not None, "Cannot find the target node in the giving set." + return target_node + def _update_desc_attr(self, desc, name, val): """ Update the value of desc's attribute by attribute's name. @@ -3541,22 +3509,22 @@ def _get_var(name, program=None): @signature_safe_contextmanager -def _imperative_guard(tracer): - global _imperative_tracer_ - tmp_trace = _imperative_tracer_ - _imperative_tracer_ = tracer +def _dygraph_guard(tracer): + global _dygraph_tracer_ + tmp_trace = _dygraph_tracer_ + _dygraph_tracer_ = tracer yield - _imperative_tracer_ = tmp_trace + _dygraph_tracer_ = tmp_trace @signature_safe_contextmanager -def _imperative_place_guard(place): - global _imperative_current_expected_place_ - tmp_place = _imperative_current_expected_place_ - _imperative_current_expected_place_ = place +def _dygraph_place_guard(place): + global _dygraph_current_expected_place_ + tmp_place = _dygraph_current_expected_place_ + _dygraph_current_expected_place_ = place yield - _imperative_current_expected_place_ = tmp_place + _dygraph_current_expected_place_ = tmp_place diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 8358bb1aba98d8f5699cbda27e657ba6c470d333..6aff93dceaf5cfd299bdc9f68246ed579f248f3c 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -165,7 +165,7 @@ class ConstantInitializer(Initializer): 'force_cpu': self._force_cpu or force_init_on_cpu() }, stop_gradient=True) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -245,7 +245,7 @@ class UniformInitializer(Initializer): attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -324,7 +324,7 @@ class NormalInitializer(Initializer): outputs={"Out": var}, attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -403,7 +403,7 @@ class TruncatedNormalInitializer(Initializer): outputs={"Out": var}, attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -509,7 +509,7 @@ class XavierInitializer(Initializer): "seed": self._seed }, stop_gradient=True) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -610,7 +610,7 @@ class MSRAInitializer(Initializer): "seed": self._seed }, stop_gradient=True) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -709,7 +709,7 @@ class BilinearInitializer(Initializer): 'shape': list(shape), value_name: values }) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -768,7 +768,7 @@ class NumpyArrayInitializer(Initializer): value_name: values }, stop_gradient=True) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 3569a8bc357daf9408e8ae3eb53ad9d2942cfeaa..3cdd05533f703ac27333daab7ada0c26392a24f5 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -17,7 +17,7 @@ from .param_attr import ParamAttr from .initializer import Constant from . import layers from . import backward -from .imperative import Layer, nn +from .dygraph import Layer, nn from . import executor from . import core diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index a85ef3c13f845959200d26391f6c95923a11c6ed..7eb912645e5077d35a2d11d7d09a033d28345e15 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,7 +17,7 @@ from __future__ import print_function import copy import six -from .framework import Parameter, dtype_is_floating, _in_imperative_mode +from .framework import Parameter, dtype_is_floating, _in_dygraph_mode from . import unique_name from paddle.fluid.initializer import Constant, Xavier from .param_attr import ParamAttr @@ -30,9 +30,9 @@ class LayerHelper(LayerHelperBase): def __init__(self, layer_type, **kwargs): self.kwargs = kwargs name = self.kwargs.get('name', None) - # TODO(panyx0718, minqiyang): imperative mode + # TODO(panyx0718, minqiyang): dygraph mode # can not use both `layer_type` and `name`. Deprecate LayerHelper - # and write a Helper for imperative mode. + # and write a Helper for dygraph mode. if name is None: self.kwargs['name'] = unique_name.generate(layer_type) diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index a68160d797bcaca8cff849c560960d6a8823de53..869a5f54e9cdf5740c5e216917d92880d7d61e2d 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -17,7 +17,7 @@ from __future__ import print_function import copy import numpy as np -from .framework import Variable, default_main_program, default_startup_program, _in_imperative_mode, _current_expected_place +from .framework import Variable, default_main_program, default_startup_program, _in_dygraph_mode, _current_expected_place from . import unique_name from .param_attr import ParamAttr, WeightNormParamAttr from . import core @@ -54,8 +54,8 @@ class LayerHelperBase(object): Return Variable construct from value """ if isinstance(value, np.ndarray): - assert _in_imperative_mode( - ), "to_variable could only be called in imperative mode" + assert _in_dygraph_mode( + ), "to_variable could only be called in dygraph mode" if not block: block = default_main_program().current_block() @@ -302,8 +302,8 @@ class LayerHelperBase(object): param = self._create_weight_normalize(attr, shape, dtype) WeightNormParamAttr.params_with_weight_norm.append(param) return param - if _in_imperative_mode(): - # In imperative mode, we want the returned parameter to be + if _in_dygraph_mode(): + # In dygraph mode, we want the returned parameter to be # initialized so that it can be used imperatively. return self.main_program.global_block().create_parameter( dtype=dtype, @@ -370,7 +370,7 @@ class LayerHelperBase(object): initializer: initializer to use """ assert isinstance(var, Variable) - if _in_imperative_mode(): + if _in_dygraph_mode(): initializer(var, var.block) else: self.startup_program.global_block().create_var( diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f2413f603304f8262476ca3ae2b820c89d009c3d..f02496506c6f0ce37d135625aafaa405c88eb8cb 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -23,8 +23,8 @@ import os import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant, NumpyArrayInitializer -from ..framework import Variable, OpProtoHolder, _in_imperative_mode -from ..imperative import base +from ..framework import Variable, OpProtoHolder, _in_dygraph_mode +from ..dygraph import base from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .tensor import concat, assign @@ -32,7 +32,7 @@ from . import utils from .. import unique_name from functools import reduce from .. import core -from ..imperative import layers +from ..dygraph import layers __all__ = [ 'fc', @@ -296,7 +296,6 @@ def fc(input, data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32") fc = fluid.layers.fc(input=[data_1, data_2], size=1000, act="tanh") """ - helper = LayerHelper("fc", **locals()) dtype = helper.input_dtype() @@ -3279,6 +3278,8 @@ def layer_norm(input, >>> dtype='float32') >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) """ + assert _in_dygraph_mode( + ) is not True, "please use FC instead of fc in dygraph mode!" helper = LayerHelper('layer_norm', **locals()) dtype = helper.input_dtype() @@ -5866,11 +5867,49 @@ def multiplex(inputs, index): """ ${comment} - >>> import paddle.fluid as fluid - >>> x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32') - >>> x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32') - >>> index = fluid.layers.data(name='index', shape=[1], dtype='int32') - >>> out = fluid.layers.multiplex(inputs=[x1, x2], index=index) + For Example: + + .. code-block:: text + + case 1: + + Given: + + X = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]], + [[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]], + [[2,0,3,4], [2,1,7,8], [2,2,4,2], [2,3,3,4]], + [[3,0,3,4], [3,1,7,8], [3,2,4,2], [3,3,3,4]]] + + index = [3,0,1,2] + + out:[[3 0 3 4] // X[3,0] (3 = index[i], 0 = i); i=0 + [0 1 3 4] // X[0,1] (0 = index[i], 1 = i); i=1 + [1 2 4 2] // X[1,2] (0 = index[i], 2 = i); i=2 + [2 3 3 4]] // X[2,3] (0 = index[i], 3 = i); i=3 + + case 2: + + Given: + + X = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]], + [[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]]] + + index = [1,0] + + out:[[1 0 3 4] // X[1,0] (3 = index[0], 0 = i); i=1 + [0 1 3 4] // X[0,1] (0 = index[1], 1 = i); i=2 + [0 2 4 4] // X[0,2] (0 = 0, 2 = i); i=3 + [0 3 3 4]] // X[0,3] (0 = 0, 3 = i); i=4 + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32') + x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32') + index = fluid.layers.data(name='index', shape=[1], dtype='int32') + out = fluid.layers.multiplex(inputs=[x1, x2], index=index) Args: inputs (list): ${x_comment}. @@ -6405,8 +6444,8 @@ def squeeze(input, axes, name=None): x = layers.data(name='x', shape=[5, 1, 10]) y = layers.sequeeze(input=x, axes=[1]) """ - assert not _in_imperative_mode(), ( - "squeeze layer is not supported in imperative mode yet.") + assert not _in_dygraph_mode(), ( + "squeeze layer is not supported in dygraph mode yet.") helper = LayerHelper("squeeze", **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -9144,7 +9183,7 @@ def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) y = helper.kwargs.get('y', None) - if _in_imperative_mode(): + if _in_dygraph_mode(): x = base.to_variable(x) y = base.to_variable(y) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index ef90638c721810e618ce4760e83e1a63b86c2325..80450119f44e93aae4b483983484ea18be5b2035 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -20,7 +20,6 @@ from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable from ..initializer import Constant, force_init_on_cpu from ..core import VarDesc -from ..imperative import base as imperative_base from .layer_function_generator import templatedoc import numpy diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 8fdc7f33ab16df8aca1e9ff0b334aace7b710d62..2e596ef118e0b2cc4696043759dc33f560df79f9 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -30,8 +30,8 @@ from .initializer import Constant from .layer_helper import LayerHelper from .layers import ops from .regularizer import append_regularization_ops -from .imperative import base as imperative_base -from .imperative.learning_rate_scheduler import LearningRateDecay +from .dygraph import base as imperative_base +from .dygraph.learning_rate_scheduler import LearningRateDecay from paddle.fluid import core from paddle.fluid.layers import tensor from functools import reduce @@ -205,7 +205,7 @@ class Optimizer(object): name = self._name + "_" + name if (name in self._accumulators and param.name in self._accumulators[name]): - if framework._in_imperative_mode(): + if framework._in_dygraph_mode(): return self._accumulators[name][param.name] raise Exception("Accumulator {} already exists for parameter {}". format(name, param.name)) @@ -432,11 +432,11 @@ class Optimizer(object): """ self._dtype = loss.dtype optimize_ops = [] - if framework._in_imperative_mode(): + if framework._in_dygraph_mode(): if parameter_list is not None: parameters = parameter_list else: - parameters = framework._imperative_tracer().all_parameters() + parameters = framework._dygraph_tracer().all_parameters() params_grads = [] for param in parameters: diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index b84ce2b3aeab7963f8de85eb09ff6e085e52c198..6b8622b6f26f6102e5ee02716f30a847ed9a2fed 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -262,14 +262,14 @@ class OpTest(unittest.TestCase): if isinstance(value, tuple): data = value[0] lod = value[1] - v = fluid.imperative.base.to_variable(value=data) + v = fluid.dygraph.base.to_variable(value=data) v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod) return v else: - return fluid.imperative.base.to_variable(value) + return fluid.dygraph.base.to_variable(value) - def _calc_imperative_output(self, place, parallel=False, no_check_set=None): - with fluid.imperative.base.guard(place=place): + def _calc_dygraph_output(self, place, parallel=False, no_check_set=None): + with fluid.dygraph.base.guard(place=place): block = fluid.default_main_program().global_block() # prepare input variable @@ -316,7 +316,7 @@ class OpTest(unittest.TestCase): return outputs - def _calc_output(self, place, parallel=False, no_check_set=None): + def _calc_output(self, place, parallel=False, no_check_set=None, loss=None): program = Program() block = program.global_block() self._append_ops(block) @@ -329,8 +329,14 @@ class OpTest(unittest.TestCase): use_cuda = False if isinstance(place, fluid.CUDAPlace(0)): use_cuda = True - executor = fluid.ParallelExecutor( - use_cuda=use_cuda, loss_name=loss.name, main_program=program) + if loss: + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, + loss_name=loss.name, + main_program=program) + else: + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, main_program=program) else: executor = Executor(place) @@ -364,9 +370,9 @@ class OpTest(unittest.TestCase): atol, no_check_set=None, equal_nan=False, - check_imperative=False): - if check_imperative: - imperative_outs = self._calc_imperative_output( + check_dygraph=False): + if check_dygraph: + dygraph_outs = self._calc_dygraph_output( place, no_check_set=no_check_set) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) @@ -393,8 +399,8 @@ class OpTest(unittest.TestCase): type(sub_out)) for item in sub_out: sub_out_name, expect = item[0], item[1] - if check_imperative: - imperative_actual = imperative_outs[sub_out_name][0] + if check_dygraph: + imperative_actual = dygraph_outs[sub_out_name][0] imperative_actual_t = np.array( imperative_actual._ivar.value().get_tensor()) idx = find_actual(sub_out_name, fetch_list) @@ -407,7 +413,7 @@ class OpTest(unittest.TestCase): actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + sub_out_name + ") has diff at " + str(place)) - if check_imperative: + if check_dygraph: self.assertTrue( np.allclose( imperative_actual_t, @@ -415,21 +421,21 @@ class OpTest(unittest.TestCase): atol=atol, equal_nan=equal_nan), "Output (" + sub_out_name + ") has diff at " + - str(place) + " in imperative mode") + str(place) + " in dygraph mode") if isinstance(expect, tuple): self.assertListEqual( actual.recursive_sequence_lengths(), expect[1], "Output (" + sub_out_name + ") has different lod at " + str(place)) - if check_imperative: + if check_dygraph: self.assertListEqual( imperative_actual._ivar.value().get_tensor() .recursive_sequence_lengths(), expect[1], "Output (" + out_name + ") has different lod at " + - str(place) + " in imperative mode") + str(place) + " in dygraph mode") else: - if check_imperative: - imperative_actual = imperative_outs[out_name][0] + if check_dygraph: + imperative_actual = dygraph_outs[out_name][0] imperative_actual_t = np.array( imperative_actual._ivar.value().get_tensor()) idx = find_actual(out_name, fetch_list) @@ -443,7 +449,7 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t) + " in class " + self.__class__.__name__) - if check_imperative: + if check_dygraph: self.assertTrue( np.allclose( imperative_actual_t, @@ -458,12 +464,12 @@ class OpTest(unittest.TestCase): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + ") has different lod at " + str(place)) - if check_imperative: + if check_dygraph: self.assertListEqual( imperative_actual._ivar.value().get_tensor() .recursive_sequence_lengths(), expect[1], "Output (" + out_name + ") has different lod at " + - str(place) + " in imperative mode") + str(place) + " in dygraph mode") def _get_places(self): if self.dtype == np.float16: @@ -490,11 +496,11 @@ class OpTest(unittest.TestCase): atol=1e-5, no_check_set=None, equal_nan=False, - check_imperative=False): + check_dygraph=False): places = self._get_places() for place in places: self.check_output_with_place(place, atol, no_check_set, equal_nan, - check_imperative) + check_dygraph) def check_output_customized(self, checker): places = self._get_places() diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 61fd9af1275865f2d03e759199c219b36d3a0b5b..18ed02a72275437fa6106e57c0383e17647d9700 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -43,6 +43,7 @@ class TestParallelExecutorBase(unittest.TestCase): use_ir_memory_optimize=True, enable_inplace=True, fuse_elewise_add_act_ops=False, + fuse_all_optimizer_ops=False, fuse_all_reduce_ops=False, fuse_relu_depthwise_conv=False, optimizer=fluid.optimizer.Adam, @@ -81,6 +82,7 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize + build_strategy.fuse_all_optimizer_ops = fuse_all_optimizer_ops build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops # python memory optimization is conflict with inplace pass. # Use ir graph memory optimization after inplace pass is the correct way. diff --git a/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py b/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py index 9d5fe114bad2b2bae73cf18e17ebd7af288a91da..29eb0166b771bbea5509de8b7714bc4608a07cd1 100644 --- a/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py +++ b/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py @@ -16,8 +16,10 @@ from __future__ import print_function import unittest import numpy as np - from op_test import OpTest +from paddle.fluid import core + +alignment = 256 class TestAllocContinuousSpace(OpTest): @@ -29,11 +31,11 @@ class TestAllocContinuousSpace(OpTest): self.constant = attrs["constant"] self.set_constant = attrs["set_constant"] self.Inputs = self.init_input() - self.FusedOutput = self.init_output(self.Inputs, self.set_constant, - self.constant) + self.Outputs, self.FusedOutput = self.init_output( + self.Inputs, self.set_constant, self.constant) self.inputs = {'Input': self.Inputs} self.attrs = attrs - self.outputs = {'Output': self.Inputs, 'FusedOutput': self.FusedOutput} + self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput} def init_dtype(self): self.dtype = np.float32 @@ -52,14 +54,31 @@ class TestAllocContinuousSpace(OpTest): return {"copy_data": True, "set_constant": False, "constant": 0.0} def init_output(self, input_list, set_constant, constant): - inputs = [input[1].flatten() for input in input_list] - output = np.concatenate(inputs) + inputs = [] + outputs = input_list + + for input in input_list: + length = len(input[1].flatten()) + aligned_len = (length + alignment) / alignment * alignment + out = np.zeros(int(aligned_len)) + out[0:length] = input[1].flatten() + inputs.append(out) + + alloc_continuous_space_var = np.concatenate([input for input in inputs]) if set_constant: - output = np.ones((len(output))) * constant - return output + alloc_continuous_space_var = np.ones( + (len(alloc_continuous_space_var))) * constant + outputs = [(out[0], + np.ones(out[1].shape).astype(self.dtype) * constant) + for out in outputs] + return outputs, alloc_continuous_space_var def test_check_output(self): - self.check_output() + if core.is_compiled_with_cuda(): + self.check_output_with_place( + place=core.CUDAPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5) class TestAllocContinuousSpace2(TestAllocContinuousSpace): @@ -67,7 +86,11 @@ class TestAllocContinuousSpace2(TestAllocContinuousSpace): return {"copy_data": False, "set_constant": True, "constant": 0.5} def test_check_output(self): - self.check_output(no_check_set=["Output"]) + if core.is_compiled_with_cuda(): + self.check_output_with_place( + place=core.CUDAPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index b12aaea3219cb81e8fa0e7584120db510fb7b62c..9cb88d4a8553f3b750f6cf3b24115b4d188ed1d6 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -18,7 +18,7 @@ import numpy as np import paddle.fluid as fluid -class L1(fluid.imperative.Layer): +class L1(fluid.dygraph.Layer): def __init__(self, prefix): super(L1, self).__init__(prefix) self._param_attr = fluid.ParamAttr( @@ -32,7 +32,7 @@ class L1(fluid.imperative.Layer): return self.w1 + self.w2 -class L2(fluid.imperative.Layer): +class L2(fluid.dygraph.Layer): def __init__(self, prefix): super(L2, self).__init__(prefix) self.layer1 = L1(self.full_name()) @@ -42,7 +42,7 @@ class L2(fluid.imperative.Layer): return self.layer1() + self.layer2() -class L3(fluid.imperative.Layer): +class L3(fluid.dygraph.Layer): def __init__(self, prefix): super(L3, self).__init__(prefix) self.layer1 = L2(self.full_name()) @@ -54,7 +54,7 @@ class L3(fluid.imperative.Layer): class TestBaseLayer(unittest.TestCase): def test_one_level(self): - with fluid.imperative.guard(): + with fluid.dygraph.guard(): l = L1('test_one_level') ret = l() self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0") @@ -62,7 +62,7 @@ class TestBaseLayer(unittest.TestCase): self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2]))) def test_three_level(self): - with fluid.imperative.guard(): + with fluid.dygraph.guard(): l = L3('test_three_level') names = [p.name for p in l.parameters()] ret = l() diff --git a/python/paddle/fluid/tests/unittests/test_fsp_op.py b/python/paddle/fluid/tests/unittests/test_fsp_op.py index 6ad7418447b4bac5e6a6034f94540091590fa189..01991f4d36caf83173452c6a032c37852fa35586 100644 --- a/python/paddle/fluid/tests/unittests/test_fsp_op.py +++ b/python/paddle/fluid/tests/unittests/test_fsp_op.py @@ -39,19 +39,21 @@ class TestFSPOp(OpTest): self.op_type = "fsp" self.initTestCase() - feature_map_0 = np.random.uniform(0, 10, self.a_shape).astype('float32') - feature_map_1 = np.random.uniform(0, 10, self.b_shape).astype('float32') + feature_map_0 = np.random.uniform(0, 10, self.a_shape).astype('float64') + feature_map_1 = np.random.uniform(0, 10, self.b_shape).astype('float64') self.inputs = {'X': feature_map_0, 'Y': feature_map_1} self.outputs = {'Out': fsp_matrix(feature_map_0, feature_map_1)} def initTestCase(self): - self.a_shape = (2, 16, 32, 31) - self.b_shape = (2, 28, 32, 31) + self.a_shape = (2, 3, 5, 6) + self.b_shape = (2, 4, 5, 6) + @unittest.skip("Disable temporarily.") def test_check_output(self): self.check_output() + @unittest.skip("Disable temporarily.") def test_check_grad_normal(self): self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..93e67deaf3c9f7fe17296049137fbbe00374c6f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py @@ -0,0 +1,135 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from parallel_executor_test_base import TestParallelExecutorBase +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import paddle +import paddle.dataset.mnist as mnist +import unittest +import os + + +def simple_fc_net(use_feed): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + hidden = img + for _ in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='relu', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def fc_with_batchnorm(use_feed): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + hidden = img + for _ in range(2): + hidden = fluid.layers.fc( + hidden, + size=200, + act='relu', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + hidden = fluid.layers.batch_norm(input=hidden) + + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestFuseAdamOps(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + + def _init_data(self, random=True): + np.random.seed(5) + if random: + img = np.random.random(size=[32, 784]).astype(np.float32) + else: + img = np.ones(shape=[32, 784], dtype='float32') + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + def _compare_fused_optimizer_ops(self, + model, + use_cuda, + random_data=True, + optimizer=fluid.optimizer.Adam): + if use_cuda and not core.is_compiled_with_cuda(): + return + img, label = self._init_data(random_data) + not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + fuse_all_optimizer_ops=False, + memory_opt=False, # avoid the gradient's name changed in Python side. + optimizer=optimizer) + fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + fuse_all_optimizer_ops=True, + memory_opt=False, # avoid the gradient's name changed in Python side. + optimizer=optimizer) + + for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + + def test_simple_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops(simple_fc_net, True) + self._compare_fused_optimizer_ops(simple_fc_net, False) + + def test_batchnorm_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops(fc_with_batchnorm, True) + # self._compare_fused_optimizer_ops(fc_with_batchnorm, False) + + +class TestFuseSGDOps(TestFuseAdamOps): + def sgd_optimizer(self, learning_rate=1e-4): + return fluid.optimizer.SGD(learning_rate=learning_rate) + + def test_simple_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops( + simple_fc_net, True, optimizer=self.sgd_optimizer) + self._compare_fused_optimizer_ops( + simple_fc_net, False, optimizer=self.sgd_optimizer) + + def test_batchnorm_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops( + fc_with_batchnorm, True, optimizer=self.sgd_optimizer) + self._compare_fused_optimizer_ops( + fc_with_batchnorm, False, optimizer=self.sgd_optimizer) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py index 848c9a4952aebcf93fd7bf12f7bc4cd15c7a8b28..c66d59aceb05dfbf9beac809ff13841a77953695 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_op.py @@ -156,7 +156,7 @@ class TestGRUOp(OpTest): } def test_check_output(self): - self.check_output(atol=1e-8, check_imperative=True) + self.check_output(atol=1e-8, check_dygraph=True) def test_check_grad(self): self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 4c44195a3d42a1a2a4a072b0513f212b22269c31..13f2d662178c7e1474ec43fdeadf7046516eb8e5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -18,11 +18,11 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.imperative.nn import FC +from paddle.fluid.dygraph.nn import FC from test_imperative_base import new_program_scope -class MyLayer(fluid.imperative.Layer): +class MyLayer(fluid.dygraph.Layer): def __init__(self, name_scope): super(MyLayer, self).__init__(name_scope) @@ -34,7 +34,7 @@ class MyLayer(fluid.imperative.Layer): return [x] -class MyPyLayer(fluid.imperative.PyLayer): +class MyPyLayer(fluid.dygraph.PyLayer): def __init__(self): super(MyPyLayer, self).__init__() @@ -48,7 +48,7 @@ class MyPyLayer(fluid.imperative.PyLayer): return np.array(dout) * (1 - np.square(np.array(out))) -class MLP(fluid.imperative.Layer): +class MLP(fluid.dygraph.Layer): def __init__(self, name_scope): super(MLP, self).__init__(name_scope) self._fc1 = FC(self.full_name(), @@ -71,7 +71,7 @@ class MLP(fluid.imperative.Layer): return x -class SimpleRNNCell(fluid.imperative.Layer): +class SimpleRNNCell(fluid.dygraph.Layer): def __init__(self, name_scope, step_input_size, hidden_size, output_size, param_attr): super(SimpleRNNCell, self).__init__(name_scope) @@ -159,7 +159,7 @@ class SimpleRNNCell(fluid.imperative.Layer): return reduce_out, hidden -class SimpleRNN(fluid.imperative.Layer): +class SimpleRNN(fluid.dygraph.Layer): def __init__(self, name_scope): super(SimpleRNN, self).__init__(name_scope) self.seq_len = 4 @@ -194,10 +194,10 @@ class SimpleRNN(fluid.imperative.Layer): class TestImperative(unittest.TestCase): def test_sum_op(self): x = np.ones([2, 2], np.float32) - with fluid.imperative.guard(): + with fluid.dygraph.guard(): inputs = [] for _ in range(10): - inputs.append(fluid.imperative.base.to_variable(x)) + inputs.append(fluid.dygraph.base.to_variable(x)) ret = fluid.layers.sums(inputs) loss = fluid.layers.reduce_sum(ret) loss._backward() @@ -205,17 +205,17 @@ class TestImperative(unittest.TestCase): self.assertTrue(np.allclose(inputs[0]._gradient(), x)) def test_layer(self): - with fluid.imperative.guard(): + with fluid.dygraph.guard(): cl = core.Layer() cl.forward([]) - l = fluid.imperative.Layer("l") + l = fluid.dygraph.Layer("l") self.assertRaises(NotImplementedError, l.forward, []) def test_pylayer_func_id(self): - with fluid.imperative.guard(): + with fluid.dygraph.guard(): - class PyLayer1(fluid.imperative.PyLayer): + class PyLayer1(fluid.dygraph.PyLayer): def __init__(self): super(PyLayer1, self).__init__() @@ -227,7 +227,7 @@ class TestImperative(unittest.TestCase): def backward(input): return input - class PyLayer2(fluid.imperative.PyLayer): + class PyLayer2(fluid.dygraph.PyLayer): def __init__(self): super(PyLayer2, self).__init__() @@ -241,21 +241,21 @@ class TestImperative(unittest.TestCase): py_layer_1 = PyLayer1() py_layer_2 = PyLayer2() - py_layer_1(fluid.imperative.base.to_variable(np.ones([2, 2]))) - py_layer_2(fluid.imperative.base.to_variable(np.ones([2, 2]))) + py_layer_1(fluid.dygraph.base.to_variable(np.ones([2, 2]))) + py_layer_2(fluid.dygraph.base.to_variable(np.ones([2, 2]))) id = py_layer_1.forward_id self.assertGreater(id, 0) self.assertEqual(py_layer_1.backward_id, id + 1) self.assertEqual(py_layer_2.forward_id, id + 2) self.assertEqual(py_layer_2.backward_id, id + 3) - py_layer_1(fluid.imperative.base.to_variable(np.ones([2, 2]))) + py_layer_1(fluid.dygraph.base.to_variable(np.ones([2, 2]))) self.assertEqual(py_layer_1.forward_id, id) def test_pylayer(self): np_inp = np.ones([2, 2], np.float32) - with fluid.imperative.guard(): + with fluid.dygraph.guard(): my_py_layer = MyPyLayer() - var_inp = fluid.imperative.base.to_variable(np_inp) + var_inp = fluid.dygraph.base.to_variable(np_inp) outs = my_py_layer(var_inp) dy_out = np.sum(outs[0]._numpy()) outs[0]._backward() @@ -282,8 +282,8 @@ class TestImperative(unittest.TestCase): def test_layer_in_out(self): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) - with fluid.imperative.guard(): - var_inp = fluid.imperative.base.to_variable(np_inp) + with fluid.dygraph.guard(): + var_inp = fluid.dygraph.base.to_variable(np_inp) l = MyLayer("my_layer") x = l(var_inp)[0] self.assertIsNotNone(x) @@ -310,8 +310,8 @@ class TestImperative(unittest.TestCase): def test_mlp(self): np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - with fluid.imperative.guard(): - var_inp = fluid.imperative.base.to_variable(np_inp) + with fluid.dygraph.guard(): + var_inp = fluid.dygraph.base.to_variable(np_inp) mlp = MLP("mlp") out = mlp(var_inp) dy_out = out._numpy() @@ -353,8 +353,8 @@ class TestImperative(unittest.TestCase): [10.0, 11.0, 12.0]]) np_inp = np_inp.reshape((1, 4, 3)) np_inp = np_inp.astype(np.float32) - with fluid.imperative.guard(): - var_inp = fluid.imperative.base.to_variable(np_inp) + with fluid.dygraph.guard(): + var_inp = fluid.dygraph.base.to_variable(np_inp) var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3]) simple_rnn = SimpleRNN("simple_rnn") outs, pre_hiddens = simple_rnn.forward(var_inp) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py index 62c25f734598e35b7c668d1ec1b89b5c57449f73..a92b7d62fa598a3ec9b53bade2805cc033f4b9d9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py @@ -18,11 +18,11 @@ import numpy as np import paddle import paddle.fluid as fluid from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC +from paddle.fluid.dygraph.base import to_variable -class SimpleImgConvPool(fluid.imperative.Layer): +class SimpleImgConvPool(fluid.dygraph.Layer): def __init__(self, name_scope, num_channels, @@ -71,7 +71,7 @@ class SimpleImgConvPool(fluid.imperative.Layer): return x -class MNIST(fluid.imperative.Layer): +class MNIST(fluid.dygraph.Layer): def __init__(self, name_scope): super(MNIST, self).__init__(name_scope) @@ -98,12 +98,12 @@ class MNIST(fluid.imperative.Layer): return x -class TestImperativeCheckpoint(unittest.TestCase): +class TestDygraphCheckpoint(unittest.TestCase): def save_load_persistables(self): seed = 90 epoch_num = 1 - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed @@ -135,14 +135,14 @@ class TestImperativeCheckpoint(unittest.TestCase): avg_loss._backward() sgd.minimize(avg_loss) - fluid.imperative.save_persistables(mnist, "save_dir") + fluid.dygraph.save_persistables(mnist, "save_dir") mnist.clear_gradients() for param in mnist.parameters(): dy_param_init_value[param.name] = param._numpy() mnist.load_dict( - fluid.imperative.load_persistables(mnist, "save_dir")) + fluid.dygraph.load_persistables(mnist, "save_dir")) restore = mnist.parameters() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index ac123ee8db26ac23bbf9454e399a592a28c91c32..ccebd4a54727f383bd4e46ff57bfdc9381577d05 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -22,7 +22,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core from test_imperative_base import new_program_scope -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.base import to_variable # Can use Amusic dataset as the DeepCF describes. DATA_PATH = os.environ.get('DATA_PATH', '') @@ -32,11 +32,11 @@ NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5)) NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1)) -class DMF(fluid.imperative.Layer): +class DMF(fluid.dygraph.Layer): def __init__(self, name_scope): super(DMF, self).__init__(name_scope) - self._user_latent = fluid.imperative.FC(self.full_name(), 256) - self._item_latent = fluid.imperative.FC(self.full_name(), 256) + self._user_latent = fluid.dygraph.FC(self.full_name(), 256) + self._item_latent = fluid.dygraph.FC(self.full_name(), 256) self._user_layers = [] self._item_layers = [] @@ -45,12 +45,12 @@ class DMF(fluid.imperative.Layer): self._user_layers.append( self.add_sublayer( 'user_layer_%d' % i, - fluid.imperative.FC( + fluid.dygraph.FC( self.full_name(), self._hid_sizes[i], act='relu'))) self._item_layers.append( self.add_sublayer( 'item_layer_%d' % i, - fluid.imperative.FC( + fluid.dygraph.FC( self.full_name(), self._hid_sizes[i], act='relu'))) def forward(self, users, items): @@ -63,18 +63,18 @@ class DMF(fluid.imperative.Layer): return fluid.layers.elementwise_mul(users, items) -class MLP(fluid.imperative.Layer): +class MLP(fluid.dygraph.Layer): def __init__(self, name_scope): super(MLP, self).__init__(name_scope) - self._user_latent = fluid.imperative.FC(self.full_name(), 256) - self._item_latent = fluid.imperative.FC(self.full_name(), 256) + self._user_latent = fluid.dygraph.FC(self.full_name(), 256) + self._item_latent = fluid.dygraph.FC(self.full_name(), 256) self._match_layers = [] self._hid_sizes = [128, 64] for i in range(len(self._hid_sizes)): self._match_layers.append( self.add_sublayer( 'match_layer_%d' % i, - fluid.imperative.FC( + fluid.dygraph.FC( self.full_name(), self._hid_sizes[i], act='relu'))) self._mat @@ -88,7 +88,7 @@ class MLP(fluid.imperative.Layer): return match_vec -class DeepCF(fluid.imperative.Layer): +class DeepCF(fluid.dygraph.Layer): def __init__(self, name_scope, num_users, num_items, matrix): super(DeepCF, self).__init__(name_scope) self._num_users = num_users @@ -103,7 +103,7 @@ class DeepCF(fluid.imperative.Layer): self._mlp = MLP(self.full_name()) self._dmf = DMF(self.full_name()) - self._match_fc = fluid.imperative.FC(self.full_name(), 1, act='sigmoid') + self._match_fc = fluid.dygraph.FC(self.full_name(), 1, act='sigmoid') def forward(self, users, items): # users_emb = self._user_emb(users) @@ -191,7 +191,7 @@ def load_data(DATA_PATH): np.expand_dims(labels_np, -1), num_users, num_items, matrix -class TestImperativeDeepCF(unittest.TestCase): +class TestDygraphDeepCF(unittest.TestCase): def test_deefcf(self): seed = 90 if DATA_PATH: @@ -237,7 +237,7 @@ class TestImperativeDeepCF(unittest.TestCase): fetch_list=[loss])[0] sys.stderr.write('static loss %s\n' % static_loss) - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index 6024fb5f816d10cedad36272e353704797526676..58faa1cb85af9cedb70f3a12244cfeb44e0f4f52 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -22,12 +22,12 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC from test_imperative_base import new_program_scope -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.base import to_variable -class Discriminator(fluid.imperative.Layer): +class Discriminator(fluid.dygraph.Layer): def __init__(self, name_scope): super(Discriminator, self).__init__(name_scope) self._fc1 = FC(self.full_name(), size=32, act='elu') @@ -38,7 +38,7 @@ class Discriminator(fluid.imperative.Layer): return self._fc2(x) -class Generator(fluid.imperative.Layer): +class Generator(fluid.dygraph.Layer): def __init__(self, name_scope): super(Generator, self).__init__(name_scope) self._fc1 = FC(self.full_name(), size=64, act='elu') @@ -51,7 +51,7 @@ class Generator(fluid.imperative.Layer): return self._fc3(x) -class TestImperativeGAN(unittest.TestCase): +class TestDygraphGAN(unittest.TestCase): def test_gan_float32(self): seed = 90 @@ -130,7 +130,7 @@ class TestImperativeGAN(unittest.TestCase): scope.find_var(param.name).get_tensor()) dy_params = dict() - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index 2086fab5c81e241d1a49386d8285289b14364dc8..a8fb9ecfe4be16b73ac2144259f25ed3859ece7e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -22,16 +22,16 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC from test_imperative_base import new_program_scope -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.base import to_variable def gen_data(): pass -class GraphConv(fluid.imperative.Layer): +class GraphConv(fluid.dygraph.Layer): def __init__(self, name_scope, in_features, out_features): super(GraphConv, self).__init__(name_scope) @@ -50,7 +50,7 @@ class GraphConv(fluid.imperative.Layer): return fluid.layers.matmul(adj, support) + self.bias -class GCN(fluid.imperative.Layer): +class GCN(fluid.dygraph.Layer): def __init__(self, name_scope, num_hidden): super(GCN, self).__init__(name_scope) self.gc = GraphConv(self.full_name(), num_hidden, 32) @@ -61,7 +61,7 @@ class GCN(fluid.imperative.Layer): return self.gc2(x, adj) -class TestImperativeGNN(unittest.TestCase): +class TestDygraphGNN(unittest.TestCase): def test_gnn_float32(self): seed = 90 @@ -115,7 +115,7 @@ class TestImperativeGNN(unittest.TestCase): static_weight = np.array( scope.find_var(model.gc.weight.name).get_tensor()) - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index ef34b998d1455c92850df77c7fd4d85b8a4fddaf..8b659a3e08e381dd6f55b666d9f5f1b172a51930 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -23,12 +23,12 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.optimizer import SGDOptimizer, Adam -from paddle.fluid.imperative.nn import FC -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.nn import FC +from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope -class MLP(fluid.imperative.Layer): +class MLP(fluid.dygraph.Layer): def __init__(self, name_scope, param_attr=None, bias_attr=None): super(MLP, self).__init__(name_scope) @@ -50,7 +50,7 @@ class TestImperativeOptimizerBase(unittest.TestCase): def _check_mlp(self): seed = 90 - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 460ba65a48c863315cda4847aee1b4e2366bba96..998c675815ece9236c819bffc4a4b74d44ff790e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -16,17 +16,17 @@ from __future__ import print_function import unittest import paddle.fluid as fluid -from paddle.fluid.imperative.nn import Embedding +from paddle.fluid.dygraph.nn import Embedding import paddle.fluid.framework as framework from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope import numpy as np import six from paddle.fluid.backward import append_backward -class SimpleLSTMRNN(fluid.imperative.Layer): +class SimpleLSTMRNN(fluid.dygraph.Layer): def __init__(self, name_scope, hidden_size, @@ -131,7 +131,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer): return real_res, last_hidden, last_cell -class PtbModel(fluid.imperative.Layer): +class PtbModel(fluid.dygraph.Layer): def __init__(self, name_scope, hidden_size, @@ -214,7 +214,7 @@ class PtbModel(fluid.imperative.Layer): return loss, last_hidden, last_cell -class TestImperativePtbRnn(unittest.TestCase): +class TestDygraphPtbRnn(unittest.TestCase): def test_ptb_rnn_cpu_float32(self): seed = 90 hidden_size = 10 @@ -224,7 +224,7 @@ class TestImperativePtbRnn(unittest.TestCase): init_scale = 0.1 batch_size = 4 - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed # TODO: marsyang1993 Change seed to diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index ab9298890bf69774fd842ec202d833be0a57f7ad..1d786d584632769e4318bcdeb24ef7ef8ea18597 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -21,8 +21,8 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC +from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope batch_size = 8 @@ -57,7 +57,7 @@ def optimizer_setting(params): lr = [] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] optimizer = fluid.optimizer.SGD(learning_rate=0.01) - # TODO(minqiyang): Add learning rate scheduler support to imperative mode + # TODO(minqiyang): Add learning rate scheduler support to dygraph mode # optimizer = fluid.optimizer.Momentum( # learning_rate=params["lr"], # learning_rate=fluid.layers.piecewise_decay( @@ -68,7 +68,7 @@ def optimizer_setting(params): return optimizer -class ConvBNLayer(fluid.imperative.Layer): +class ConvBNLayer(fluid.dygraph.Layer): def __init__(self, name_scope, num_channels, @@ -99,7 +99,7 @@ class ConvBNLayer(fluid.imperative.Layer): return y -class BottleneckBlock(fluid.imperative.Layer): +class BottleneckBlock(fluid.dygraph.Layer): def __init__(self, name_scope, num_channels, @@ -156,7 +156,7 @@ class BottleneckBlock(fluid.imperative.Layer): return layer_helper.append_activation(y) -class ResNet(fluid.imperative.Layer): +class ResNet(fluid.dygraph.Layer): def __init__(self, name_scope, layers=50, class_dim=102): super(ResNet, self).__init__(name_scope) @@ -226,13 +226,13 @@ class ResNet(fluid.imperative.Layer): return y -class TestImperativeResnet(unittest.TestCase): +class TestDygraphResnet(unittest.TestCase): def test_resnet_float32(self): seed = 90 batch_size = train_parameters["batch_size"] batch_num = 20 - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py index b06d3e8894072943b06456340f928cda260763c3..3bdf3349730b0c9916449cfe0658d5a3c88834ed 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import paddle.fluid as fluid -from paddle.fluid.imperative import Embedding, LayerNorm, FC, to_variable, Layer, guard +from paddle.fluid.dygraph import Embedding, LayerNorm, FC, to_variable, Layer, guard from test_imperative_base import new_program_scope from paddle.fluid import core import numpy as np @@ -623,7 +623,7 @@ class PrepareEncoderDecoderLayer(Layer): initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), trainable=False)) - # use in imperative_mode to fit different length batch + # use in dygraph_mode to fit different length batch # self._pos_emb._w = to_variable( # position_encoding_init(self._src_max_len, self._src_emb_dim)) @@ -946,7 +946,7 @@ class TransFormer(Layer): return sum_cost, avg_cost, predict, token_num -class TestImperativeTransformer(unittest.TestCase): +class TestDygraphTransformer(unittest.TestCase): def test_transformer_float32(self): seed = 90 with guard(): diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 7fd9617cc7687a5a553ed22cfed560aef8058496..90487d4ef22cd47c5e503bebf40c7ac8adfd83e1 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -29,8 +29,8 @@ from paddle.fluid import core from paddle.fluid.initializer import Constant import paddle.fluid.layers as layers from test_imperative_base import new_program_scope -from paddle.fluid.imperative import nn -from paddle.fluid.imperative import base +from paddle.fluid.dygraph import nn +from paddle.fluid.dygraph import base class LayerTest(unittest.TestCase): @@ -68,7 +68,7 @@ class LayerTest(unittest.TestCase): @contextlib.contextmanager def dynamic_graph(self, force_to_use_cpu=False): - with fluid.imperative.guard( + with fluid.dygraph.guard( self._get_place(force_to_use_cpu=force_to_use_cpu)): fluid.default_startup_program().random_seed = self.seed fluid.default_main_program().random_seed = self.seed diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index ba63213a410b8b2579b6842c5a6ecd720c7957b3..6671a2def3cccd2acd76025e73486b06b4bb1471 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -61,6 +61,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, param_attr=fluid.ParamAttr( name=embedding_name, trainable=False)) for x in word_input ] + # TODO(zcd): if the parameter is not trainable, the + # parameter's gradient should not generated. + for emb_layer in emb_layers: + emb_layer.stop_gradient = True + emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) @@ -113,60 +118,62 @@ class TestCRFModel(unittest.TestCase): os.environ['CPU_NUM'] = str(4) main = fluid.Program() startup = fluid.Program() - with fluid.program_guard(main, startup): - word = fluid.layers.data( - name='word_data', shape=[1], dtype='int64', lod_level=1) - predicate = fluid.layers.data( - name='verb_data', shape=[1], dtype='int64', lod_level=1) - ctx_n2 = fluid.layers.data( - name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) - ctx_n1 = fluid.layers.data( - name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) - ctx_0 = fluid.layers.data( - name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) - ctx_p1 = fluid.layers.data( - name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) - ctx_p2 = fluid.layers.data( - name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - mark = fluid.layers.data( - name='mark_data', shape=[1], dtype='int64', lod_level=1) - - feature_out = db_lstm(**locals()) - target = fluid.layers.data( - name='target', shape=[1], dtype='int64', lod_level=1) - crf_cost = fluid.layers.linear_chain_crf( - input=feature_out, - label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=1e-1)) - avg_cost = fluid.layers.mean(crf_cost) - - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( - learning_rate=0.01, - decay_steps=100000, - decay_rate=0.5, - staircase=True)) - sgd_optimizer.minimize(avg_cost) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), - batch_size=16) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup) - - train_cp = compiler.CompiledProgram(main).with_data_parallel( - loss_name=avg_cost.name, build_strategy=build_strategy) - - feeder = fluid.DataFeeder( - feed_list=[ - word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, - mark, target - ], - place=fluid.CPUPlace()) + scope = fluid.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(main, startup): + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + + feature_out = db_lstm(**locals()) + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) + crf_cost = fluid.layers.linear_chain_crf( + input=feature_out, + label=target, + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=1e-1)) + avg_cost = fluid.layers.mean(crf_cost) + + sgd_optimizer = fluid.optimizer.SGD( + learning_rate=fluid.layers.exponential_decay( + learning_rate=0.01, + decay_steps=100000, + decay_rate=0.5, + staircase=True)) + sgd_optimizer.minimize(avg_cost) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.conll05.test(), buf_size=8192), + batch_size=16) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(startup) + + train_cp = compiler.CompiledProgram(main).with_data_parallel( + loss_name=avg_cost.name, build_strategy=build_strategy) + + feeder = fluid.DataFeeder( + feed_list=[ + word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, + mark, target + ], + place=fluid.CPUPlace()) data = train_data() for i in range(10): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py index 17f8f5a0b4f753aabe8af3f97c2018cd2cf54dc1..d0eca7d6dfbdf03828125508c798a9bd31f8bbd6 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py @@ -41,14 +41,15 @@ class TestBase(unittest.TestCase): fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()) exe.run(startup_prog) - for _ in six.moves.xrange(iter): - exe_strategy = fluid.ExecutionStrategy() - exe_strategy._dry_run = True - exe_strategy.use_experimental_executor = use_experimental_executor - train_cp = compiler.CompiledProgram(main_prog).with_data_parallel( - loss_name=loss.name, exec_strategy=exe_strategy) - for _ in six.moves.xrange(iter_per_pe): - exe.run(train_cp) + exe_strategy = fluid.ExecutionStrategy() + exe_strategy._dry_run = True + exe_strategy.use_experimental_executor = use_experimental_executor + train_cp = compiler.CompiledProgram( + main_prog).with_data_parallel( + loss_name=loss.name, exec_strategy=exe_strategy) + for _ in six.moves.xrange(iter): + for _ in six.moves.xrange(iter_per_pe): + exe.run(train_cp) class TestMNISTDryRun(TestBase): diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py index 076ee3baf96ab3c16f3ed9a3b9a15e2eb2aaed77..601da5839015efd81ea302e1cae65ba3c7bb22fc 100644 --- a/python/paddle/fluid/tests/unittests/test_variable.py +++ b/python/paddle/fluid/tests/unittests/test_variable.py @@ -19,7 +19,6 @@ from paddle.fluid.framework import default_main_program, Program, convert_np_dty import paddle.fluid as fluid import paddle.fluid.core as core import numpy as np -from test_imperative_base import new_program_scope class TestVariable(unittest.TestCase): @@ -153,7 +152,7 @@ class TestVariableImperative(unittest.TestCase): self.assertEqual([1, 1, 100], nw.shape) def test_slice(self): - with fluid.imperative.guard(): + with fluid.dygraph.guard(): self._test_slice() diff --git a/python/setup.py.in b/python/setup.py.in index 9f87f5644fc969f3f55fd08689f3e2bbaf36dc39..68f96273a23c725d1643e8e7397bc970411dd191 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -102,7 +102,7 @@ packages=['paddle', 'paddle.reader', 'paddle.distributed', 'paddle.fluid', - 'paddle.fluid.imperative', + 'paddle.fluid.dygraph', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.distributed', diff --git a/tools/print_signatures.py b/tools/print_signatures.py index d32b247342cc0c37b7bcff7b676cb47a4f429dfd..6a262529b5cac7e596e65d23de6cc4b5d720cacb 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -28,7 +28,7 @@ import hashlib member_dict = collections.OrderedDict() -experimental_namespace = {"paddle.fluid.imperative"} +experimental_namespace = {"paddle.fluid.dygraph"} def md5(doc):