diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e6f5cb7473cdac95afabef8b133131ad71867f7b..79277a4174b7a14d53465fe27512d2130a7430a3 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -134,7 +134,7 @@ paddle.fluid.layers.sampled_softmax_with_cross_entropy (ArgSpec(args=['logits', paddle.fluid.layers.hsigmoid (ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False)), ('document', '80641ee6810b1cdc3fd6e14fc89ecc9d')) paddle.fluid.layers.beam_search (ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'is_accumulated', 'name', 'return_parent_idx'], varargs=None, keywords=None, defaults=(0, True, None, False)), ('document', 'b350b9a30a18e7efd7e1bb740eef6996')) paddle.fluid.layers.row_conv (ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)), ('document', '17485788fffe4e2d36dc58c2ac8d174e')) -paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '013795af319e2e86d3506741941078ee')) +paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None), ('document', '2c4d1ae83da6ed35e3b36ba1b3b51d23')) paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', 'de6a906950bae9f3c245cb744d22b94e')) paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '419c3a24a83cc89219a029cf4092788b')) paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '3f536aafba30d793287b52d231baff1b')) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 4d54754cec00dc435000138d4f297af243813fc3..af4d375e314277fa1f0239bf031a39c3d47eace1 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -195,8 +195,7 @@ cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) -cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS op_registry proto_desc op_info memory_optimize_helper) - +cc_test(inplace_op_inference_test SRCS inplace_op_inference_test.cc DEPS inplace_op_pass op_registry proto_desc op_info memory_optimize_helper pass_builder) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 10aa7a59422f4508dda8d0bcd960583056e25938..72c50518af08b9c1b2f97e6864e5836e806c77fc 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -134,6 +134,11 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, out_layout = out_layout 
== DataLayout::kAnyLayout ? DataLayout::kNCHW : out_layout; + auto& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = dynamic_cast( + pool.Get(expected_kernel_type.place_)); + auto& cpu_engine = dev_ctx->GetEngine(); + std::vector in_tz = paddle::framework::vectorize2int(in.dims()); std::vector out_tz = in_tz; @@ -142,25 +147,29 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, "Input tensor type is not supported: %s", in.type()); memory::data_type out_type = in_type; + auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); + auto out_format = + platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); + // output tensor has the same dims as input. Reorder don't change dims out->Resize(in.dims()); - // tempory mem pd fr out , to make reorder - auto out_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(out->dims()), - mkldnn::memory::format::blocked, out_type); - if (in.get_mkldnn_prim_desc() != out_mem_pd) { + if (in_format != out_format) { void* in_data = GetDataFromTensor(in, in_type); auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - auto in_memory = memory(in.get_mkldnn_prim_desc(), in_data); - auto out_memory = memory(out_mem_pd, out_data); + auto in_memory = + memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); + auto out_memory = + memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); platform::Reorder(in_memory, out_memory); } else { out->ShareDataWith(in); } out->set_layout(out_layout); + // reset format since the out tensor will be feed to non-MKLDNN OPkernel + out->set_format(memory::format::format_undef); #endif } diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index f0203edf05635452bf347335066dadc24ecc3138..82872224501709080ff02a13464d58543a0abda8 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -51,31 +51,13 @@ void TransformData(const OpKernelType &expected_kernel_type, #ifdef PADDLE_WITH_MKLDNN // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. 
No real transform occur + + auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), + ToMKLDNNFormat(lin)); + out.ShareDataWith(input_tensor); - // TODO(jczaja): Remove that once all mkldnn ops - // are modified to work with mkldnn_blocked - auto mkldnn_fmt = [&](int rank) { - switch (rank) { - case 5: - return mkldnn::memory::format::ncdhw; - case 4: - return mkldnn::memory::format::nchw; - case 3: - return mkldnn::memory::format::ncw; - case 2: - return mkldnn::memory::format::nc; - case 1: - return mkldnn::memory::format::x; - default: - return mkldnn::memory::format::blocked; - } - }; - - auto out_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(out.dims()), - mkldnn_fmt(out.dims().size())); - - out.set_mkldnn_prim_desc(out_mem_pd); + out.set_layout(DataLayout::kMKLDNN); + out.set_format(out_format); #endif } else { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 046ec6978a84fa6eba11513860523de5a63a31d8..d4939779a2401c9828e0478f5f5de780907c767e 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -10,7 +10,10 @@ cc_library(fetch_barrier_op_handle SRCS fetch_barrier_op_handle.cc DEPS framewor cc_library(multi_devices_helper SRCS multi_devices_helper.cc DEPS graph graph_helper) cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) + cc_library(alloc_continuous_space_for_grad_pass SRCS alloc_continuous_space_for_grad_pass.cc DEPS graph graph_helper) +cc_library(fuse_adam_op_pass SRCS fuse_adam_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper) +cc_library(fuse_sgd_op_pass SRCS fuse_sgd_op_pass.cc fuse_optimizer_op_pass.cc DEPS graph graph_helper) cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) @@ -104,5 +107,7 @@ cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass fuse_elewise_add_act_pass multi_batch_merge_pass - fuse_relu_depthwise_conv_pass - memory_optimize_pass lock_free_optimize_pass alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass) + fuse_relu_depthwise_conv_pass + memory_optimize_pass lock_free_optimize_pass + alloc_continuous_space_for_grad_pass fuse_all_reduce_op_pass + fuse_adam_op_pass fuse_sgd_op_pass) diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc index e195e93fb842ef71d4849aa2995788a91dafe924..8e8258ffb124e5008954a455264f5c0bc5cabc37 100644 --- a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc +++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/op_registry.h" + DEFINE_uint32(fuse_parameter_memory_size, 0, // 0 KB "fuse_parameter_memory_size is up limited memory size " "of one group parameters' gradient which is the input " @@ -105,20 +106,29 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { auto ele_dtype = iter->second->Var()->GetDataType(); if (dtype == kDefaultDtype) { dtype = 
ele_dtype; - PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype); + PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype, + "The data type should not be bool."); } - PADDLE_ENFORCE_EQ(ele_dtype, dtype); + PADDLE_ENFORCE_EQ(ele_dtype, dtype, + "The data type of input is not consistent."); } - // Create the fused variable name. + // Create a FusedVarsSet to avoid duplicating names for fused_var in other + // pass. if (!result.Has(kFusedVars)) { result.Set(kFusedVars, new FusedVars); } - const std::string prefix(kFusedVarNamePrefix); - // The fused_var_name should be unique. - auto fused_var_name = prefix + "GRAD@" + params_grads[0].second; + // the kFusedGrads is used be fuse_optimizer_op_pass. + result.Set(kFusedGrads, new FusedGrads); + + // the fused_var_name should be unique, so it appends + // params_grads.begin()->second. + auto fused_var_name = std::string(kFusedVarNamePrefix) + "@GRAD@" + + params_grads.begin()->second; + result.Get(kFusedGrads) = fused_var_name; auto &fused_var_set = result.Get(kFusedVars); - PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); + PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0, + "%s is duplicate in FusedVars.", fused_var_name); fused_var_set.insert(fused_var_name); InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars, @@ -295,17 +305,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { return type == proto::VarType::LOD_TENSOR; } - void AppendAllocSpaceForVarsOp(const std::vector ¶ms_name, - const std::vector &grads_name, - const std::string &fused_var_name, - BlockDesc *global_block) const { - auto op_desc = global_block->AppendOp(); - op_desc->SetType("alloc_continuous_space"); - op_desc->SetInput("Input", params_name); - op_desc->SetOutput("Output", grads_name); - op_desc->SetOutput("FusedOutput", {fused_var_name}); - } - void RecordParamsAndGrads(ir::Node *node, ParamsAndGrads *params_grads) const { try { @@ -358,6 +357,7 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { } } + // Alloc continuous space for vars. std::vector grads_name; std::vector params_name; grads_name.reserve(params_grads.size()); @@ -370,7 +370,6 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name, program_desc.MutableBlock(0)); - // Run Only Once Programs for (size_t i = 0; i < local_scopes.size(); ++i) { for (auto &op_desc : program_desc.Block(0).AllOps()) { auto op = OpRegistry::CreateOp(*op_desc); @@ -378,6 +377,17 @@ class AllocContinuousSpaceForGradPass : public ir::Pass { } } } + + void AppendAllocSpaceForVarsOp(const std::vector ¶ms_name, + const std::vector &grads_name, + const std::string &fused_var_name, + BlockDesc *global_block) const { + auto op_desc = global_block->AppendOp(); + op_desc->SetType("alloc_continuous_space"); + op_desc->SetInput("Input", params_name); + op_desc->SetOutput("Output", grads_name); + op_desc->SetOutput("FusedOutput", {fused_var_name}); + } }; } // namespace details diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index fdff83b92819b39974f3b2ce0848710f1ee02a41..752c932a215bad53f47f19f143a8008b66617a51 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -27,20 +27,17 @@ void BroadcastOpHandle::RunImpl() { if (places_.size() == 1) return; // The input and output may have dummy vars. 
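Returning to the fused-gradient naming introduced just above: the pass builds one unique name from the kFusedVarNamePrefix-style prefix plus the first parameter's gradient name, and refuses duplicates. Below is a minimal, self-contained sketch of that idea (the gradient name and helper are hypothetical, not Paddle's real code):

#include <cassert>
#include <string>
#include <unordered_set>

// Sketch only: mirrors the "@FUSEDVAR@" + "@GRAD@" + first-grad-name scheme above.
std::string MakeFusedGradName(const std::string& prefix,
                              const std::string& first_grad,
                              std::unordered_set<std::string>* fused_var_set) {
  std::string name = prefix + "@GRAD@" + first_grad;
  // duplicate names would let two passes alias the same fused buffer
  assert(fused_var_set->count(name) == 0 && "duplicate fused var name");
  fused_var_set->insert(name);
  return name;
}

int main() {
  std::unordered_set<std::string> fused;
  // "fc_0.w_0@GRAD" is a made-up gradient name used only for illustration.
  assert(MakeFusedGradName("@FUSEDVAR@", "fc_0.w_0@GRAD", &fused) ==
         "@FUSEDVAR@@GRAD@fc_0.w_0@GRAD");
  return 0;
}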
- VarHandle *in_var_handle; - { - auto in_var_handles = DynamicCast(inputs_); - PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL, - "The number of input should be one."); - in_var_handle = in_var_handles[0]; - } - + auto in_var_handles = DynamicCast(inputs_); auto out_var_handles = DynamicCast(outputs_); + PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL, + "The number of input should be one."); PADDLE_ENFORCE_EQ( out_var_handles.size(), places_.size(), "The number of output should equal to the number of places."); + VarHandle *in_var_handle = in_var_handles[0]; + WaitInputVarGenerated(); std::vector var_scopes; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 078403f30f10072d3cddae89de4f9cea43ed956e..df69b11ec6ae3bb08ba03b749c69eb718525de4d 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -17,7 +17,6 @@ limitations under the License. */ #include #include #include - #include "paddle/fluid/framework/details/memory_optimize_helper.h" #include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" @@ -82,23 +81,43 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("inplace_pass"); } - if (strategy.fuse_elewise_add_act_ops_) { + if (strategy_.fuse_elewise_add_act_ops_) { VLOG(10) << "Add fuse_elewise_add_act_pass"; AppendPass("fuse_elewise_add_act_pass"); } // for single card training, fuse_all_reduce_ops is unnecessary. // alloc_continuous_space_for_grad_pass should be before of MultiDevPass. - if (strategy.fuse_all_reduce_ops_) { + if (strategy_.fuse_all_reduce_ops_) { VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; AppendPass("alloc_continuous_space_for_grad_pass"); } + if (strategy_.fuse_all_optimizer_ops_) { + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce || + strategy_.is_distribution_) { + VLOG(3) + << "Currently, fuse_all_optimizer_ops only works under AllReduce " + "mode."; + strategy_.fuse_all_optimizer_ops_ = false; + } else { + VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; + AppendPass("alloc_continuous_space_for_grad_pass"); + // NOTE: fuse_all_xx_ops will count the number of xx operator first, + // if the number is zero, fuse_all_reduce_ops will do nothing. + // Currently, only one type of optimization algorithm can be fused. + VLOG(10) << "Add fuse_adam_op_pass"; + AppendPass("fuse_adam_op_pass"); + VLOG(10) << "Add fuse_sgd_op_pass"; + AppendPass("fuse_sgd_op_pass"); + } + } + // Add a graph viz pass to record a graph. if (!strategy.debug_graphviz_path_.empty()) { auto viz_pass = AppendPass("graph_viz_pass"); const std::string graph_path = string::Sprintf( - "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph"); + "%s%s", strategy_.debug_graphviz_path_.c_str(), "_fused_graph"); viz_pass->Set("graph_viz_path", new std::string(graph_path)); } @@ -118,14 +137,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // the de-fact IR, any reuse on Graph is meaningless. // A side-effect of that, memory optimize cannot forsee the fetched vars // , so fetchlist should be set persistable before call the Run interface. 
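For reference, a framework-independent sketch of the pass-ordering decision made above. BuildPassList and use_reduce are stand-ins for the real ParallelExecutorPassBuilder and strategy fields, not Paddle APIs; the point is only the ordering and the AllReduce-only restriction:

#include <iostream>
#include <string>
#include <vector>

// use_reduce stands in for (reduce_ == kReduce || is_distribution_).
std::vector<std::string> BuildPassList(bool fuse_all_optimizer_ops, bool use_reduce) {
  std::vector<std::string> passes;
  if (fuse_all_optimizer_ops && !use_reduce) {
    // gradients must be laid out contiguously before the optimizer ops are fused
    passes.push_back("alloc_continuous_space_for_grad_pass");
    // each fuse_*_op_pass counts the ops of its type first and does nothing if
    // there are none, so only one optimizer type ends up fused per program
    passes.push_back("fuse_adam_op_pass");
    passes.push_back("fuse_sgd_op_pass");
  }
  passes.push_back("multi_devices_pass");  // conversion to a multi-device graph comes later
  return passes;
}

int main() {
  for (const auto& p : BuildPassList(true, false)) std::cout << p << "\n";
  return 0;
}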
- if (strategy.memory_optimize_) { + if (strategy_.memory_optimize_) { VLOG(10) << "Add memory_optimize_pass"; AppendPass("memory_optimize_pass"); } - AppendMultiDevPass(strategy); + AppendMultiDevPass(strategy_); - if (strategy.fuse_all_reduce_ops_) { + if (strategy_.fuse_all_reduce_ops_) { // NOTE: fuse_all_reduce_ops will count the number of all_reduce operator // first, if the number is zero, fuse_all_reduce_ops will do nothing. VLOG(10) << "Add fuse_all_reduce_op_pass"; @@ -151,7 +170,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("all_reduce_deps_pass"); } - if (SeqOnlyAllReduceOps(strategy)) { + if (SeqOnlyAllReduceOps(strategy_)) { VLOG(10) << "Add all_reduce_deps_pass"; AppendPass("all_reduce_deps_pass"); } @@ -165,7 +184,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Convert graph to run on multi-devices. void AppendMultiDevPass(const BuildStrategy &strategy) { ir::Pass *multi_devices_pass = nullptr; - if (strategy_.is_distribution_) { + if (strategy.is_distribution_) { VLOG(10) << "Add dist_multi_devices_pass"; multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); } else { @@ -235,17 +254,22 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->Erase(kNCCLCtxs); pass->SetNotOwned(kNCCLCtxs, nctx); #endif - } else if (pass->Type() == "fuse_all_reduce_op_pass") { + } else if (pass->Type() == "alloc_continuous_space_for_grad_pass" || + pass->Type() == "fuse_adam_op_pass" || + pass->Type() == "fuse_sgd_op_pass" || + pass->Type() == "fuse_all_reduce_op_pass") { pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); pass->Erase(kLocalScopes); pass->SetNotOwned>(kLocalScopes, &local_scopes); + if (pass->Type() == "fuse_all_reduce_op_pass") { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; - pass->Erase(kNCCLCtxs); - pass->SetNotOwned(kNCCLCtxs, nctx); + platform::NCCLContextMap *nctx = use_cuda ? 
nccl_ctxs : nullptr; + pass->Erase(kNCCLCtxs); + pass->SetNotOwned(kNCCLCtxs, nctx); #endif + } } else if (pass->Type() == "alloc_continuous_space_for_grad_pass") { pass->Erase(kPlaces); pass->SetNotOwned>(kPlaces, &places); @@ -294,4 +318,6 @@ USE_PASS(inplace_pass); USE_PASS(lock_free_optimize_pass); USE_PASS(alloc_continuous_space_for_grad_pass); USE_PASS(graph_to_program_pass); +USE_PASS(fuse_adam_op_pass); +USE_PASS(fuse_sgd_op_pass); USE_PASS(fuse_all_reduce_op_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 9587a6f0f9318616633a2ca8b69991cdd52b8942..85f328b7c40568cc9246fd4ecab34e8e6778439b 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -18,7 +18,6 @@ #include #include #include - #include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -76,6 +75,8 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; + bool fuse_all_optimizer_ops_{false}; + bool fuse_all_reduce_ops_{false}; bool fuse_relu_depthwise_conv_{false}; diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index d4fbea9d95118666ababde811867e95c657c07de..297ee92fc3c84c2feec9cb85bd8671ce8ad94ed0 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -31,9 +31,10 @@ FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor( local_scopes_(local_scopes), places_(places), graph_(graph), + fetch_ctxs_(places), pool_(strategy.num_threads_), - prepare_pool_(1), // add one more thread for generate op_deps - fetch_ctxs_(places) { + // add one more thread for generate op_deps + prepare_pool_(1) { for (auto &op : ir::FilterByNodeWrapper(*graph_)) { int dep = static_cast(op->NotReadyInputSize()); op_deps_.emplace(op, dep); diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index 970298950cc8089bc5861fcbf8dc2544934b181f..f6d5160e75cc3f48c5129dae05eec4ec82d83ae5 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -14,7 +14,9 @@ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" @@ -37,6 +39,8 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { const ir::Graph &Graph() const override; private: + // Note(zcd): the ThreadPool should be placed last so that ThreadPool should + // be destroyed first. 
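The member reordering that follows (and the matching change in ThreadedSSAGraphExecutor further down) relies on a core C++ guarantee rather than anything Paddle-specific: non-static data members are destroyed in reverse declaration order. A tiny sketch with hypothetical Worker/State types:

#include <iostream>

struct Worker {
  ~Worker() { std::cout << "worker pool destroyed\n"; }
};
struct State {
  ~State() { std::cout << "shared state destroyed\n"; }
};

// Declaring the pool last guarantees it is torn down first, so no worker
// thread can outlive the state it may still be touching.
struct Executor {
  State state_;   // declared first -> destroyed last
  Worker pool_;   // declared last  -> destroyed first
};

int main() {
  { Executor e; }  // prints "worker pool destroyed" then "shared state destroyed"
  return 0;
}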
ExecutionStrategy strategy_; std::vector local_scopes_; std::vector places_; @@ -45,21 +49,22 @@ class FastThreadedSSAGraphExecutor : public SSAGraphExecutor { std::unordered_map op_deps_; std::vector bootstrap_ops_; - ::ThreadPool pool_; - ::ThreadPool prepare_pool_; platform::DeviceContextPool fetch_ctxs_; std::atomic remaining_; + std::future< + std::unique_ptr>>> + atomic_op_deps_; + ExceptionHolder exception_; + + ::ThreadPool pool_; + ::ThreadPool prepare_pool_; + void RunOpAsync(std::unordered_map> *op_deps, OpHandleBase *op, const std::shared_ptr> &complete_q); void PrepareAtomicOpDeps(); - - std::future< - std::unique_ptr>>> - atomic_op_deps_; - ExceptionHolder exception_; }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.cc b/paddle/fluid/framework/details/fuse_adam_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..0ef75e319244e2ccc63dfa3f93f0cd764cf67633 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_adam_op_pass.cc @@ -0,0 +1,199 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fuse_adam_op_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace details { + +const std::string FuseAdamOpPass::GetOpType() const { return "adam"; } + +const std::vector FuseAdamOpPass::GetAuxiliaryVarNames() const { + return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; +} + +void FuseAdamOpPass::FuseOptimizerOps( + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const { + FuseAdamOps(aux_var_set, fused_vars_name, adam_ops, graph); + FuseScaleOps(aux_var_set.at("Beta1Pow"), fused_vars_name.at("Beta1Pow"), + adam_ops, graph); + FuseScaleOps(aux_var_set.at("Beta2Pow"), fused_vars_name.at("Beta2Pow"), + adam_ops, graph); +} + +void FuseAdamOpPass::FuseAdamOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(adam_ops.size(), static_cast(0)); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. 
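As a minimal illustration of the consistency requirement stated in the note above: every adam op folded into the fused op must carry identical hyper-parameters, which is what the PADDLE_ENFORCE_EQ checks enforce. The sketch below uses a toy AdamAttrs struct rather than Paddle's OpDesc/attribute machinery:

#include <cassert>
#include <vector>

// Toy stand-in for the per-op attributes read from OpDesc in the real pass.
struct AdamAttrs { float beta1, beta2, epsilon; bool lazy_mode; };

bool AttrsAreConsistent(const std::vector<AdamAttrs>& ops) {
  if (ops.empty()) return true;
  for (const auto& op : ops) {
    // any mismatch means the ops cannot share one fused update
    if (op.beta1 != ops[0].beta1 || op.beta2 != ops[0].beta2 ||
        op.epsilon != ops[0].epsilon || op.lazy_mode != ops[0].lazy_mode)
      return false;
  }
  return true;
}

int main() {
  std::vector<AdamAttrs> ops = {{0.9f, 0.999f, 1e-8f, false},
                                {0.9f, 0.999f, 1e-8f, false}};
  assert(AttrsAreConsistent(ops));
  return 0;
}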
+ int op_role = boost::get( + adam_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + float beta1 = boost::get(adam_ops[0]->Op()->GetAttr("beta1")); + float beta2 = boost::get(adam_ops[0]->Op()->GetAttr("beta2")); + float epsilon = boost::get(adam_ops[0]->Op()->GetAttr("epsilon")); + bool lazy_mode = boost::get(adam_ops[0]->Op()->GetAttr("lazy_mode")); + int64_t min_row_size_to_use_multithread = boost::get( + adam_ops[0]->Op()->GetAttr("min_row_size_to_use_multithread")); + for (auto &adam_op : adam_ops) { + PADDLE_ENFORCE_EQ(beta1, + boost::get(adam_op->Op()->GetAttr("beta1"))); + PADDLE_ENFORCE_EQ(beta2, + boost::get(adam_op->Op()->GetAttr("beta2"))); + PADDLE_ENFORCE_EQ(epsilon, + boost::get(adam_op->Op()->GetAttr("epsilon"))); + PADDLE_ENFORCE_EQ(lazy_mode, + boost::get(adam_op->Op()->GetAttr("lazy_mode"))); + PADDLE_ENFORCE_EQ(min_row_size_to_use_multithread, + boost::get(adam_op->Op()->GetAttr( + "min_row_size_to_use_multithread"))); + PADDLE_ENFORCE_EQ(op_role, boost::get(adam_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var + // node. + + VLOG(10) << "Insert adam to graph "; + OpDesc adam_desc(adam_ops[0]->Op()->Block()); + adam_desc.SetType("adam"); + adam_desc.SetInput("Param", {fused_vars_name.at("Param")}); + adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); + adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")}); + adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")}); + // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. + adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate")); + adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow")); + adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow")); + + adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); + adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")}); + adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")}); + adam_desc.SetAttr("beta1", beta1); + adam_desc.SetAttr("beta2", beta2); + adam_desc.SetAttr("epsilon", epsilon); + adam_desc.SetAttr("lazy_mode", lazy_mode); + adam_desc.SetAttr("min_row_size_to_use_multithread", + min_row_size_to_use_multithread); + adam_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto adam_node = graph->CreateOpNode(&adam_desc); + + InserInputAndOutputForOptOps(adam_ops, adam_node); +} + +void FuseAdamOpPass::FuseScaleOps(const std::vector &beta_name, + const std::string &fused_var_name, + const std::vector &adam_ops, + ir::Graph *graph) const { + PADDLE_ENFORCE_EQ(beta_name.size(), adam_ops.size()); + const std::string scale_op_name = "scale"; + + // Get the scale_ops of dealing the adam's beta var. 
+ std::vector scale_ops; + scale_ops.reserve(beta_name.size()); + for (size_t i = 0; i < adam_ops.size(); ++i) { + auto &beta_1_pow_name = beta_name[i]; + auto beta_pow_iter = std::find_if( + adam_ops[i]->inputs.begin(), adam_ops[i]->inputs.end(), + [&beta_name, &beta_1_pow_name](ir::Node *var_node) -> bool { + return var_node->Var() && var_node->Var()->Name() == beta_1_pow_name; + }); + PADDLE_ENFORCE(beta_pow_iter != adam_ops[i]->inputs.end()); + + auto beta_pow_node = *beta_pow_iter; + auto scale_op_iter = std::find_if( + beta_pow_node->outputs.begin(), beta_pow_node->outputs.end(), + [&scale_op_name](ir::Node *op_node) -> bool { + return op_node->Op() && op_node->Op()->Type() == scale_op_name; + }); + PADDLE_ENFORCE(scale_op_iter != beta_pow_node->outputs.end()); + + scale_ops.emplace_back(*scale_op_iter); + } + PADDLE_ENFORCE_EQ(scale_ops.size(), beta_name.size()); + + // Check attributions + // NOTE: If new attribution is added, the following code maybe need change. + int op_role = boost::get( + scale_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + float scale = boost::get(scale_ops[0]->Op()->GetAttr("scale")); + float bias = boost::get(scale_ops[0]->Op()->GetAttr("bias")); + bool bias_after_scale = + boost::get(scale_ops[0]->Op()->GetAttr("bias_after_scale")); + for (auto &scale_op : scale_ops) { + PADDLE_ENFORCE_EQ(scale, + boost::get(scale_op->Op()->GetAttr("scale"))); + PADDLE_ENFORCE_EQ(bias, boost::get(scale_op->Op()->GetAttr("bias"))); + PADDLE_ENFORCE_EQ( + bias_after_scale, + boost::get(scale_op->Op()->GetAttr("bias_after_scale"))); + PADDLE_ENFORCE_EQ(op_role, boost::get(scale_op->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName()))); + } + + // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var + // node. 
+ + VLOG(10) << "Insert fused scale to graph."; + OpDesc scale_desc(scale_ops[0]->Op()->Block()); + scale_desc.SetType("scale"); + scale_desc.SetInput("X", {fused_var_name}); + scale_desc.SetOutput("Out", {fused_var_name}); + scale_desc.SetAttr("scale", scale); + scale_desc.SetAttr("bias", bias); + scale_desc.SetAttr("bias_after_scale", bias_after_scale); + scale_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + auto scale_node = graph->CreateOpNode(&scale_desc); + + for (auto scale_op : scale_ops) { + // set inputs + scale_node->inputs.insert(scale_node->inputs.begin(), + scale_op->inputs.begin(), scale_op->inputs.end()); + for (auto &input : scale_op->inputs) { + std::replace(input->outputs.begin(), input->outputs.end(), scale_op, + scale_node); + } + // set outputs + scale_node->outputs.insert(scale_node->outputs.begin(), + scale_op->outputs.begin(), + scale_op->outputs.end()); + for (auto &output : scale_op->outputs) { + std::replace(output->inputs.begin(), output->inputs.end(), scale_op, + scale_node); + } + } + + // Delete scale_ops + for (auto &scale_op : scale_ops) { + graph->RemoveNode(scale_op); + } +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_adam_op_pass, paddle::framework::details::FuseAdamOpPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.h b/paddle/fluid/framework/details/fuse_adam_op_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..5866c37552e26d9b14fa946e119f20121ecf7cb2 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_adam_op_pass.h @@ -0,0 +1,55 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +class FuseAdamOpPass : public FuseOptimizerOpPass { + private: + virtual const std::string GetOpType() const; + + virtual const std::vector GetAuxiliaryVarNames() const; + + // Fuse Adam Ops and Scale Ops which are used to update "Beta1Pow", "Beta2Pow" + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const; + + void FuseAdamOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const; + + void FuseScaleOps(const std::vector &aux_var_set, + const std::string &fused_var_name, + const std::vector &adam_ops, + ir::Graph *graph) const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..b49f095d428a017dd1a3bed2788a048af9afa6bb --- /dev/null +++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc @@ -0,0 +1,240 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace details { + +void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { + ir::Graph &result = *graph; + + auto &places = Get>(kPlaces); + auto &local_scopes = Get>(kLocalScopes); + + const std::string fuse_op_type = GetOpType(); + const std::vector aux_var_names = GetAuxiliaryVarNames(); + + // Step 1: Get the specified op and auxiliary variables. + std::vector topo_nodes = ir::TopologySortOperations(result); + std::unordered_map> aux_var_set; + std::vector opt_ops; + for (auto &node : topo_nodes) { + GetSpecifiedOpsAndVars(fuse_op_type, aux_var_names, node, &opt_ops, + &aux_var_set); + } + + VLOG(10) << "Find " << fuse_op_type << " operators: " << opt_ops.size(); + if (opt_ops.size() == 0) { + return; + } + + if (result.Has(kFusedOptType)) { + VLOG(10) + << "Currently only support fusing one type optimizer op. Has fused " + << result.Get(kFusedOptType); + return; + } else { + result.Set(kFusedOptType, new FusedOptType); + } + result.Get(kFusedOptType) = fuse_op_type; + + // Step 2: Insert fused_var_name to FusedVars, and the FusedVars need be + // initialized in scopes before execution. 
+ if (!result.Has(kFusedVars)) { + result.Set(kFusedVars, new FusedVars); + } + std::unordered_map fused_vars_name; + fused_vars_name.reserve(aux_var_names.size() + 1); + auto &fused_var_set = result.Get(kFusedVars); + const std::string prefix(kFusedVarNamePrefix); + // NOTE: the fused_var_name should be unique. + for (auto &var_name : aux_var_names) { + auto fused_var_name = prefix + "_" + fuse_op_type + "_" + var_name + "_" + + aux_var_set[var_name][0]; + VLOG(10) << fused_var_name; + fused_vars_name.emplace(var_name, fused_var_name); + PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0); + fused_var_set.insert(fused_var_name); + } + + // Step 3: Get the fused Gradient's name + auto ¶ms_grads = result.Get(kParamsAndGrads); + if (!result.Has(kFusedGrads)) { + PADDLE_THROW( + "The alloc_continuous_space_for_grad_pass should be called before this " + "pass."); + } + auto &fused_grad = result.Get(kFusedGrads); + auto &fused_vars = result.Get(kFusedVars); + auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad); + PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad."); + fused_vars_name.emplace("Grad", fused_grad); + + // Step 4: Sort the parameters and auxiliary variables according + // to parameters' name to make variables' name correspond correctly. + PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads."); + PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(), + "The size of params_grads and aux_var_set are not equal."); + SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops); + + // Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g. + // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately. + InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names, + aux_var_set, fused_vars_name); + + // Step 6: Fuse optimizer Ops and Scale Ops + FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result); + + // Step 7: Remove optimizer Ops + for (auto &opt_op : opt_ops) { + graph->RemoveNode(opt_op); + } +} + +void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( + const std::vector &places, + const std::vector &local_scopes, + const std::vector &aux_var_names, + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name) const { + VLOG(10) << "Init FusedVars."; + // Alloc parameters and auxiliary vars in the respective scope. 
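Before the per-scope initialization that follows, it may help to see what alloc_continuous_space provides conceptually: a single backing allocation with a per-tensor offset. This is a toy sketch only; the real op additionally honours the copy_data/check_name attributes set above and the alignment rules discussed elsewhere in this diff:

#include <cstddef>
#include <iostream>
#include <vector>

// Each "tensor" is reduced to an element count; offsets index one shared buffer.
struct FusedSpace {
  std::vector<float> buffer;    // the single continuous allocation
  std::vector<size_t> offsets;  // where each original tensor starts
};

FusedSpace AllocContinuousSpace(const std::vector<size_t>& numels) {
  FusedSpace f;
  f.offsets.resize(numels.size());
  size_t total = 0;
  for (size_t i = 0; i < numels.size(); ++i) {
    f.offsets[i] = total;
    total += numels[i];
  }
  f.buffer.assign(total, 0.0f);  // one allocation backs every tensor
  return f;
}

int main() {
  FusedSpace f = AllocContinuousSpace({4, 8, 2});
  std::cout << "total elements: " << f.buffer.size() << "\n";  // 14
  for (size_t off : f.offsets) std::cout << "offset " << off << "\n";
  return 0;
}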
+ size_t idx = local_scopes.size(); + for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend(); + ++iter, --idx) { + auto &scope = *iter; + for (auto &var_name : aux_var_names) { + auto fused_var_name = fused_vars_name.at(var_name); + VLOG(10) << "Init " << fused_var_name; + PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr, + "%s has exist in scope[%d]", fused_var_name, idx); + scope->Var(fused_var_name)->GetMutable(); + } + } + + ProgramDesc program_desc; + auto *global_block = program_desc.MutableBlock(0); + for (auto &var_name : aux_var_names) { + AppendAllocContinuousSpace(aux_var_set.at(var_name), + fused_vars_name.at(var_name), true, + global_block); + } + + for (size_t i = 0; i < local_scopes.size(); ++i) { + for (auto &op_desc : global_block->AllOps()) { + auto op = OpRegistry::CreateOp(*op_desc); + op->Run(*local_scopes[i], places[i]); + } + } +} + +void FuseOptimizerOpPass::SortParametersAndAuxVars( + const std::vector> ¶ms_grads, + std::unordered_map> *aux_vars_set, + std::vector *ops) const { + PADDLE_ENFORCE_NE(aux_vars_set->count("Param"), static_cast(0)); + auto ¶m_vec = aux_vars_set->at("Param"); + + std::vector param_sort_idx; + param_sort_idx.reserve(param_vec.size()); + + for (auto &p_g : params_grads) { + auto iter = std::find(param_vec.begin(), param_vec.end(), p_g.first); + PADDLE_ENFORCE(iter != param_vec.end()); + auto idx = std::distance(param_vec.begin(), iter); + param_sort_idx.emplace_back(idx); + } + + for (auto &aux_vars : *aux_vars_set) { + std::vector sorted_vars; + sorted_vars.reserve(aux_vars.second.size()); + for (size_t i = 0; i < aux_vars.second.size(); ++i) { + sorted_vars.emplace_back(aux_vars.second.at(param_sort_idx[i])); + } + std::swap(aux_vars.second, sorted_vars); + + std::stringstream out; + for (auto &var_name : aux_vars.second) { + out << var_name << " "; + } + VLOG(10) << aux_vars.first << ": " << out.str(); + } + + std::vector sorted_ops; + sorted_ops.reserve(ops->size()); + for (size_t i = 0; i < ops->size(); ++i) { + sorted_ops.emplace_back(ops->at(param_sort_idx[i])); + } + std::swap(*ops, sorted_ops); +} + +void FuseOptimizerOpPass::GetSpecifiedOpsAndVars( + const std::string &op_type, const std::vector &aux_vars_name, + ir::Node *node, std::vector *ops, + std::unordered_map> *aux_args_name) + const { + if (node->Op()->Type() != op_type) return; + + for (auto &var_n : aux_vars_name) { + auto arg_names = node->Op()->Input(var_n); + PADDLE_ENFORCE_EQ(arg_names.size(), static_cast(1)); + (*aux_args_name)[var_n].emplace_back(arg_names[0]); + VLOG(10) << var_n << ", " << arg_names[0]; + } + ops->emplace_back(node); +} + +void FuseOptimizerOpPass::AppendAllocContinuousSpace( + const std::vector &args, const std::string &out_arg, + bool copy_data, BlockDesc *global_block) const { + auto op_desc = global_block->AppendOp(); + op_desc->SetType("alloc_continuous_space"); + op_desc->SetInput("Input", args); + op_desc->SetOutput("Output", args); + op_desc->SetOutput("FusedOutput", {out_arg}); + op_desc->SetAttr("copy_data", copy_data); + op_desc->SetAttr("check_name", true); +} + +void FuseOptimizerOpPass::InserInputAndOutputForOptOps( + const std::vector &opt_ops, ir::Node *opt_node) const { + std::unordered_set inputs; + std::unordered_set outputs; + for (auto opt_op : opt_ops) { + // set inputs + inputs.insert(opt_op->inputs.begin(), opt_op->inputs.end()); + for (auto &input : opt_op->inputs) { + replace(input->outputs.begin(), input->outputs.end(), opt_op, opt_node); + } + // set outputs + 
outputs.insert(opt_op->outputs.begin(), opt_op->outputs.end()); + for (auto &output : opt_op->outputs) { + replace(output->inputs.begin(), output->inputs.end(), opt_op, opt_node); + } + } + opt_node->inputs.insert(opt_node->inputs.begin(), inputs.begin(), + inputs.end()); + opt_node->outputs.insert(opt_node->outputs.begin(), outputs.begin(), + outputs.end()); +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.h b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..0240f1594d7ef9d855eb6e96e8e8a32ee1d957ba --- /dev/null +++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h @@ -0,0 +1,75 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +class FuseOptimizerOpPass : public ir::Pass { + protected: + void ApplyImpl(ir::Graph *graph) const override; + + protected: + virtual void SortParametersAndAuxVars( + const std::vector> ¶ms_grads, + std::unordered_map> *aux_var_set, + std::vector *ops) const; + + void InserInputAndOutputForOptOps(const std::vector &opt_ops, + ir::Node *opt_node) const; + + private: + virtual const std::string GetOpType() const = 0; + + virtual const std::vector GetAuxiliaryVarNames() const = 0; + + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &adam_ops, ir::Graph *graph) const = 0; + + void GetSpecifiedOpsAndVars( + const std::string &op_type, const std::vector &aux_vars_name, + ir::Node *node, std::vector *ops, + std::unordered_map> *aux_args_name) + const; + + void AppendAllocContinuousSpace(const std::vector &args, + const std::string &out_arg, bool copy_data, + BlockDesc *global_block) const; + + void InitFusedVarsAndAllocSpaceForVars( + const std::vector &places, + const std::vector &local_scopes, + const std::vector &aux_var_names, + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name) + const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..f91c21e3cc869de1a6d67146eb99f27a2ca5497c --- /dev/null +++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fuse_sgd_op_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +namespace details { + +const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; } + +const std::vector FuseSgdOpPass::GetAuxiliaryVarNames() const { + return {"Param"}; +} + +void FuseSgdOpPass::FuseOptimizerOps( + const std::unordered_map> + &aux_var_set, + const std::unordered_map &fused_vars_name, + const std::vector &sgd_ops, ir::Graph *graph) const { + FuseSgdOps(aux_var_set, fused_vars_name, sgd_ops, graph); +} + +void FuseSgdOpPass::FuseSgdOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &sgd_ops, ir::Graph *graph) const { + PADDLE_ENFORCE_GT(sgd_ops.size(), static_cast(0)); + + // NOTE: fused_var is only exist in scope, so the graph doesn't have fused_var + // node. + + int op_role = boost::get( + sgd_ops[0]->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())); + VLOG(10) << "Insert sgd to graph "; + // Add fused scale + OpDesc Sgd_desc(sgd_ops[0]->Op()->Block()); + Sgd_desc.SetType("sgd"); + Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")}); + Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); + Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); + + // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. + Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate")); + + // NOTE: multi_devices_pass requires that every op should have a role. + Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); + + auto sgd_node = graph->CreateOpNode(&Sgd_desc); + + InserInputAndOutputForOptOps(sgd_ops, sgd_node); +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_sgd_op_pass, paddle::framework::details::FuseSgdOpPass) + .RequirePassAttr(paddle::framework::details::kPlaces) + .RequirePassAttr(paddle::framework::details::kLocalScopes); diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.h b/paddle/fluid/framework/details/fuse_sgd_op_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..b3aa6a203b726a5a1540ce533c0305d7f579d4a9 --- /dev/null +++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.h @@ -0,0 +1,50 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/fuse_optimizer_op_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +class FuseSgdOpPass : public FuseOptimizerOpPass { + private: + virtual const std::string GetOpType() const; + + virtual const std::vector GetAuxiliaryVarNames() const; + + // Fuse Sgd Ops + virtual void FuseOptimizerOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &sgd_ops, ir::Graph *graph) const; + + void FuseSgdOps( + const std::unordered_map> &vars_set, + const std::unordered_map &fused_vars_name, + const std::vector &sgd_ops, ir::Graph *graph) const; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 644cd4e15083519d6c685ae3e6a0737692018a07..a57d670f118f2eb0bdcbeb7ed080729e4f9e4f2b 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -24,6 +24,19 @@ namespace paddle { namespace framework { namespace details { +// Note(zcd): Addresses should be aligned, otherwise, the results may have +// diff. +static size_t Alignment(size_t size, const platform::Place &place) { + // Allow to allocate the minimum chunk size is 4 KB. + size_t alignment = 1 << 12; + if (platform::is_gpu_place(place)) { + // Allow to allocate the minimum chunk size is 256 B. + alignment = 1 << 8; + } + size_t remaining = size % alignment; + return remaining == 0 ? 
size : size + (alignment - remaining); +} + typedef std::vector>> GradientAndLoDTensor; @@ -111,10 +124,11 @@ void FusedAllReduceOpHandle::RunImpl() { return grad1.second->data() < grad2.second->data(); }); + size_t size_of_dtype = framework::SizeOfType(dtype); for (size_t k = 1; k < g_tensor.size(); ++k) { const void *cur_address = g_tensor.at(k - 1).second->data(); int64_t len = g_tensor.at(k - 1).second->numel(); - auto offset = len * framework::SizeOfType(dtype); + auto offset = Alignment(len * size_of_dtype, places_[0]); void *infer_next_address = reinterpret_cast( reinterpret_cast(cur_address) + offset); const void *next_address = g_tensor.at(k).second->data(); @@ -228,18 +242,21 @@ void FusedAllReduceOpHandle::GetDTypeAndNumel( const std::vector> &grad_tensor, proto::VarType::Type *dtype, int64_t *numel) const { *numel = 0; + size_t size_of_dtype = 0; for (size_t i = 0; i < grad_tensor.size(); ++i) { - // Get element number - int64_t len = grad_tensor.at(i).second->numel(); - PADDLE_ENFORCE_GT(len, 0); - *numel += len; - // Get dtype auto ele_type = grad_tensor.at(i).second->type(); if (i == 0) { *dtype = ele_type; + size_of_dtype = framework::SizeOfType(ele_type); } PADDLE_ENFORCE_EQ(ele_type, *dtype); + + // Get element number + int64_t len = grad_tensor.at(i).second->numel(); + PADDLE_ENFORCE_GT(len, 0); + // Alignment(len) + *numel += Alignment(len * size_of_dtype, places_[0]) / size_of_dtype; } } diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc index afbda33b0662e7831b7ea0d44dc7ae4ff3694b1c..79150f719e379ca4e2b87d2e7db1b2daeee9aa67 100644 --- a/paddle/fluid/framework/details/inplace_op_pass.cc +++ b/paddle/fluid/framework/details/inplace_op_pass.cc @@ -156,7 +156,6 @@ void InplacePass::ApplyImpl(ir::Graph* graph) const { continue; TryInplaceOpInputOutput(op, graph); } - // graph->ResolveHazard(var_nodes_); } void InplacePass::InplaceModifyDesc(const std::string& var, @@ -168,7 +167,7 @@ void InplacePass::InplaceModifyDesc(const std::string& var, auto* op_desc = op->Op(); op_desc->RenameInput(var, cache_var); op_desc->RenameOutput(var, cache_var); - if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); + op_desc->Flush(); } } @@ -265,8 +264,6 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes, void InplacePass::TryInplaceOpInputOutput(ir::Node* op, ir::Graph* graph) const { VLOG(4) << "Try to inplace op " << op->Name(); - // PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr, - // "op_desc is nullptr"); // some pre-requirments need to meet if the op want to inplaced. 
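The alignment arithmetic introduced in the fused all-reduce handle above rounds every chunk up to the minimum allocation size before computing offsets and the fused element count. A standalone sketch of the same computation with worked values (AlignUp is a hypothetical name; the Place dispatch is omitted):

#include <cassert>
#include <cstddef>

// Same arithmetic as the Alignment() helper above.
size_t AlignUp(size_t size, size_t alignment) {
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

int main() {
  // GPU case above uses a 256-byte (1 << 8) minimum chunk:
  assert(AlignUp(1000, 1 << 8) == 1024);   // 1000 B rounds up to 4 * 256 B
  // CPU case uses the 4 KB (1 << 12) minimum chunk:
  assert(AlignUp(1000, 1 << 12) == 4096);
  assert(AlignUp(8192, 1 << 12) == 8192);  // already aligned, unchanged
  return 0;
}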
PADDLE_ENFORCE(op->Op() != nullptr, "op_desc is nullptr"); @@ -446,19 +443,20 @@ bool GraphView::CheckDeps(ir::Node* var, ir::Node* current_op) const { // check if op2 depends on op1's output bool GraphView::CheckOpDeps(ir::Node* op1, ir::Node* op2) const { - auto print_op = [&](ir::Node* op, const char* name) { - std::ostringstream os; - os << " " << name << " : " << op->Name() << " "; - os << "Input args : "; - for (auto& arg : op->inputs) os << arg->Name() << " "; - os << "Output args : "; - for (auto& arg : op->outputs) os << arg->Name() << " "; - os << "Level : " << op_level_.at(op); - VLOG(4) << os.str(); - }; - print_op(op1, "OP1"); - print_op(op2, "OP2"); - + if (VLOG_IS_ON(4)) { + auto print_op = [&](ir::Node* op, const char* name) { + std::ostringstream os; + os << " " << name << " : " << op->Name() << " "; + os << "Input args : "; + for (auto& arg : op->inputs) os << arg->Name() << " "; + os << "Output args : "; + for (auto& arg : op->outputs) os << arg->Name() << " "; + os << "Level : " << op_level_.at(op); + VLOG(4) << os.str(); + }; + print_op(op1, "OP1"); + print_op(op2, "OP2"); + } if (op1 == op2) return true; if (op_level_.at(op1) >= op_level_.at(op2)) return false; diff --git a/paddle/fluid/framework/details/memory_optimize_helper_test.cc b/paddle/fluid/framework/details/memory_optimize_helper_test.cc index 453943af0f123a08b870f11dacb78a5fbd954a56..3fb02f69b1bb65a74a2e5f69e9de7994b4d012db 100644 --- a/paddle/fluid/framework/details/memory_optimize_helper_test.cc +++ b/paddle/fluid/framework/details/memory_optimize_helper_test.cc @@ -142,16 +142,15 @@ TEST(OrderedSet, FindBestFitNode) { for (auto& node : nodes) { pool.Insert(node.get()); } - // FIXME(liuwei1031) this API has changed, - // disable these tests temporarily - // FindNextBestFitNode - // auto* n = nodes[0].get(); - // auto* cache = pool.FindBestFitNode(n); - // PADDLE_ENFORCE(cache->Name() == "a"); - // cache = pool.FindNextBestFitNode(n, cache); - // PADDLE_ENFORCE(cache->Name() == "c"); - // cache = pool.FindNextBestFitNode(n, cache); - // PADDLE_ENFORCE(cache->Name() == "b"); + + auto* n = nodes[0].get(); + auto* cache = pool.FindBestFitNode(n); + ASSERT_TRUE(cache->Name() == "a" || cache->Name() == "c"); + auto* cache_b = pool.FindNextBestFitNode(n, cache); + ASSERT_TRUE(cache_b->Name() != cache->Name()); + ASSERT_TRUE(cache_b->Name() == "a" || cache_b->Name() == "c"); + cache = pool.FindNextBestFitNode(n, cache_b); + ASSERT_TRUE(cache == nullptr); } } // namespace details diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 884089df38de14ff65c9f766ea4c89dbc6721ce9..611693fc7c241f0afed39ab86390df69b9cf4797 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -20,7 +20,6 @@ #include #include #include - #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph.h" @@ -34,6 +33,10 @@ namespace framework { class Scope; namespace details { +constexpr char kLossVarName[] = "loss_var_name"; +constexpr char kStrategy[] = "strategy"; +constexpr char kNRanks[] = "nranks"; + class MultiDevSSAGraphBuilderBase : public ir::Pass { protected: void ApplyImpl(ir::Graph *graph) const override; diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index 
ab5e099023382c4e28a9613d321ea8dc182d3534..6e6ef074db3450ebbb5567743b908e0aee382c27 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -20,7 +20,6 @@ #include #include #include - #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/var_handle.h" @@ -41,22 +40,25 @@ namespace details { // `std::vector` is the version of varaibles. typedef std::vector>> GraphVars; -const char kGraphVars[] = "vars"; - -// aux variables to represent dependency. Useful to resolve data hazard. -typedef std::unordered_set GraphDepVars; -const char kGraphDepVars[] = "dep_vars"; +constexpr char kGraphVars[] = "vars"; -constexpr char kNCCLCtxs[] = "nccl_ctxs"; - -constexpr char kLossVarName[] = "loss_var_name"; constexpr char kPlaces[] = "places"; constexpr char kLocalScopes[] = "local_scopes"; -constexpr char kStrategy[] = "strategy"; -constexpr char kNRanks[] = "nranks"; +constexpr char kNCCLCtxs[] = "nccl_ctxs"; + +// aux variables to represent dependency. Useful to resolve data hazard. +typedef std::unordered_set GraphDepVars; +constexpr char kGraphDepVars[] = "dep_vars"; typedef std::unordered_set FusedVars; constexpr char kFusedVars[] = "fused_vars"; +constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@"; + +typedef std::string FusedOptType; +constexpr char kFusedOptType[] = "fused_opt_type"; + +typedef std::string FusedGrads; +constexpr char kFusedGrads[] = "fused_gradients"; typedef std::vector> ParamsAndGrads; constexpr char kParamsAndGrads[] = "params_grads"; @@ -65,8 +67,6 @@ typedef std::vector>> GroupGradsAndParams; constexpr char kGroupGradsAndParams[] = "group_grads_params"; -constexpr char kFusedVarNamePrefix[] = "@FUSEDVAR@"; - } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c4254bbadfa17682f437f46f02adc9c884d24304..c00932a7bdb170e63b5fd4d43ccb2072f1a0a9c9 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -24,13 +24,13 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( const ExecutionStrategy &strategy, const std::vector &local_scopes, const std::vector &places, ir::Graph *graph) : graph_(graph), - pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) - : nullptr), - prepare_pool_(1), local_scopes_(local_scopes), places_(places), fetch_ctxs_(places), - strategy_(strategy) { + strategy_(strategy), + prepare_pool_(1), + pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_) + : nullptr) { PrepareOpDeps(); CopyOpDeps(); } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index b9bccba8fa2fa13d99a9a39a5135106101daa903..1fa5196970512ccc4a3dee698f477711be1e7101 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -63,13 +63,20 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { details::OpHandleBase *op); private: + // Note(zcd): the ThreadPool should be placed last so that ThreadPool should + // be destroyed first. 
ir::Graph *graph_; - std::unique_ptr<::ThreadPool> pool_; - ::ThreadPool prepare_pool_; std::vector local_scopes_; std::vector places_; platform::DeviceContextPool fetch_ctxs_; ExceptionHolder exception_holder_; + std::unique_ptr op_deps_; + std::future> op_deps_futures_; + ExecutionStrategy strategy_; + // use std::list because clear(), push_back, and for_each are O(1) + std::list> run_op_futures_; + ::ThreadPool prepare_pool_; + std::unique_ptr<::ThreadPool> pool_; void InsertPendingOp(std::unordered_map *pending_ops, OpHandleBase *op_instance) const; @@ -88,14 +95,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { void PrepareOpDeps(); void CopyOpDeps(); - - private: - std::future> op_deps_futures_; - - ExecutionStrategy strategy_; - std::unique_ptr op_deps_; - // use std::list because clear(), push_back, and for_each are O(1) - std::list> run_op_futures_; }; } // namespace details diff --git a/paddle/fluid/framework/inplace_op_inference_test.cc b/paddle/fluid/framework/inplace_op_inference_test.cc index c93e562955fb36ddc4363fac862f3942758af35d..a9b3b889229ee46bf66063c8381bdd02c7229cbd 100644 --- a/paddle/fluid/framework/inplace_op_inference_test.cc +++ b/paddle/fluid/framework/inplace_op_inference_test.cc @@ -12,9 +12,14 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include #include +#include #include +#include #include "gtest/gtest.h" +#include "paddle/fluid/framework/details/inplace_op_pass.h" +#include "paddle/fluid/framework/ir/pass_builder.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -165,118 +170,147 @@ REGISTER_OPERATOR(multi_out_grad, f::NOP, f::MultiOutGradInplaceInToOut, namespace paddle { namespace framework { -// TEST(InferInplace, SingleOpInplaceInToOut) { -// ProgramDesc prog; -// auto* op = prog.MutableBlock(0)->AppendOp(); -// op->SetType("single_op"); -// op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); -// op->SetOutput("Out", {"test2_out"}); -// -// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); -// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("test2_out"); -// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 128, 128}); -// -// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; -// auto in_to_outs = infer_inplace(*op); -// EXPECT_EQ(in_to_outs.size(), 1ul); -// auto it = in_to_outs.begin(); -// EXPECT_EQ(it->first, "test2_a"); -// EXPECT_EQ(it->second, "test2_out"); -// } -// -// TEST(InferInplace, SingleGradOpInplaceInToOut) { -// ProgramDesc prog; -// auto* op = prog.MutableBlock(0)->AppendOp(); -// op->SetType("single_op_grad"); -// op->SetInput(GradVarName("Out"), {"test2_out"}); -// op->SetOutput(GradVarName("X"), {"test2_a", "test2_b", "test2_c"}); -// -// prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("test2_a")->SetShape({32, 16, 1024, 1024}); -// prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("test2_out"); -// prog.MutableBlock(0)->Var("test2_out")->SetShape({32, 16, 1024, 1024}); -// -// auto& infer_inplace = 
OpInfoMap::Instance().Get(op->Type()).infer_inplace_; -// auto in_to_outs = infer_inplace(*op); -// EXPECT_EQ(in_to_outs.size(), 1ul); -// auto it = in_to_outs.begin(); -// EXPECT_EQ(it->first, "test2_out"); -// EXPECT_EQ(it->second, "test2_a"); -// } -// -// TEST(InferInplace, MultiOutInplaceInToOut) { -// ProgramDesc prog; -// auto* op = prog.MutableBlock(0)->AppendOp(); -// op->SetType("multi_out_op"); -// op->SetInput("X", {"a0", "a1"}); -// op->SetInput("Y", {"b0"}); -// op->SetInput("Z", {"c0", "c1"}); -// op->SetOutput("Out", {"o0"}); -// op->SetOutput("YOut", {"y0"}); -// op->SetOutput("ZOut", {"z0"}); -// -// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("o0"); -// prog.MutableBlock(0)->Var("y0"); -// prog.MutableBlock(0)->Var("z0"); -// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); -// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); -// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); -// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); -// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); -// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); -// -// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; -// auto in_to_outs = infer_inplace(*op); -// EXPECT_EQ(in_to_outs.size(), 3ul); -// std::unordered_map expects = { -// {"a0", "o0"}, {"b0", "y0"}, {"c0", "z0"}, -// }; -// EXPECT_TRUE(expects == in_to_outs); -// } -// -// TEST(InferInplace, MultiGradInplaceInToOut) { -// ProgramDesc prog; -// auto* op = prog.MutableBlock(0)->AppendOp(); -// op->SetType("multi_out_grad"); -// op->SetInput(GradVarName("Out"), {"o0"}); -// op->SetInput(GradVarName("YOut"), {"y0"}); -// op->SetInput(GradVarName("ZOut"), {"z0"}); -// op->SetOutput(GradVarName("X"), {"a0", "a1"}); -// op->SetOutput(GradVarName("Y"), {"b0"}); -// op->SetOutput(GradVarName("Z"), {"c0", "c1"}); -// -// prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); -// prog.MutableBlock(0)->Var("o0"); -// prog.MutableBlock(0)->Var("y0"); -// prog.MutableBlock(0)->Var("z0"); -// prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); -// prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); -// prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); -// prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); -// prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); -// prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); -// -// auto& infer_inplace = OpInfoMap::Instance().Get(op->Type()).infer_inplace_; -// auto in_to_outs = infer_inplace(*op); -// -// EXPECT_EQ(in_to_outs.size(), 3ul); -// std::unordered_map expects = { -// {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, -// }; -// EXPECT_TRUE(expects == in_to_outs); -// } +void FakeSuccData(ProgramDesc* prog) { // NOLINT + prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); + prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); + 
prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); + prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); + prog->MutableBlock(0)->Var("test2_out"); + prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 32, 128, 128}); +} + +void FakeNoInplaceData(ProgramDesc* prog) { // NOLINT + prog->MutableBlock(0)->Var("test2_a")->SetType(proto::VarType::LOD_TENSOR); + prog->MutableBlock(0)->Var("test2_a")->SetShape({32, 64, 128, 128}); + prog->MutableBlock(0)->Var("test2_b")->SetType(proto::VarType::LOD_TENSOR); + prog->MutableBlock(0)->Var("test2_c")->SetType(proto::VarType::LOD_TENSOR); + prog->MutableBlock(0)->Var("test2_out"); + prog->MutableBlock(0)->Var("test2_out")->SetShape({64, 31, 128, 128}); +} + +ir::Node* GetNodeFromGraph(ir::Graph* g, std::string name) { + ir::Node* op_node = nullptr; + for (auto& item : g->Nodes()) { + if (item->Name() == name) { + op_node = item; + break; + } + } + return op_node; +} + +std::unique_ptr test_SingleOpInplaceInToOut( + std::unique_ptr g) { + std::unique_ptr pass(new details::InplacePass()); + ir::Node* op_node = GetNodeFromGraph(g.get(), "single_op"); + EXPECT_NE(op_node, nullptr); + pass->Apply(g.get()); + return g; +} + +TEST(InferInplace, SingleOpInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("single_op"); + op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); + op->SetOutput("Out", {"test2_out"}); + + FakeSuccData(&prog); + std::unique_ptr g(new ir::Graph(prog)); + g = test_SingleOpInplaceInToOut(std::move(g)); + auto op_node = GetNodeFromGraph(g.get(), "single_op"); + + EXPECT_EQ(op_node->outputs[0]->Name(), "test2_a"); +} + +TEST(InferInplace, SingleOpInplaceInToOutNoInplace) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("single_op"); + op->SetInput("X", {"test2_a", "test2_b", "test2_c"}); + op->SetOutput("Out", {"test2_out"}); + + FakeNoInplaceData(&prog); + std::unique_ptr g(new ir::Graph(prog)); + g = test_SingleOpInplaceInToOut(std::move(g)); + auto op_node = GetNodeFromGraph(g.get(), "single_op"); + + EXPECT_EQ(op_node->outputs[0]->Name(), "test2_out"); +} + +TEST(InferInplace, MultiOutInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("multi_out_op"); + op->SetInput("X", {"a0", "a1"}); + op->SetInput("Y", {"b0"}); + op->SetInput("Z", {"c0", "c1"}); + op->SetOutput("Out", {"o0"}); + op->SetOutput("YOut", {"y0"}); + op->SetOutput("ZOut", {"z0"}); + + prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("o0"); + prog.MutableBlock(0)->Var("y0"); + prog.MutableBlock(0)->Var("z0"); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 16, 1024, 1024}); + + std::unique_ptr g(new ir::Graph(prog)); + std::unique_ptr pass(new details::InplacePass()); + pass->Apply(g.get()); + auto op_node = GetNodeFromGraph(g.get(), "multi_out_op"); + ASSERT_TRUE(op_node != nullptr); + 
EXPECT_EQ(op_node->outputs[0]->Name(), "a0"); + EXPECT_EQ(op_node->outputs[1]->Name(), "b0"); + EXPECT_EQ(op_node->outputs[2]->Name(), "c0"); +} + +TEST(InferInplace, MultiGradInplaceInToOut) { + ProgramDesc prog; + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("multi_out_grad"); + op->SetInput(GradVarName("Out"), {"o0"}); + op->SetInput(GradVarName("YOut"), {"y0"}); + op->SetInput(GradVarName("ZOut"), {"z0"}); + op->SetOutput(GradVarName("X"), {"a0", "a1"}); + op->SetOutput(GradVarName("Y"), {"b0"}); + op->SetOutput(GradVarName("Z"), {"c0", "c1"}); + + prog.MutableBlock(0)->Var("a0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c0")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c1")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("o0"); + prog.MutableBlock(0)->Var("y0"); + prog.MutableBlock(0)->Var("z0"); + prog.MutableBlock(0)->Var("a0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("b0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("c0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("o0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("y0")->SetShape({32, 16, 1024, 1024}); + prog.MutableBlock(0)->Var("z0")->SetShape({32, 15, 1024, 1024}); + + std::unique_ptr g(new ir::Graph(prog)); + std::unique_ptr pass(new details::InplacePass()); + pass->Apply(g.get()); + auto op_node = GetNodeFromGraph(g.get(), "multi_out_grad"); + ASSERT_TRUE(op_node != nullptr); + EXPECT_EQ(op_node->outputs[0]->Name(), "o0"); + EXPECT_EQ(op_node->outputs[2]->Name(), "y0"); + EXPECT_EQ(op_node->outputs[3]->Name(), "c0"); + + std::unordered_map expects = { + {"o0", "a0"}, {"y0", "b0"}, {"z0", "c0"}, + }; +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b0ac73f9f52076a9303417bc1b19208ba6e6f2ec..e6628da9f360ea45e31d6b905065109f9664a17f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -56,8 +56,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) { } } -static DDim GetDims(const Scope& scope, const std::string& name, - bool get_actual_dim = false) { +static DDim GetDimsDebug(const Scope& scope, const std::string& name, + bool get_actual_dim = false) { Variable* var = scope.FindVar(name); if (var == nullptr) { return DDim({-1}); @@ -65,9 +65,9 @@ static DDim GetDims(const Scope& scope, const std::string& name, if (var->IsType()) { const LoDTensor& tensor = var->Get(); - // if (UNLIKELY(!tensor.IsInitialized())) { - // return DDim({-1}); - // } + if (UNLIKELY(!tensor.IsInitialized())) { + return DDim({-1}); + } return tensor.dims(); } else if (var->IsType()) { if (get_actual_dim) { @@ -123,7 +123,7 @@ static int GetRowSize(const Scope& scope, const std::string& name) { return -1; } -static LoD GetLoD(const Scope& scope, const std::string& name) { +static LoD GetLoDDebug(const Scope& scope, const std::string& name) { Variable* var = scope.FindVar(name); auto default_lod = LoD({{}}); @@ -133,9 +133,9 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { if (var->IsType()) { const LoDTensor& tensor = var->Get(); - // if (UNLIKELY(!tensor.IsInitialized())) { - // return default_lod; - // } + if (UNLIKELY(!tensor.IsInitialized())) { + return default_lod; + } return tensor.lod(); } else { return default_lod; @@ -274,8 +274,8 @@ std::string 
OperatorBase::DebugStringEx(const Scope* scope) const { } std::string dtype = GetDtype(*scope, var_name); ss << ":" << dtype; - ss << "[" << GetDims(*scope, var_name, true) << "]"; - ss << "(" << GetLoD(*scope, var_name) << ")"; + ss << "[" << GetDimsDebug(*scope, var_name, true) << "]"; + ss << "(" << GetLoDDebug(*scope, var_name) << ")"; } } if (i != input.second.size() - 1) { @@ -305,8 +305,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const { } std::string dtype = GetDtype(*scope, output.second[i]); ss << ":" << dtype; - ss << "[" << GetDims(*scope, var_name, true) << "]"; - ss << "(" << GetLoD(*scope, var_name) << ")"; + ss << "[" << GetDimsDebug(*scope, var_name, true) << "]"; + ss << "(" << GetLoDDebug(*scope, var_name) << ")"; } } if (i != output.second.size() - 1) { diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 6d8ba430bd0b9c9e48b4a80a07feb24b2da7d7b8..a02e53dcf764368601646a900833ac650c5bb31a 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -365,6 +365,9 @@ class ExecutionContext { auto shared_allocation = std::shared_ptr( allocation_ptr, deleter); + PADDLE_ENFORCE( + dynamic_cast(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); PADDLE_ENFORCE_GE(allocation_ptr->size(), framework::product(dim) * sizeof(T)); diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index ef096c2b810187c50fbcde7d93d9e5a2ecd8b0f3..ea7f8c496a9fc3ff78fce06b69fb21e44e5be9ee 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -70,7 +70,7 @@ Tensor& Tensor::ShareDataWith(const Tensor& src) { return *this; } -Tensor Tensor::Slice(int begin_idx, int end_idx) const { +Tensor Tensor::Slice(int64_t begin_idx, int64_t end_idx) const { check_memory_size(); PADDLE_ENFORCE_GE(begin_idx, 0, "The start row index must be greater than 0."); diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 88f5b757a8111f6a7e269ff71054dab425c0de01..0fa76f943ec1417dc712771565f7ff2b263e6365 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/ddim.h" @@ -27,10 +28,6 @@ limitations under the License. 
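The PADDLE_ENFORCE added to ExecutionContext above is a runtime type check: dynamic_cast yields nullptr when the allocation is not of the expected derived type. A generic sketch of that pattern (class names hypothetical, not the real allocator types):

#include <stdexcept>

struct AllocationBase { virtual ~AllocationBase() = default; };
struct TemporaryAllocationSketch : AllocationBase {};

void RequireTemporary(AllocationBase* a) {
  // nullptr here means *a is not (derived from) TemporaryAllocationSketch.
  if (dynamic_cast<TemporaryAllocationSketch*>(a) == nullptr) {
    throw std::runtime_error("The AllocationPtr must be TemporaryAllocation.");
  }
}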
*/ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_utils.h" -#endif - namespace paddle { namespace framework { @@ -41,34 +38,10 @@ class Tensor { #ifdef PADDLE_WITH_MKLDNN public: - // TODO(jczaja): This is depracted and will be removed - inline mkldnn::memory::format format() const { - if (layout_ == DataLayout::kMKLDNN) { - return static_cast(mem_pd_.desc().data.format); - } else { - return mkldnn::memory::format::format_undef; - } - } + inline mkldnn::memory::format format() const { return format_; } - // TODO(jczaja): This is depracted and will be removed - inline void set_format( - const mkldnn::memory::format fmt, - mkldnn::memory::data_type data_type = mkldnn::memory::f32) { - mem_pd_ = paddle::platform::create_prim_desc_from_format( - paddle::framework::vectorize2int(dims()), fmt, data_type); - layout_ = DataLayout::kMKLDNN; - } - - inline mkldnn::memory::primitive_desc get_mkldnn_prim_desc() const { - return mem_pd_; - } - - inline void set_mkldnn_prim_desc( - const mkldnn::memory::primitive_desc& mem_pd) { - // Internally MKL-DNN is just copying (increasing reference counter) - // to shared_ptr. So asignment should be quite cheap - mem_pd_ = mem_pd; - layout_ = DataLayout::kMKLDNN; + inline void set_format(const mkldnn::memory::format format) { + format_ = format; } protected: @@ -76,9 +49,12 @@ class Tensor { * @brief the detail format of memory block which have layout as kMKLDNN * * @note MKLDNN lib support various memory format like nchw, nhwc, nChw8C, - * nChw16c, etc. For a MKLDNN memory block, we store memory descriptor + * nChw16c, etc. For a MKLDNN memory block, layout will be set as + * DataLayout::kMKLDNN meanwhile detail memory format will be kept in + * this field. */ - mutable mkldnn::memory::primitive_desc mem_pd_; + + mkldnn::memory::format format_ = mkldnn::memory::format::format_undef; #endif public: @@ -157,7 +133,7 @@ class Tensor { * @param[in] end_idx The index of the end row(exclusive) to slice. * The index number begins from 0. 
*/ - Tensor Slice(int begin_idx, int end_idx) const; + Tensor Slice(int64_t begin_idx, int64_t end_idx) const; platform::Place place() const { PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 5f21dae60586e926472fc512eca7bcbb55dc8eda..a7f09df4917532e7261cee471c711897c8eb3447 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -44,11 +44,6 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } -#ifdef PADDLE_WITH_MKLDNN - if (src.layout() == DataLayout::kMKLDNN) { - dst->set_mkldnn_prim_desc(src.get_mkldnn_prim_desc()); - } -#endif memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 0f6014ae8aa28f090cb51401ee2cb0772bca7a45..ac77c3d2a500816a4eb41ed13f23ee628290f287 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -4,7 +4,6 @@ cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler) -cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator) if (WITH_GPU) @@ -38,20 +37,30 @@ else () set(AllocatorFacadeDeps) endif() -list(APPEND AllocatorFacadeDeps cpu_allocator locked_allocator best_fit_allocator aligned_allocator auto_increment_allocator conditional_allocator retry_allocator buffered_allocator legacy_allocator zero_size_allocator) - cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_library(auto_increment_allocator SRCS auto_increment_allocator.cc DEPS allocator) +cc_library(zero_size_allocator SRCS zero_size_allocator.cc DEPS allocator) cc_library(conditional_allocator SRCS conditional_allocator.cc DEPS allocator) -cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) +cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS + ${AllocatorFacadeDeps} + cpu_allocator + locked_allocator + best_fit_allocator + aligned_allocator + auto_increment_allocator + zero_size_allocator + conditional_allocator + retry_allocator + buffered_allocator + allocator_strategy + legacy_allocator + ) nv_test(allocation_and_eigen_test SRCS allocation_and_eigen_test.cu DEPS allocator_facade) cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator best_fit_allocator locked_allocator cpu_allocator) -cc_test(naive_best_fit_allocator_facade_test SRCS naive_best_fit_allocator_facade_test.cc DEPS allocator_facade) - cc_test(allocator_facade_abs_flags_test SRCS allocator_facade_abs_flags_test.cc DEPS allocator_facade) cc_test(allocator_facade_frac_flags_test SRCS allocator_facade_frac_flags_test.cc DEPS allocator_facade) diff --git a/paddle/fluid/memory/allocation/aligned_allocator.h b/paddle/fluid/memory/allocation/aligned_allocator.h index b536d4276e3b6236d0748eee588d345dd15c6954..064acd06e71da98802126913e0af843cfbf717e7 
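Widening Tensor::Slice from int to int64_t matters once a leading dimension can exceed INT_MAX: with int indices the end row would be narrowed and wrap negative before the last rows are reachable. A tiny illustration of the overflow the change avoids (the row count is hypothetical):

#include <cstdint>
#include <iostream>

int main() {
  int64_t rows = 3000000000LL;            // > INT_MAX, plausible for a huge 0-th dim
  int narrowed = static_cast<int>(rows);  // wraps to a negative value
  std::cout << rows << " vs " << narrowed << std::endl;
  return 0;
}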
100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.h +++ b/paddle/fluid/memory/allocation/aligned_allocator.h @@ -94,8 +94,6 @@ class AlignedAllocator : public ThinAlignedAllocator { underlying_allocator_->Allocate(size + kAlignment, attr); return new AlignedAllocation(std::move(raw_allocation), size); } - - void FreeImpl(Allocation* allocation) override { delete allocation; } }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/allocator.cc b/paddle/fluid/memory/allocation/allocator.cc index 5a5253d911abc722c026730e7e88eb326bb82afd..8fb8a5fb897a736d7515951ba08c633da9a7706c 100644 --- a/paddle/fluid/memory/allocation/allocator.cc +++ b/paddle/fluid/memory/allocation/allocator.cc @@ -27,24 +27,16 @@ bool Allocator::IsAllocThreadSafe() const { return false; } AllocationPtr Allocator::Allocate(size_t size, Allocator::Attr attr) { auto ptr = AllocateImpl(size, attr); - ptr->RegisterDecoratedAllocator(this); + ptr->set_allocator(this); return AllocationPtr(ptr); } -void Allocator::FreeImpl(Allocation* allocation) { - Allocator* allocator = allocation->TopDecoratedAllocator(); - allocator->Free(allocation); -} - -void Allocator::Free(Allocation* allocation) { - allocation->PopDecoratedAllocator(); - FreeImpl(allocation); -} +void Allocator::Free(Allocation* allocation) { delete allocation; } const char* BadAlloc::what() const noexcept { return msg_.c_str(); } void AllocationDeleter::operator()(Allocation* allocation) const { - Allocator* allocator = allocation->TopDecoratedAllocator(); + auto* allocator = allocation->allocator(); allocator->Free(allocation); } diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index 33b816b90812d7fedc450a67743b5d7d20579302..3465278935f7ce05456e94bb3a7d1ae9f114ff96 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -46,56 +46,13 @@ class Allocator; // NOTE: this is the base class of Allocation. Each allocator can use its own // allocation object. // NOTE: the `Allocation::ptr()` could be nullptr, if the allocation size is 0 - -/** - * Allocation is returned by Allocator::Allocate() method. - * - * An allocator may be decorated by another allocator. For example, we can - * decorate - * a RetryAllocator to any allocator to perform allocation retry when first - * allocation request fails. - * - * Explanations of Allocator design is as follows: - * - * Suppose we have an allocator which is decorated by several allocators: - * - * A(1) <- A(2) <- A(3) <- ... <- A(n) - * - * , and the public allocator is A(1). - * - * The allocation process would be: - * - * A(n).Allocate() -> ... -> A(2).Allocate() -> A(1).Allocate() - * - * , and the free process would be: - * - * A(1).Free() -> A(2).Free() -> ... -> A(n).Free() - * - * Therefore, we should record the allocator chain when allocating, so - * that we can free the allocation in the reverse order of allocator chain. - * The field `decorated_allocators_` is used to record this chain. - * - * Another example is that we want to add additional fields in Allocation, - * e.g., something what is done in AlignedAllocator, etc. - * In this case, we should declare a derived class of Allocation, which - * contains an underlying Allocation allocated by the underlying allocator. - * Therefore, `decorated_allocators_` of the new Allocation object would - * be a new chain, differing from the underlying Allocation object. 
- */ class Allocation { public: Allocation(void* ptr, size_t size, platform::Place place) - : ptr_(ptr), size_(size), place_(place) { - // NOTE(zjl): Since decorated_allocators_ is usually a small vector - // We reserve a small buffer to it to prevent frequent heap allocation - // Not quite sure whether we need something like gtl vector. - decorated_allocators_.reserve(8); - } + : allocator_(nullptr), ptr_(ptr), size_(size), place_(place) {} Allocation(const Allocation& o) = delete; Allocation& operator=(const Allocation& o) = delete; - Allocation(Allocation&& o) = delete; - Allocation& operator=(Allocation&& o) = delete; // Returns the holding pointer. // NOTE: For performance consideration, it is better not to make this method @@ -117,31 +74,17 @@ class Allocation { const platform::Place& place() const { return place_; } - virtual ~Allocation(); - - private: - const std::vector& DecoratedAllocators() const { - return decorated_allocators_; - } - - inline void RegisterDecoratedAllocator(Allocator* allocator) { - decorated_allocators_.push_back(allocator); - } + Allocator* allocator() { return allocator_; } - inline void PopDecoratedAllocator() { decorated_allocators_.pop_back(); } + void set_allocator(Allocator* allocator) { allocator_ = allocator; } - inline Allocator* TopDecoratedAllocator() { - return decorated_allocators_.back(); - } + virtual ~Allocation(); private: + Allocator* allocator_; void* ptr_; size_t size_; platform::Place place_; - std::vector decorated_allocators_; - - friend class Allocator; - friend class AllocationDeleter; }; using AllocationPtr = std::unique_ptr; @@ -191,12 +134,9 @@ class Allocator { // True if the `Allocate` is thread safe. virtual bool IsAllocThreadSafe() const; - // This function should not be called outside - void Free(Allocation* allocation); - protected: + virtual void Free(Allocation* allocation); virtual Allocation* AllocateImpl(size_t size, Allocator::Attr attr) = 0; - virtual void FreeImpl(Allocation* allocation); private: friend class AllocationDeleter; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 09328aded58cb0cccd9de0aba399f5c49313042f..a3b73e3ba31c89c2a94955b0fea64df4ab0ffc26 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -49,17 +49,6 @@ namespace paddle { namespace memory { namespace allocation { -static inline std::shared_ptr WrapRetryAllocator( - std::shared_ptr allocator, int64_t retry_time) { - if (retry_time > 0) { - auto* retry_allocator = - new RetryAllocator(std::move(allocator), retry_time); - allocator.reset(retry_allocator); - } - - return allocator; -} - // TODO(yy): Dirty code here. This class should be configurable in runtime. 
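With the decorated-allocator chain removed, an Allocation now records just the single Allocator that produced it, and AllocationDeleter routes destruction back through that allocator. A stripped-down sketch of this ownership scheme (the Mini* names are invented for illustration):

#include <cstdlib>
#include <memory>

struct MiniAllocation;

struct MiniAllocator {
  MiniAllocation* Allocate(size_t size);
  void Free(MiniAllocation* a);
};

struct MiniAllocation {
  void* ptr;
  size_t size;
  MiniAllocator* allocator;  // set by the allocator that created it
};

struct MiniDeleter {
  void operator()(MiniAllocation* a) const { a->allocator->Free(a); }
};

MiniAllocation* MiniAllocator::Allocate(size_t size) {
  return new MiniAllocation{std::malloc(size), size, this};
}

void MiniAllocator::Free(MiniAllocation* a) {
  std::free(a->ptr);
  delete a;
}

// The unique_ptr hands the allocation back to its creator instead of calling delete.
using MiniAllocationPtr = std::unique_ptr<MiniAllocation, MiniDeleter>;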
class CPUManagedAllocator : public Allocator { public: @@ -123,10 +112,14 @@ class ChunkedAllocator : public Allocator { std::shared_ptr CreateAllocatorWithChunk() { chunks_.emplace_back(raw_allocator_->Allocate(max_chunk_size_)); auto* allocation = chunks_.back().get(); - std::shared_ptr allocator(new LockedAllocator( - std::shared_ptr(new BestFitAllocator(allocation)))); + std::unique_ptr allocator(new LockedAllocator( + std::unique_ptr(new BestFitAllocator(allocation)))); - allocator = WrapRetryAllocator(allocator, retry_time_); + if (retry_time_ > 0) { + auto* retry_allocator = + new RetryAllocator(std::move(allocator), retry_time_); + allocator.reset(retry_allocator); + } return std::make_shared>(std::move(allocator)); } @@ -197,23 +190,13 @@ class AllocatorFacadePrivate { ~AllocatorFacadePrivate() = default; AllocatorFacadePrivate() { - auto strategy = GetAllocatorStrategy(); - switch (strategy) { - case AllocatorStrategy::kLegacy: { - InitLegacyAllocator(); - break; - } - case AllocatorStrategy::kNaiveBestFit: { - InitCPUAllocator(); - InitCUDAAllocator(); - InitCUDAPinnedAllocator(); - WrapZeroSizeAllocator(); - break; - } - default: { - PADDLE_THROW("Unsupported allocator strategy: %d", - static_cast(strategy)); - } + if (GetAllocatorStrategy() == AllocatorStrategy::kLegacy) { + InitLegacyAllocator(); + } else { + InitCPUAllocator(); + InitCUDAAllocator(); + InitCUDAPinnedAllocator(); + WrapZeroSizeAllocator(); } } @@ -271,7 +254,8 @@ AllocatorFacade& AllocatorFacade::Instance() { std::shared_ptr AllocatorFacade::AllocShared( const platform::Place& place, size_t size, Allocator::Attr attr) { - return std::shared_ptr(Alloc(place, size, attr)); + return std::shared_ptr(Alloc(place, size, attr).release(), + AllocationDeleter()); } AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, size_t size, diff --git a/paddle/fluid/memory/allocation/allocator_strategy.cc b/paddle/fluid/memory/allocation/allocator_strategy.cc index fff94c01e709613603eea7150a08df3c2611dec2..8cebda9005b29b5b3259de0830c42eb10ef90e66 100644 --- a/paddle/fluid/memory/allocation/allocator_strategy.cc +++ b/paddle/fluid/memory/allocation/allocator_strategy.cc @@ -19,22 +19,16 @@ DEFINE_string( allocator_strategy, "legacy", "The allocation strategy. Legacy means the original allocator of Fluid." - "naive_best_fit means the experimental best fit allocator. " - "allocator. Enum in [legacy, naive_best_fit]."); + "New means the experimental allocators of Fluid. in [legacy, new]"); namespace paddle { namespace memory { namespace allocation { static AllocatorStrategy GetStrategyFromFlag() { - if (FLAGS_allocator_strategy == "legacy") { - return AllocatorStrategy::kLegacy; - } else if (FLAGS_allocator_strategy == "naive_best_fit") { - return AllocatorStrategy::kNaiveBestFit; - } else { - PADDLE_THROW("Unsupported allocator strategy: %s", - FLAGS_allocator_strategy); - } + return FLAGS_allocator_strategy == "legacy" + ? 
AllocatorStrategy::kLegacy + : AllocatorStrategy::kNaiveBestFit; } AllocatorStrategy GetAllocatorStrategy() { diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc index d87dd9a4b6df288065389a335a9ddb4047dd096a..e3d6c2f511ef083ef9ecc1fe8df96051b2b85cc2 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc @@ -109,7 +109,7 @@ size_t BestFitAllocator::NumFreeChunks() const { } return num; } -void BestFitAllocator::FreeImpl(Allocation* allocation) { +void BestFitAllocator::Free(Allocation* allocation) { auto* bf_allocation = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(bf_allocation, "The input allocation is not BestFitAllocation."); diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.h b/paddle/fluid/memory/allocation/best_fit_allocator.h index c137438c0c35a575d366a1dfdf950262f711defa..4f10f2b53e8543d4197097f1cae8de765bceeb0f 100644 --- a/paddle/fluid/memory/allocation/best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/best_fit_allocator.h @@ -119,7 +119,7 @@ class BestFitAllocator : public Allocator { void InsertFreeNode(const ListIt& it); protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: diff --git a/paddle/fluid/memory/allocation/buffered_allocator.cc b/paddle/fluid/memory/allocation/buffered_allocator.cc index e04c0aa34b1cd6200806cc2a012161e3478eca0b..fc75abc9dfee6c9df5bc87faa493002cc1fe6298 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator.cc @@ -22,11 +22,11 @@ namespace paddle { namespace memory { namespace allocation { -BufferedAllocator::BufferedAllocator(std::shared_ptr allocator) +BufferedAllocator::BufferedAllocator(std::unique_ptr &&allocator) : underlying_allocator_(std::move(allocator)) { PADDLE_ENFORCE_NOT_NULL( underlying_allocator_, - "Underlying allocator of BufferedAllocator must not be null"); + "Underlying allocator of BufferedAllocator must be unmanaged"); if (underlying_allocator_->IsAllocThreadSafe()) { mtx_.reset(new std::mutex()); } @@ -41,19 +41,19 @@ void BufferedAllocator::FreeCache(size_t size) { while (!allocations_.empty()) { // free the largest auto it = --allocations_.end(); cur += it->second->size(); - underlying_allocator_->Free(it->second.release()); + delete it->second.release(); allocations_.erase(it); if (cur >= size) return; } } -bool BufferedAllocator::IsAllocThreadSafe() const { return mtx_ != nullptr; } - -void BufferedAllocator::FreeImpl(Allocation *allocation) { +bool BufferedAllocator::IsAllocThreadSafe() const { + return this->underlying_allocator_->IsAllocThreadSafe(); +} +void BufferedAllocator::Free(Allocation *allocation) { platform::LockGuardPtr guard(mtx_); allocations_.emplace(allocation->size(), AllocationPtr(allocation)); } - Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { { platform::LockGuardPtr guard(mtx_); @@ -61,15 +61,17 @@ Allocation *BufferedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { if (it != allocations_.end() && it->first < size * 2) { AllocationPtr result(std::move(it->second)); allocations_.erase(it); - return result.release(); + return new AllocationWithUnderlying(std::move(result)); } } try { - return underlying_allocator_->Allocate(size, attr).release(); + return new AllocationWithUnderlying( + 
underlying_allocator_->Allocate(size, attr)); } catch (BadAlloc &) { FreeCache(size); - return underlying_allocator_->Allocate(size, attr).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); } } diff --git a/paddle/fluid/memory/allocation/buffered_allocator.h b/paddle/fluid/memory/allocation/buffered_allocator.h index c728395705842d29a7b2a8441a7048a7e4bf5e6b..d44a3f85beba712b1e735ba14008689bce7d0d64 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator.h +++ b/paddle/fluid/memory/allocation/buffered_allocator.h @@ -31,7 +31,7 @@ namespace allocation { // underlying_allocator_ class BufferedAllocator : public Allocator { public: - explicit BufferedAllocator(std::shared_ptr allocator); + explicit BufferedAllocator(std::unique_ptr &&allocator); ~BufferedAllocator(); @@ -44,11 +44,11 @@ class BufferedAllocator : public Allocator { void FreeCache(size_t size); protected: - void FreeImpl(Allocation *allocation) override; + void Free(Allocation *allocation) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::shared_ptr underlying_allocator_; + std::unique_ptr underlying_allocator_; std::multimap allocations_; std::unique_ptr mtx_; }; diff --git a/paddle/fluid/memory/allocation/buffered_allocator_test.cc b/paddle/fluid/memory/allocation/buffered_allocator_test.cc index 854a117b0e7532962d5e0c95fd947527ac3b307a..c8bd5292ca0f6c3e7ebdc7f5908523b0b7c8ba3a 100644 --- a/paddle/fluid/memory/allocation/buffered_allocator_test.cc +++ b/paddle/fluid/memory/allocation/buffered_allocator_test.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/memory/allocation/buffered_allocator.h" #include +#include #include #include "paddle/fluid/memory/allocation/best_fit_allocator.h" #include "paddle/fluid/memory/allocation/cpu_allocator.h" @@ -65,7 +66,7 @@ class StubAllocator : public Allocator { size_t GetFreeCount() const { return destruct_count_; } protected: - void FreeImpl(Allocation *allocation) override { + void Free(Allocation *allocation) override { auto *alloc = dynamic_cast(allocation); PADDLE_ENFORCE_NOT_NULL(alloc); if (alloc->ptr()) delete[] static_cast(alloc->ptr()); diff --git a/paddle/fluid/memory/allocation/cpu_allocator.cc b/paddle/fluid/memory/allocation/cpu_allocator.cc index 90c49c87a677aa38bce35774b3a7bb698e6f43e7..cc81a6f7b8b1950b07b6fb1571b53d9b5ddb1b9f 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.cc +++ b/paddle/fluid/memory/allocation/cpu_allocator.cc @@ -20,27 +20,25 @@ namespace paddle { namespace memory { namespace allocation { +CPUAllocation::CPUAllocation(void *ptr, size_t size) + : Allocation(ptr, size, platform::CPUPlace()) {} + bool CPUAllocator::IsAllocThreadSafe() const { return true; } -void CPUAllocator::FreeImpl(Allocation *allocation) { - void *p = allocation->ptr(); -#ifdef _WIN32 - _aligned_free(p); -#else - free(p); -#endif +void CPUAllocator::Free(Allocation *allocation) { + PADDLE_ENFORCE_NOT_NULL(dynamic_cast(allocation)); + free(allocation->ptr()); delete allocation; } Allocation *CPUAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { - void *p; -#ifdef _WIN32 - p = _aligned_malloc(size, kAlignment); -#else - PADDLE_ENFORCE_EQ(posix_memalign(&p, kAlignment, size), 0, "Alloc %ld error!", - size); -#endif - return new Allocation(p, size, platform::CPUPlace()); + void *ptr; + auto status = posix_memalign(&ptr, kAlignment, size); + if (UNLIKELY(status) != 0) { + throw BadAlloc(string::Sprintf("Cannot allocate cpu memory %d. 
Errno is %d", + size, status)); + } + return new CPUAllocation(ptr, size); } } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 3eb1416b0efa9327f2052e1f128359bc93f94986..26d3643f4edff1f2d71b1c761e915a6dacb485ad 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -31,13 +31,19 @@ namespace allocation { // // NOTE(yy): It is no need to use `BestFitAllocator` in CPU. We can import // an open-sourced allocator into Paddle. +class CPUAllocator; +class CPUAllocation : public Allocation { + public: + CPUAllocation(void* ptr, size_t size); +}; + class CPUAllocator : public Allocator { public: - constexpr static size_t kAlignment = 4096UL; + constexpr static size_t kAlignment = 64u; bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index 895a24a6a2a6b8e399ec2ace48136d1ef16c62f6..430bf0be98e08787ac4412a8b6e0fcc310ffe2b4 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -23,14 +23,15 @@ namespace paddle { namespace memory { namespace allocation { bool CUDAAllocator::IsAllocThreadSafe() const { return true; } -void CUDAAllocator::FreeImpl(Allocation* allocation) { +void CUDAAllocator::Free(Allocation* allocation) { platform::CUDADeviceGuard guard(place_.device); - PADDLE_ENFORCE_EQ(boost::get(allocation->place()), + auto* cuda_allocation = dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(cuda_allocation); + PADDLE_ENFORCE_EQ(boost::get(cuda_allocation->place()), place_); PADDLE_ENFORCE(cudaFree(allocation->ptr())); delete allocation; } - Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { platform::CUDADeviceGuard guard(place_.device); void* ptr; @@ -40,9 +41,8 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { "Cannot allocate %d on GPU %d, cuda status %d, %s", size, place_.device, status, cudaGetErrorString(status))); } - return new Allocation(ptr, size, platform::Place(place_)); + return new CUDAAllocation(ptr, size, platform::Place(place_)); } - } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.h b/paddle/fluid/memory/allocation/cuda_allocator.h index 580a2d1df1d5997a27180740393741ec8973bf18..63726f5820b1c81565117c7a9bf798c17c9681f6 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_allocator.h @@ -20,6 +20,13 @@ namespace paddle { namespace memory { namespace allocation { +// CUDA System allocator and allocation. +// Just a flag type. 
+class CUDAAllocation : public Allocation { + public: + using Allocation::Allocation; +}; + class CUDAAllocator : public Allocator { public: explicit CUDAAllocator(const platform::CUDAPlace& place) : place_(place) {} @@ -28,7 +35,7 @@ class CUDAAllocator : public Allocator { bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 0dc2de37467b7e7d23c88b4a255c14795db4c275..514ac7883ad2effdf3518be8afe3f448a5ac10b2 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -134,22 +134,26 @@ size_t Used(const platform::CPUPlace &place) { } #ifdef PADDLE_WITH_CUDA -class GPUBuddyAllocatorList { - public: - GPUBuddyAllocatorList() - : allocators_(platform::GetCUDADeviceCount()), - flags_(platform::GetCUDADeviceCount()) { - allocation::GPUMemMonitor.Initialize(allocators_.size()); - } +BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { + static std::once_flag init_flag; + static detail::BuddyAllocator **a_arr = nullptr; + static std::vector devices; + + std::call_once(init_flag, [gpu_id]() { + devices = platform::GetSelectedDevices(); + int gpu_num = devices.size(); - BuddyAllocator *Get(size_t dev_id) { - PADDLE_ENFORCE(dev_id < flags_.size(), "Invalid device id %s", dev_id); - std::call_once(flags_[dev_id], [this, dev_id] { + allocation::GPUMemMonitor.Initialize(devices.size()); + + a_arr = new BuddyAllocator *[gpu_num]; + for (size_t i = 0; i < devices.size(); ++i) { + int dev_id = devices[i]; + a_arr[i] = nullptr; platform::SetDeviceId(dev_id); - allocators_[dev_id] = new BuddyAllocator( - std::unique_ptr( - new detail::GPUAllocator(dev_id)), - platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + a_arr[i] = new BuddyAllocator(std::unique_ptr( + new detail::GPUAllocator(dev_id)), + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); VLOG(10) << "\n\nNOTE:\n" << "You can set GFlags environment variable " @@ -163,19 +167,13 @@ class GPUBuddyAllocatorList { << FLAGS_initial_gpu_memory_in_mb << ". 
Current 'FLAGS_reallocate_gpu_memory_in_mb' value is " << FLAGS_reallocate_gpu_memory_in_mb << "\n\n"; - }); - return allocators_[dev_id]; - } - - private: - std::vector allocators_; - std::vector flags_; -}; + } + }); -BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { - static GPUBuddyAllocatorList allocators; platform::SetDeviceId(gpu_id); - return allocators.Get(gpu_id); + auto pos = std::distance(devices.begin(), + std::find(devices.begin(), devices.end(), gpu_id)); + return a_arr[pos]; } #endif @@ -194,7 +192,7 @@ void *Alloc(const platform::CUDAPlace &place, #ifdef PADDLE_WITH_CUDA auto *buddy_allocator = GetGPUBuddyAllocator(place.device); auto *ptr = buddy_allocator->Alloc(size); - if (ptr == nullptr && size > 0) { + if (ptr == nullptr) { int cur_dev = platform::GetCurrentDeviceId(); platform::SetDeviceId(place.device); size_t avail, total; @@ -349,7 +347,7 @@ Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { return tmp_alloc; } -void LegacyAllocator::FreeImpl(Allocation *allocation) { +void LegacyAllocator::Free(Allocation *allocation) { boost::apply_visitor( legacy::FreeVisitor(allocation->ptr(), allocation->size()), allocation->place()); diff --git a/paddle/fluid/memory/allocation/legacy_allocator.h b/paddle/fluid/memory/allocation/legacy_allocator.h index 27cd42ea35012f07ae7db79c46d767138ddaafff..d9bdae153da6439598f76f5cac226897e6e0c596 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.h +++ b/paddle/fluid/memory/allocation/legacy_allocator.h @@ -73,7 +73,7 @@ class LegacyAllocator : public Allocator { protected: Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; - void FreeImpl(Allocation *allocation) override; + void Free(Allocation *allocation) override; private: platform::Place place_; diff --git a/paddle/fluid/memory/allocation/locked_allocator.cc b/paddle/fluid/memory/allocation/locked_allocator.cc index c43099cc88f839ad92d36774d49aafd7192f916f..62d768c580607f32db8c49eb3d62f0f32c9dbeeb 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.cc +++ b/paddle/fluid/memory/allocation/locked_allocator.cc @@ -17,7 +17,6 @@ #include #include "paddle/fluid/memory/allocation/allocation_with_underlying.h" #include "paddle/fluid/platform/lock_guard_ptr.h" - namespace paddle { namespace memory { namespace allocation { @@ -25,24 +24,26 @@ namespace allocation { bool LockedAllocator::IsAllocThreadSafe() const { return true; } LockedAllocator::LockedAllocator( - std::shared_ptr underlying_allocator) + std::unique_ptr &&underlying_allocator) : underlying_allocator_(std::move(underlying_allocator)) { PADDLE_ENFORCE_NOT_NULL(underlying_allocator_); if (!underlying_allocator_->IsAllocThreadSafe()) { mtx_.reset(new std::mutex()); } } - -void LockedAllocator::FreeImpl(Allocation *allocation) { - platform::LockGuardPtr guard(mtx_); - underlying_allocator_->Free(allocation); +void LockedAllocator::Free(Allocation *allocation) { + { + platform::LockGuardPtr guard(mtx_); + reinterpret_cast(allocation) + ->allocation_.reset(); // Destroy inner allocation + } + delete allocation; } - Allocation *LockedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { platform::LockGuardPtr guard(mtx_); - return underlying_allocator_->Allocate(size, attr).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); } - } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/locked_allocator.h b/paddle/fluid/memory/allocation/locked_allocator.h 
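The rewritten GetGPUBuddyAllocator above initializes one BuddyAllocator per selected device exactly once, guarded by std::call_once. A generic sketch of that lazy, thread-safe one-time initialization (the init function is a placeholder):

#include <mutex>
#include <vector>

int ExpensiveInitForDevice(int dev_id) { return dev_id * 10; }  // placeholder work

int GetPerDeviceState(int dev_id, int device_count) {
  static std::once_flag init_flag;
  static std::vector<int>* states = nullptr;
  // The lambda runs exactly once, even if many threads call this concurrently.
  std::call_once(init_flag, [device_count]() {
    states = new std::vector<int>(device_count);
    for (int i = 0; i < device_count; ++i) {
      (*states)[i] = ExpensiveInitForDevice(i);
    }
  });
  return (*states)[dev_id];
}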
index b735ccef101417b3f880eb6dcdd9964cffbe875c..4967b9bb8d3ad101cff4657b0a45b49b76e2deb2 100644 --- a/paddle/fluid/memory/allocation/locked_allocator.h +++ b/paddle/fluid/memory/allocation/locked_allocator.h @@ -24,15 +24,15 @@ namespace allocation { // A allocator to make underlying allocator thread safe. class LockedAllocator : public Allocator { public: - explicit LockedAllocator(std::shared_ptr underlying_allocator); + explicit LockedAllocator(std::unique_ptr &&underlying_allocator); bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; + void Free(Allocation *allocation) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::shared_ptr underlying_allocator_; + std::unique_ptr underlying_allocator_; std::unique_ptr mtx_; }; diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc deleted file mode 100644 index 3334589a4beb407447cf89c173f6128654bb245a..0000000000000000000000000000000000000000 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator_facade_test.cc +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include "paddle/fluid/memory/allocation/allocator_facade.h" - -#ifdef PADDLE_WITH_CUDA -DECLARE_double(fraction_of_gpu_memory_to_use); -DECLARE_double(fraction_of_cuda_pinned_memory_to_use); -DECLARE_int64(gpu_allocator_retry_time); -#endif - -DECLARE_string(allocator_strategy); - -namespace paddle { -namespace memory { -namespace allocation { - -TEST(allocator, allocator) { -#ifdef PADDLE_WITH_CUDA - FLAGS_fraction_of_gpu_memory_to_use = 0.01; - FLAGS_gpu_allocator_retry_time = 500; - FLAGS_fraction_of_cuda_pinned_memory_to_use = 0.5; -#endif - - FLAGS_allocator_strategy = "naive_best_fit"; - - auto &instance = AllocatorFacade::Instance(); - platform::Place place; - size_t size = 1024; - - { - place = platform::CPUPlace(); - size = 1024; - auto cpu_allocation = instance.Alloc(place, size); - ASSERT_NE(cpu_allocation, nullptr); - ASSERT_NE(cpu_allocation->ptr(), nullptr); - ASSERT_EQ(cpu_allocation->place(), place); - ASSERT_EQ(cpu_allocation->size(), size); - } - -#ifdef PADDLE_WITH_CUDA - { - place = platform::CUDAPlace(0); - size = 1024; - auto gpu_allocation = instance.Alloc(place, size); - ASSERT_NE(gpu_allocation, nullptr); - ASSERT_NE(gpu_allocation->ptr(), nullptr); - ASSERT_EQ(gpu_allocation->place(), place); - ASSERT_GE(gpu_allocation->size(), size); - } - - { - // Allocate 2GB gpu memory - place = platform::CUDAPlace(0); - size = 2 * static_cast(1 << 30); - auto gpu_allocation = instance.Alloc(place, size); - ASSERT_NE(gpu_allocation, nullptr); - ASSERT_NE(gpu_allocation->ptr(), nullptr); - ASSERT_EQ(gpu_allocation->place(), place); - ASSERT_GE(gpu_allocation->size(), size); - } - - { - place = platform::CUDAPinnedPlace(); - size = (1 << 20); - auto cuda_pinned_allocation = - instance.Alloc(platform::CUDAPinnedPlace(), 1 << 20); - ASSERT_NE(cuda_pinned_allocation, nullptr); - ASSERT_NE(cuda_pinned_allocation->ptr(), nullptr); - ASSERT_EQ(cuda_pinned_allocation->place(), place); - ASSERT_GE(cuda_pinned_allocation->size(), size); - } -#endif -} - -} // namespace allocation -} // namespace memory -} // namespace paddle diff --git a/paddle/fluid/memory/allocation/pinned_allocator.cc b/paddle/fluid/memory/allocation/pinned_allocator.cc index 5a3d817211750d3e19e65344d1eab5a96800c674..de81d12cca6ca280289371abdec225c9e2b8f4d0 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/pinned_allocator.cc @@ -20,15 +20,20 @@ namespace paddle { namespace memory { namespace allocation { bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; } -void CPUPinnedAllocator::FreeImpl(Allocation *allocation) { +void CPUPinnedAllocator::Free(Allocation *allocation) { + PADDLE_ENFORCE_NOT_NULL(dynamic_cast(allocation)); PADDLE_ENFORCE(cudaFreeHost(allocation->ptr())); delete allocation; } Allocation *CPUPinnedAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { + // PADDLE_ENFORCE_EQ( + // attr, kCrossDevice, + // "CPUPinnedAllocator should be used for Cross-Device Communication"); + void *ptr; PADDLE_ENFORCE(cudaHostAlloc(&ptr, size, cudaHostAllocPortable)); - return new Allocation(ptr, size, platform::CUDAPinnedPlace()); + return new CPUPinnedAllocation(ptr, size); } } // namespace allocation } // namespace memory diff --git a/paddle/fluid/memory/allocation/pinned_allocator.h b/paddle/fluid/memory/allocation/pinned_allocator.h index deeb55a8fb0396a312286f5c2692114e9e4afc8d..42d0938f2afbb1efca8bfdd7035bc0eada30f06b 100644 --- a/paddle/fluid/memory/allocation/pinned_allocator.h +++ 
b/paddle/fluid/memory/allocation/pinned_allocator.h @@ -20,12 +20,18 @@ namespace memory { namespace allocation { // Allocator uses `cudaHostAlloc` +class CPUPinnedAllocation : public Allocation { + public: + CPUPinnedAllocation(void *ptr, size_t size) + : Allocation(ptr, size, platform::CUDAPinnedPlace()) {} +}; + class CPUPinnedAllocator : public Allocator { public: bool IsAllocThreadSafe() const override; protected: - void FreeImpl(Allocation *allocation) override; + void Free(Allocation *allocation) override; Allocation *AllocateImpl(size_t size, Allocator::Attr attr) override; }; diff --git a/paddle/fluid/memory/allocation/retry_allocator.cc b/paddle/fluid/memory/allocation/retry_allocator.cc index 7e888988f9602e362d73f64c1b45552e84e3349c..981705051b449e6a35c2dcce9138dc2efae52920 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.cc +++ b/paddle/fluid/memory/allocation/retry_allocator.cc @@ -18,15 +18,25 @@ namespace paddle { namespace memory { namespace allocation { -void RetryAllocator::FreeImpl(Allocation* allocation) { +bool RetryAllocator::IsAllocThreadSafe() const { + return underlying_allocator_->IsAllocThreadSafe(); +} + +void RetryAllocator::Free(Allocation* allocation) { // Delete underlying allocation first. - underlying_allocator_->Free(allocation); - cv_.notify_all(); + reinterpret_cast(allocation)->allocation_.reset(); + { + // notify all waited allocators, they can try to allocate memory after free. + std::lock_guard lock(mutex_); + cv_.notify_all(); + } + delete allocation; } Allocation* RetryAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { auto alloc_func = [&, this]() { - return underlying_allocator_->Allocate(size, attr).release(); + return new AllocationWithUnderlying( + underlying_allocator_->Allocate(size, attr)); }; // In fact, we can unify the code of allocation success and failure // But it would add lock even when allocation success at the first time diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 379f576d6e1ed8f256a0233b203423a487ee73e4..6ab8ca8fbec0077b2c95cf727731ca0095716197 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -25,25 +25,32 @@ namespace paddle { namespace memory { namespace allocation { +class RetryAllocator; + class RetryAllocator : public Allocator { public: - RetryAllocator(std::shared_ptr allocator, size_t retry_ms) + RetryAllocator(std::unique_ptr&& allocator, size_t retry_ms) : underlying_allocator_(std::move(allocator)), retry_time_(retry_ms) { + EnforceCheck(); + } + + bool IsAllocThreadSafe() const override; + + private: + void EnforceCheck() { PADDLE_ENFORCE_NOT_NULL( - underlying_allocator_, - "UnderlyingAllocator of RetryAllocator must not be null"); + underlying_allocator_.get(), + "UnderlyingAllocator of RetryAllocator must be UnmanagedAllocator"); PADDLE_ENFORCE(underlying_allocator_->IsAllocThreadSafe(), "UnderlyingAllocator of RetryAllocator must be thread-safe"); } - bool IsAllocThreadSafe() const override { return true; } - protected: - void FreeImpl(Allocation* allocation) override; + void Free(Allocation* allocation) override; Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; private: - std::shared_ptr underlying_allocator_; + std::unique_ptr underlying_allocator_; std::chrono::milliseconds retry_time_; std::mutex mutex_; std::condition_variable cv_; @@ -51,6 +58,8 @@ class RetryAllocator : public Allocator { // For debug, We can add an 
atomic integer to record how many memory sizes are // waited to allocate // std::atomic waited_allocate_size_{0}; + + friend class RetryAllocation; }; } // namespace allocation diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.cc b/paddle/fluid/memory/allocation/zero_size_allocator.cc index 39743bcb10c700c9a8446b9040c8a8707d57ec7d..cb2df1a029815478bbc9d3b09425f3ef145c5fb3 100644 --- a/paddle/fluid/memory/allocation/zero_size_allocator.cc +++ b/paddle/fluid/memory/allocation/zero_size_allocator.cc @@ -24,20 +24,11 @@ bool ZeroSizeAllocator::IsAllocThreadSafe() const { Allocation *ZeroSizeAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { if (size == 0) { - return new Allocation(nullptr, 0, place_); + return new ZeroSizeAllocation(place_); } else { return underlying_allocator_->Allocate(size, attr).release(); } } - -void ZeroSizeAllocator::FreeImpl(Allocation *allocation) { - if (allocation->size() == 0) { - delete allocation; - } else { - underlying_allocator_->Free(allocation); - } -} - } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/zero_size_allocator.h b/paddle/fluid/memory/allocation/zero_size_allocator.h index 08a7a06dbf290b55994a407fe478f792b0c0964a..0f01dfcdf5b1179c52d8c0204b655cab10770d95 100644 --- a/paddle/fluid/memory/allocation/zero_size_allocator.h +++ b/paddle/fluid/memory/allocation/zero_size_allocator.h @@ -24,6 +24,12 @@ namespace allocation { // The allocator handles the request's size is zero. Allocator will always // return an allocation even the request size is zero. However, the // allocation.ptr() is nullptr +class ZeroSizeAllocation : public Allocation { + public: + explicit ZeroSizeAllocation(const platform::Place& p) + : Allocation(nullptr, 0, p) {} +}; + class ZeroSizeAllocator : public Allocator { public: ZeroSizeAllocator(std::shared_ptr underlying_allocator, @@ -34,7 +40,6 @@ class ZeroSizeAllocator : public Allocator { protected: Allocation* AllocateImpl(size_t size, Allocator::Attr attr) override; - void FreeImpl(Allocation* allocation) override; private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/operators/alloc_continuous_space_op.cc b/paddle/fluid/operators/alloc_continuous_space_op.cc index df0e9911cf7186e952cfd7fbf7f43889e9098c84..d4bdecff62c016a31011266a0f066076d85fcdef 100644 --- a/paddle/fluid/operators/alloc_continuous_space_op.cc +++ b/paddle/fluid/operators/alloc_continuous_space_op.cc @@ -65,7 +65,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { // Get numel and dtype size_t numel = 0; auto dtype = kDefaultDtype; - GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype); + GetMemSizeAndDtype(in_tensors, in_var_names, &numel, &dtype, + context.GetPlace()); // Alloc the continuous space auto fused_tensor = context.Output("FusedOutput"); @@ -74,14 +75,18 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { // Init the continuous space auto out_tensors = context.MultiOutput("Output"); - int64_t offset = 0; + size_t offset = 0; + size_t size_of_dtype = framework::SizeOfType(dtype); if (context.Attr("copy_data")) { for (size_t i = 0; i < in_var_names.size(); ++i) { - int64_t len = out_tensors[i]->numel(); - auto sub_tensor = fused_tensor->Slice(offset, offset + len); - offset += len; - framework::TensorCopy(*out_tensors[i], context.GetPlace(), dev_ctx, + size_t len = static_cast(in_tensors[i]->numel()); + auto sub_tensor = fused_tensor->Slice( + static_cast(offset), static_cast(offset + len)); + 
framework::TensorCopy(*in_tensors[i], context.GetPlace(), dev_ctx, &sub_tensor); + + offset += + Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype; } } else if (context.Attr("set_constant")) { math::SetConstant set_constant; @@ -92,11 +97,13 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { // Make the outputs point to the continuous space. offset = 0; for (size_t i = 0; i < out_tensors.size(); ++i) { - int64_t len = out_tensors[i]->numel(); + size_t len = static_cast(out_tensors[i]->numel()); auto dim = out_tensors[i]->dims(); out_tensors[i] - ->ShareDataWith(fused_tensor->Slice(offset, offset + len)) + ->ShareDataWith(fused_tensor->Slice( + static_cast(offset), static_cast(offset + len))) .Resize(dim); + len = Alignment(len * size_of_dtype, context.GetPlace()) / size_of_dtype; offset += len; VLOG(10) << "alloc_space_for_vars: output(" << out_var_names[i] << ") ,dim:(" << dim << ")" @@ -104,12 +111,28 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { } } + private: + // Note(zcd): Addresses should be aligned, otherwise, the results may have + // diff. + size_t Alignment(size_t size, const platform::Place &place) const { + // Allow to allocate the minimum chunk size is 4 KB. + size_t alignment = 1 << 12; + if (platform::is_gpu_place(place)) { + // Allow to allocate the minimum chunk size is 256 B. + alignment = 1 << 8; + } + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); + } + void GetMemSizeAndDtype( const std::vector &lod_tensors, const std::vector var_names, size_t *numel, - framework::proto::VarType::Type *dtype) const { + framework::proto::VarType::Type *dtype, + const platform::Place &place) const { PADDLE_ENFORCE_EQ(lod_tensors.size(), var_names.size()); *numel = 0; + size_t size_of_dtype = 0; for (size_t i = 0; i < var_names.size(); ++i) { PADDLE_ENFORCE(lod_tensors[i]->IsInitialized(), "%s is not initialized.", var_names[i]); @@ -119,6 +142,7 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { PADDLE_ENFORCE_NE(p_dtype, kDefaultDtype, "%s's type should not be %s.", var_names[i], kDefaultDtype); *dtype = p_dtype; + size_of_dtype = framework::SizeOfType(p_dtype); } PADDLE_ENFORCE_EQ(p_dtype, *dtype, "Input vars is not equal."); @@ -126,7 +150,8 @@ class AllocContinuousSpaceKernel : public framework::OpKernel { PADDLE_ENFORCE_GT(size, 0); VLOG(10) << "alloc_space_for_vars: input(" << var_names[i] << ") ,dim:(" << lod_tensors[i]->dims() << ")"; - *numel += size; + *numel += Alignment(static_cast(size) * size_of_dtype, place) / + size_of_dtype; } } }; diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index f349c51d8a99aaab43a15580a8904d4e4fd0d9b7..b2dbaecfcfd67cc679d02e22d4e89cfedeeba80c 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
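For reference, the Alignment() helper added to alloc_continuous_space_op.cc above rounds every tensor's byte size up to the allocator's minimum chunk, 4 KB on CPU and 256 B on GPU, before converting it back into an element offset into the fused buffer; as the Note(zcd) comment says, unaligned sub-tensor addresses can change numerical results. A minimal standalone sketch of that rounding rule, with illustrative sizes that are not taken from the diff:

#include <cassert>
#include <cstddef>

// Round a byte size up to the next multiple of `alignment` (sketch of the
// Alignment() rule above; 1 << 12 for CPU, 1 << 8 for GPU in this diff).
static std::size_t AlignUp(std::size_t bytes, std::size_t alignment) {
  std::size_t remaining = bytes % alignment;
  return remaining == 0 ? bytes : bytes + (alignment - remaining);
}

int main() {
  const std::size_t size_of_dtype = 4;            // float32
  const std::size_t bytes = 100 * size_of_dtype;  // a 100-element tensor
  // CPU: 400 B rounds up to 4096 B, so the next tensor starts 1024 elements on.
  assert(AlignUp(bytes, 1 << 12) / size_of_dtype == 1024);
  // GPU: 400 B rounds up to 512 B, i.e. a stride of 128 elements.
  assert(AlignUp(bytes, 1 << 8) / size_of_dtype == 128);
  return 0;
}

Both GetMemSizeAndDtype() and the output-slicing loops above apply the same rounding, so the size reserved for the fused buffer and the per-tensor offsets stay consistent.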
*/ #include "paddle/fluid/operators/bpr_loss_op.h" +#include namespace paddle { namespace operators { @@ -127,6 +128,23 @@ neural networks>(https://arxiv.org/abs/1511.06939) )DOC"); } }; + +class BprLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("bpr_loss_grad"); + op->SetInput("X", Input("X")); + op->SetInput("Label", Input("Label")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; } // namespace operators } // namespace paddle @@ -134,7 +152,7 @@ namespace ops = paddle::operators; using CPUCtx = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::BprLossGradDescMaker); REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp); REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel, ops::BprLossOpKernel); diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index a97828e6fe9cf3ed963da3c784a975f61ecec4a5..5b84221cfa5902d01540a06c6bc61fe9eac986f0 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -568,13 +569,31 @@ class ROIPerspectiveTransformOpMaker } }; +class ROIPerspectiveTransformGradDescMaker + : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("roi_perspective_transform_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(roi_perspective_transform, ops::ROIPerspectiveTransformOp, ops::ROIPerspectiveTransformOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ROIPerspectiveTransformGradDescMaker); REGISTER_OPERATOR(roi_perspective_transform_grad, ops::ROIPerspectiveTransformGradOp); REGISTER_OP_CPU_KERNEL(roi_perspective_transform, diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 7aaa607f1585c98fe2dd816e8d66e5c6fd171e80..6a6741d8fc54d22addca91b75dfabf5950c1a35a 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -77,7 +77,8 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { } else { functor.RunMidWise(n, pre, post); } - z->set_mkldnn_prim_desc(x->get_mkldnn_prim_desc()); + z->set_layout(DataLayout::kMKLDNN); + z->set_format(x->format()); } else { PADDLE_ENFORCE(x->layout() == DataLayout::kMKLDNN && 
x->format() != memory::format::format_undef, @@ -115,8 +116,7 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_pd); // create mkldnn memory for dst - auto dst_mem_pd = sum_pd.dst_primitive_desc(); - memory dst_memory = memory(dst_mem_pd, z_data); + memory dst_memory = memory(sum_pd.dst_primitive_desc(), z_data); std::vector inputs; inputs.push_back(srcs[0]); @@ -129,7 +129,9 @@ class EltwiseAddMKLDNNKernel : public framework::OpKernel { pipeline.push_back(sum_prim); stream(stream::kind::eager).submit(pipeline).wait(); - z->set_mkldnn_prim_desc(dst_mem_pd); + z->set_layout(DataLayout::kMKLDNN); + z->set_format( + (memory::format)dst_memory.get_primitive_desc().desc().data.format); } } }; @@ -150,19 +152,24 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { auto* out = dout; auto *x = dout, *y = dout; + auto set_mkldnn_format = [](Tensor* in, const Tensor* out) { + in->set_layout(DataLayout::kMKLDNN); + in->set_format(out->format()); + }; + if (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()) { if (dx->dims() == dy->dims()) { auto blas = math::GetBlas(ctx); if (dx) { blas.VCOPY(dout->numel(), dout->data(), dx->mutable_data(ctx.GetPlace())); - dx->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc()); + set_mkldnn_format(dx, dout); } if (dy) { blas.VCOPY(dout->numel(), dout->data(), dy->mutable_data(ctx.GetPlace())); - dy->set_mkldnn_prim_desc(dout->get_mkldnn_prim_desc()); + set_mkldnn_format(dy, dout); } } } else { diff --git a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc index 4a974281481c8bc02589b428098475d73b8a0ba5..98ebe1fdf4bb3308b2f07a073072031e79e14146 100644 --- a/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc +++ b/paddle/fluid/operators/gaussian_random_batch_size_like_op.cc @@ -65,11 +65,17 @@ by input arguments. } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + GaussianRandomBatchSizeLikeNoNeedBufferVarsInference, "Input"); + } // namespace operators } // namespace paddle -REGISTER_OP_WITHOUT_GRADIENT( +REGISTER_OPERATOR( gaussian_random_batch_size_like, paddle::operators::GaussianRandomBatchSizeLikeOp, - paddle::operators::GaussianRandomBatchSizeLikeOpMaker); + paddle::operators::GaussianRandomBatchSizeLikeOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::operators::GaussianRandomBatchSizeLikeNoNeedBufferVarsInference); + // Kernels are registered in gaussian_random_op.cc and gaussian_random_op.cu diff --git a/paddle/fluid/operators/im2sequence_op.cc b/paddle/fluid/operators/im2sequence_op.cc index 8efd43928aac994c7630a213f6724e8f50abc7e0..44fd95edef253b814a166f724ca67fcafe979b99 100644 --- a/paddle/fluid/operators/im2sequence_op.cc +++ b/paddle/fluid/operators/im2sequence_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
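The BprLoss and ROIPerspectiveTransform changes above follow a pattern repeated for most operators in this diff: the generic DefaultGradOpDescMaker, which wires every forward input, output and output gradient into the grad op, is replaced by a hand-written SingleGradOpDescMaker that lists only the variables the backward kernel actually reads, so unused forward buffers no longer have to be kept alive for the backward pass. A minimal sketch of the pattern for a hypothetical operator my_op whose gradient needs only X and Out@GRAD (the op name and header choice are assumptions, not part of the diff):

#include <memory>
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

class MyOpGradDescMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;

 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
    op->SetType("my_op_grad");
    // Only what the backward kernel reads: the forward input and dOut.
    op->SetInput("X", Input("X"));
    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    op->SetAttrMap(Attrs());
    return op;
  }
};

}  // namespace operators
}  // namespace paddle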
*/ #include "paddle/fluid/operators/im2sequence_op.h" +#include #include #include @@ -146,12 +147,28 @@ class Im2SequenceGradOp : public framework::OperatorWithKernel { } }; +class Im2SequenceGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("im2sequence_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::Im2SequenceGradDescMaker); REGISTER_OPERATOR(im2sequence_grad, ops::Im2SequenceGradOp); REGISTER_OP_CPU_KERNEL( im2sequence, diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 10d01af982d01800bdd2d5d59761cfb09e2a8139..edee8c08d070742d54f761083592466658a445c9 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -10,6 +10,7 @@ limitations under the License. */ #include "paddle/fluid/operators/interpolate_op.h" +#include #include #include #include "paddle/fluid/framework/op_registry.h" @@ -194,21 +195,46 @@ class InterpolateOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.GetPlace()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.GetPlace()); + } +}; + +class InterpolateGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType(ForwardOp().Type() + "_grad"); + op->SetInput("X", Input("X")); + if (ForwardOp().Inputs().count("OutSize") > 0) { + op->SetInput("OutSize", Input("OutSize")); + } + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(InterpolateGradNoNeedBufferVarsInference, + "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad); + ops::InterpolateGradDescMaker); +REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad, + ops::InterpolateGradNoNeedBufferVarsInference); REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad); + ops::InterpolateGradDescMaker); +REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad, + ops::InterpolateGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel, ops::InterpolateKernel, ops::InterpolateKernel); diff --git a/paddle/fluid/operators/l1_norm_op.cc b/paddle/fluid/operators/l1_norm_op.cc index 
bc115090acb473ac3175999ca96c5e00c0aeaeae..2696d0bef9e322fce1251984c9e0f5b7429eeea8 100644 --- a/paddle/fluid/operators/l1_norm_op.cc +++ b/paddle/fluid/operators/l1_norm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/l1_norm_op.h" +#include namespace paddle { namespace operators { @@ -62,12 +63,28 @@ $$Out = \sum{|X|}$$ } }; +class L1NormGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("l1_norm_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::L1NormGradDescMaker); REGISTER_OPERATOR(l1_norm_grad, ops::L1NormGradOp); REGISTER_OP_CPU_KERNEL( l1_norm, ops::L1NormKernel); diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index da59bd53bce010d0d6ad2ab14acaffb9cc2f99e6..6d0af573184b10a783f9c5802d1db3630eb55538 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/label_smooth_op.h" +#include #include namespace paddle { @@ -105,10 +106,23 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) shouldn't be null."); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); + } +}; + +class LabelSmoothGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("label_smooth_grad"); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; } }; @@ -117,7 +131,7 @@ class LabelSmoothGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OPERATOR(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::LabelSmoothGradDescMaker); REGISTER_OPERATOR(label_smooth_grad, ops::LabelSmoothGradOp); REGISTER_OP_CPU_KERNEL( label_smooth, diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index e17b6cb59898524d793f3cc78a09232f5b664617..fa09cb61e64aacd2aebf1ecf9826a15f9dcef877 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions 
and limitations under the License. */ #include "paddle/fluid/operators/linear_chain_crf_op.h" +#include namespace paddle { namespace operators { @@ -250,14 +251,46 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { } }; +class LinearChainCRFGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("linear_chain_crf_grad"); + op->SetAttrMap(Attrs()); + + op->SetInput("Emission", Input("Emission")); + op->SetInput("Transition", Input("Transition")); + op->SetInput("Label", Input("Label")); + + op->SetInput("Alpha", Output("Alpha")); + op->SetInput("EmissionExps", Output("EmissionExps")); + op->SetInput("TransitionExps", Output("TransitionExps")); + + op->SetInput(framework::GradVarName("LogLikelihood"), + OutputGrad("LogLikelihood")); + + op->SetOutput(framework::GradVarName("Emission"), InputGrad("Emission")); + op->SetOutput(framework::GradVarName("Transition"), + InputGrad("Transition")); + + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE( + LinearChainCRFGradNoNeedBufferVarsInference, "Transition", "Emission"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp, - ops::LinearChainCRFOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp); + ops::LinearChainCRFOpMaker, ops::LinearChainCRFGradDescMaker); +REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp, + ops::LinearChainCRFGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( linear_chain_crf, ops::LinearChainCRFOpKernel, diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc index ef1fb83aa6e34c14637b6e761fd7d2dbadee36b8..e8850a1e582dc5c0a9ad64d26ba9b824349ee4e3 100644 --- a/paddle/fluid/operators/log_loss_op.cc +++ b/paddle/fluid/operators/log_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
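The DECLARE_NO_NEED_BUFFER_VARS_INFERENCE lines above (used here for linear_chain_crf, and elsewhere in this diff for interpolate, gaussian_random_batch_size_like, mean and scatter) appear to mark grad-op inputs whose tensor contents are never read by the backward kernel, only metadata such as dims and dtype, which lets the memory-optimization passes drop or reuse those buffers. A sketch of how the declaration pairs with operator registration, again for a hypothetical my_op (all names are illustrative):

// Declare that the grad op reads only the metadata of "X", not its buffer.
DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(MyOpGradNoNeedBufferVarsInference, "X");

// Registration then attaches the inference class to the grad op:
REGISTER_OPERATOR(my_op, ops::MyOp, ops::MyOpMaker, ops::MyOpGradDescMaker);
REGISTER_OPERATOR(my_op_grad, ops::MyOpGradOp,
                  ops::MyOpGradNoNeedBufferVarsInference);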
*/ #include "paddle/fluid/operators/log_loss_op.h" +#include namespace paddle { namespace operators { @@ -100,12 +101,29 @@ class LogLossGradOp : public framework::OperatorWithKernel { } }; +class LogLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("log_loss_grad"); + op->SetInput("Predicted", Input("Predicted")); + op->SetInput("Labels", Input("Labels")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + op->SetOutput(framework::GradVarName("Predicted"), InputGrad("Predicted")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(log_loss, ops::LogLossOp, ops::LogLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::LogLossGradDescMaker); REGISTER_OPERATOR(log_loss_grad, ops::LogLossGradOp); REGISTER_OP_CPU_KERNEL( log_loss, ops::LogLossKernel); diff --git a/paddle/fluid/operators/lstm_op.cc b/paddle/fluid/operators/lstm_op.cc index 4a199d681f328318401e3aec9457d59b959a9e0c..52e4e8be28746d42ebbda9a5148a9495d0d80c6a 100644 --- a/paddle/fluid/operators/lstm_op.cc +++ b/paddle/fluid/operators/lstm_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lstm_op.h" +#include #include namespace paddle { @@ -264,12 +265,51 @@ class LSTMGradOp : public framework::OperatorWithKernel { } }; +class LSTMGradOpDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("lstm_grad"); + op->SetAttrMap(Attrs()); + op->SetInput("Input", Input("Input")); + op->SetOutput(framework::GradVarName("Input"), InputGrad("Input")); + + if (ForwardOp().Inputs().count("H0") > 0) { + op->SetInput("H0", Input("H0")); + op->SetOutput(framework::GradVarName("H0"), InputGrad("H0")); + } + + if (ForwardOp().Inputs().count("C0") > 0) { + op->SetInput("C0", Input("C0")); + op->SetOutput(framework::GradVarName("C0"), InputGrad("C0")); + } + + op->SetInput("Weight", Input("Weight")); + op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight")); + + op->SetInput("Bias", Input("Bias")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + + op->SetInput("Cell", Output("Cell")); + + op->SetInput("Hidden", Output("Hidden")); + op->SetInput(framework::GradVarName("Hidden"), OutputGrad("Hidden")); + + op->SetInput("BatchGate", Output("BatchGate")); + op->SetInput("BatchCellPreAct", Output("BatchCellPreAct")); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(lstm, ops::LSTMOp, ops::LSTMOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::LSTMGradOpDescMaker); REGISTER_OPERATOR(lstm_grad, ops::LSTMGradOp); REGISTER_OP_CPU_KERNEL( lstm, ops::LSTMKernel, diff --git a/paddle/fluid/operators/margin_rank_loss_op.cc b/paddle/fluid/operators/margin_rank_loss_op.cc index b643ba9d7fa61d758e871ebe7a463c22e937fa2c..fca3532551730a39bda7cfad60151de97ef881de 100644 --- a/paddle/fluid/operators/margin_rank_loss_op.cc +++ b/paddle/fluid/operators/margin_rank_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language 
governing permissions and limitations under the License. */ #include "paddle/fluid/operators/margin_rank_loss_op.h" +#include namespace paddle { namespace operators { @@ -94,8 +95,6 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null."); - PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) shouldn't be null."); PADDLE_ENFORCE(ctx->HasInput("Activated"), @@ -106,13 +105,31 @@ class MarginRankLossGradOp : public framework::OperatorWithKernel { } }; +class MarginRankLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("margin_rank_loss_grad"); + op->SetInput("Activated", Output("Activated")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetInput("Label", Input("Label")); + op->SetOutput(framework::GradVarName("X1"), InputGrad("X1")); + op->SetOutput(framework::GradVarName("X2"), InputGrad("X2")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(margin_rank_loss, ops::MarginRankLossOp, ops::MarginRankLossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::MarginRankLossGradDescMaker); REGISTER_OPERATOR(margin_rank_loss_grad, ops::MarginRankLossGradOp); REGISTER_OP_CPU_KERNEL( margin_rank_loss, diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 35b6d7b5e3b16ced845a9dca619539d7753c55e6..2b2f8450768b9885381f10b19631a6a200c7f703 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
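The margin_rank_loss change above illustrates why the extra grad inputs can be dropped: assuming the op's usual forward definition Out = max(0, -Label * (X1 - X2) + margin), with Activated recording where that expression is positive, the gradients are dX1 = -Label * Activated * dOut and dX2 = +Label * Activated * dOut, so X1 and X2 themselves are never read in the backward pass. A tiny scalar check of that rule (values are made up):

#include <cassert>

int main() {
  const float x1 = 0.2f, x2 = 0.5f, label = 1.0f, margin = 0.1f, dout = 1.0f;
  const float pre = -label * (x1 - x2) + margin;     // 0.4, so the hinge is active
  const float activated = pre > 0.0f ? 1.0f : 0.0f;  // what the forward op saves
  const float dx1 = -label * activated * dout;
  const float dx2 = label * activated * dout;
  assert(dx1 == -1.0f && dx2 == 1.0f);  // x1 and x2 are not needed at this point
  return 0;
}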
*/ #include "paddle/fluid/operators/mean_op.h" +#include #include +#include + namespace paddle { namespace operators { @@ -61,7 +64,8 @@ class MeanGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - auto input_data_type = ctx.Input("X")->type(); + auto input_data_type = + ctx.Input(framework::GradVarName("Out"))->type(); return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -81,13 +85,16 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(MeanGradNoNeedBufferVarsInference, "X"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType, ops::MeanGradMaker); -REGISTER_OPERATOR(mean_grad, ops::MeanGradOp); +REGISTER_OPERATOR(mean_grad, ops::MeanGradOp, + ops::MeanGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL( mean, ops::MeanKernel, ops::MeanKernel); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 43559940d925e6fff29f0c5c66ec1a3dc717aaf4..5b7505f3c4acdef94fead04efd00b47825274117 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -96,7 +96,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx, std::vector src_tz = framework::vectorize2int(x->dims()); - auto src_format = x->format(); + auto src_format = + src_tz.size() == 2 ? mkldnn::memory::format::nc : x->format(); const std::string key = gethash(src_tz, algorithm); const std::string key_src_data = @@ -126,8 +127,10 @@ void eltwise_forward(const framework::ExecutionContext &ctx, if (p_fwd == nullptr) { // create mkldnn memory for input X + auto src_md = platform::MKLDNNMemDesc( + src_tz, platform::MKLDNNGetDataType(), src_format); auto src_memory = std::shared_ptr( - new memory(x->get_mkldnn_prim_desc(), to_void_cast(x_data))); + new memory({src_md, mkldnn_engine}, to_void_cast(x_data))); // save src_memory to be referred in backward path dev_ctx.SetBlob(key_src_mem, src_memory); @@ -174,7 +177,8 @@ void eltwise_forward(const framework::ExecutionContext &ctx, pipeline.push_back(*p_fwd); stream(stream::kind::eager).submit(pipeline).wait(); - y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc()); + y->set_layout(DataLayout::kMKLDNN); + y->set_format(GetMKLDNNFormat(*dst_memory)); } template @@ -192,6 +196,9 @@ void eltwise_grad(const framework::ExecutionContext &ctx, std::vector diff_dst_tz = framework::vectorize2int(diff_y->dims()); + auto diff_y_format = + diff_dst_tz.size() == 2 ? 
mkldnn::memory::format::nc : diff_y->format(); + const std::string key = gethash(diff_dst_tz, algorithm); const std::string key_src_data = key + ctx.op().Input("Out") + "@eltwise_fwd_src_data"; @@ -203,8 +210,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx, key + std::to_string(*p_src_layout) + "@eltwise_fwd_src_mem"; const std::string key_fwd_pd = key + std::to_string(*p_src_layout) + "@eltwise_fwd_pd"; - const std::string key_with_layouts = key + std::to_string(*p_src_layout) + - "-" + std::to_string(diff_y->format()); + const std::string key_with_layouts = + key + std::to_string(*p_src_layout) + "-" + std::to_string(diff_y_format); const std::string key_diff_src_mem = key_with_layouts + "@eltwise_diff_src_mem"; const std::string key_diff_dst_mem = @@ -227,8 +234,10 @@ void eltwise_grad(const framework::ExecutionContext &ctx, if (p_grad == nullptr) { // create mkldnn memory for input diff_y + auto diff_dst_md = platform::MKLDNNMemDesc( + diff_dst_tz, platform::MKLDNNGetDataType(), diff_y_format); auto diff_dst_memory = std::shared_ptr( - new memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data))); + new memory({diff_dst_md, mkldnn_engine}, to_void_cast(diff_y_data))); dev_ctx.SetBlob(key_diff_dst_mem, diff_dst_memory); // retrieve eltwise primitive desc from device context @@ -272,7 +281,8 @@ void eltwise_grad(const framework::ExecutionContext &ctx, pipeline.push_back(*p_grad); stream(stream::kind::eager).submit(pipeline).wait(); - diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format(GetMKLDNNFormat(*diff_src_memory)); } template diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index 04e45d4853907bb7d6b5ce362892a2183fd4b60e..bddca232e6c8a2a7fde998877006e37ee6d3d0dc 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -206,14 +206,17 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor + mkldnn::memory::format input_format = + platform::MKLDNNFormatForSize(src_tz.size(), x->format()); // keys for backward pass const std::string key = BatchNormMKLDNNHandler::GetHash( - src_tz, epsilon, flags, global_stats, x->format(), + src_tz, epsilon, flags, global_stats, input_format, ctx.op().Output("SavedMean")); const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; - auto user_src_md = x->get_mkldnn_prim_desc().desc(); + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), input_format); // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; @@ -227,8 +230,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine, key); - auto src_memory = handler.AcquireSrcMemory(x->get_mkldnn_prim_desc(), - to_void_cast(x_data)); + auto src_memory = + handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data)); // crate mkldnn memory for weights(scale/shift) auto scaleshift_memory = @@ -262,7 +265,8 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { variance_memory, false); } - y->set_mkldnn_prim_desc(dst_memory->get_primitive_desc()); + y->set_layout(DataLayout::kMKLDNN); + y->set_format(platform::GetMKLDNNFormat(*dst_memory)); std::vector pipeline; 
pipeline.push_back(*batch_norm_p); @@ -332,6 +336,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { using bn_bwd_types = bn_type_traits; + mkldnn::memory::format dst_format = + platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); + mkldnn::memory::format input_format = platform::MKLDNNFormatForSize(src_tz.size(), x->format()); @@ -339,14 +346,14 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { // keys from forward pass const std::string key = BatchNormMKLDNNHandler::GetHash( - src_tz, epsilon, flags, false, x->format(), + src_tz, epsilon, flags, false, input_format, ctx.op().Input("SavedMean")); const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd"; // keys for primitives reuse const std::string key_with_hash = key + BatchNormMKLDNNHandler::GetHash(src_tz, epsilon, flags, false, - x->format()); + input_format); const std::string key_batch_norm_bwd_p = key_with_hash + "@batch_norm_bwd_p"; const std::string key_batch_norm_src_mem_p = @@ -366,8 +373,9 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { primitive reorder_diff_dst; bool is_diff_dst_reordered = false; - auto user_diff_dst_memory = - memory(diff_y->get_mkldnn_prim_desc(), to_void_cast(diff_y_data)); + auto user_diff_dst_memory = memory( + {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, + to_void_cast(diff_y_data)); // MKLDNN requires a single piece of memory for scale and shift/bias data const size_t scaleshift_size = 2 * ic; @@ -451,7 +459,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { dev_ctx.SetBlob(key_batch_norm_diff_dst_mem_p, diff_dst_memory); // set layout/format of output tensors - diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format); } else { // primitives already exist UpdateMemoryData(dev_ctx, key_batch_norm_src_mem_p, to_void_cast(x_data)); @@ -476,7 +487,10 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { } // set layout/format of output tensors - diff_x->set_mkldnn_prim_desc(diff_src_memory->get_primitive_desc()); + diff_x->set_layout(DataLayout::kMKLDNN); + diff_x->set_format((memory::format)diff_src_memory->get_primitive_desc() + .desc() + .data.format); } // execute optional reorder and batch_norm backward primitive diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 97387af92ffbd123ae6e795f17ef2273dadeab9d..50fe2e6e4c5a5e3e0ed1d9a9827e75094454c2fc 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -210,7 +210,8 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { stream(stream::kind::eager).submit({*concat_p}).wait(); - output->set_mkldnn_prim_desc(concat_pd->dst_primitive_desc()); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetDstMemFormat(*concat_pd)); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 8d96ae7e4215c2488564322e1dda46a81b46a665..5e4d79f1c35af42f662711ae9d8bfc650bab2b4f 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -96,8 +96,12 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto* bias = ctx.HasInput("Bias") 
? ctx.Input("Bias") : nullptr; auto* output = ctx.Output("Output"); - PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN); - PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN); + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, @@ -144,19 +148,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector pipeline; - // For convolution with groups we need to recreate primitive descriptor - // as Paddle tensor is not having group dims while mkldnn treats - // group as another dimensions - mkldnn::memory::primitive_desc user_weights_mpd = - filter->get_mkldnn_prim_desc(); - if (g > 1) { - mkldnn::memory::format weights_format = - GetWeightsFormat(filter->format(), g, is_conv3d); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), weights_format); - user_weights_mpd = - mkldnn::memory::primitive_desc(user_weights_md, mkldnn_engine); - } + auto src_format = input->format(); + mkldnn::memory::format weights_format = + GetWeightsFormat(filter->format(), g, is_conv3d); + + auto user_src_md = platform::MKLDNNMemDesc( + {src_tz}, platform::MKLDNNGetDataType(), src_format); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), weights_format); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -166,7 +165,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); - mkldnn::memory::format weights_format = mkldnn::memory::format::any; + weights_format = mkldnn::memory::format::any; // Check the format for user's special output if (chosen_memory_format != mkldnn::memory::format::any) { if (is_conv3d) { @@ -206,10 +205,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key); // create mkldnn memory from input tensors (data/weights) - auto user_src_memory_p = handler.AcquireSrcMemory( - input->get_mkldnn_prim_desc(), to_void_cast(input_data)); + auto user_src_memory_p = + handler.AcquireSrcMemory(user_src_md, to_void_cast(input_data)); auto user_weights_memory_p = handler.AcquireWeightsMemory( - user_weights_mpd, to_void_cast(filter_data)); + user_weights_md, to_void_cast(filter_data)); // create reorder primitive if the input format is not the preferred one auto src_memory_p = @@ -282,7 +281,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*conv_p); stream(stream::kind::eager).submit(pipeline).wait(); - output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc()); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); } void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr("is_test"); @@ -948,8 +948,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // push primitive to stream and wait until it's executed 
pipeline.push_back(*conv_bwd_weights_p); - auto filter_grad_mpd = diff_weights_memory_p->get_primitive_desc(); - filter_grad->set_mkldnn_prim_desc(filter_grad_mpd); + filter_grad->set_layout(DataLayout::kMKLDNN); + filter_grad->set_format(GetMKLDNNFormat(*diff_weights_memory_p)); } if (input_grad) { @@ -972,7 +972,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*conv_bwd_data_p); - input_grad->set_mkldnn_prim_desc(diff_src_memory_p->get_primitive_desc()); + input_grad->set_layout(DataLayout::kMKLDNN); + input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); } stream(stream::kind::eager).submit(pipeline).wait(); } diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 79a0c5c7683d677daeb4feea10deab86407f944c..317d4cebe26b81ff03c212e6328233d5152ed1b4 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -221,7 +221,8 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*conv_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - output->set_mkldnn_prim_desc(dst_memory_p->get_primitive_desc()); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } private: diff --git a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc index d01e8dbf4ce0c92bb81fc76df68d5424f9da0717..76b00b396c1349eff5db1059268e7cf280a8fc64 100644 --- a/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/gaussian_random_mkldnn_op.cc @@ -42,12 +42,8 @@ class GaussianMKLDNNKernel : public paddle::framework::OpKernel { // The format of output is set as the mkldnn's format // TODO(@mozga-intel) The format of matrix sets inside the another layers. 
- // TODO(jczaja): Remove this hack after checking performance on block layout - - auto tensor_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(tensor->dims()), - mkldnn::memory::format::oihw); - tensor->set_mkldnn_prim_desc(tensor_mem_pd); + tensor->set_layout(DataLayout::kMKLDNN); + tensor->set_format(mkldnn::memory::format::oihw); } }; } // namespace operators diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc index 4ff27ab12280b56abdf72056fe69ec713f2f2f46..097ba01d401dbc7969e30f576cac2567c874ed99 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -81,7 +81,10 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto e_mid = framework::EigenTensor::From(*mid); e_mid = e_mid.constant(k); - auto src_md = x->get_mkldnn_prim_desc().desc(); + auto dims = paddle::framework::vectorize2int(x->dims()); + + auto src_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, x->format()); auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, mkldnn::lrn_across_channels, @@ -91,7 +94,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { beta, k}; - auto src_memory_pd = x->get_mkldnn_prim_desc(); + auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; if (!is_test) { const std::string key = ctx.op().Output("Out"); @@ -108,15 +111,16 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory->set_data_handle( static_cast(const_cast(input_data))); - auto dst_memory_pd = forward_pd->dst_primitive_desc(); - auto dst_memory = - mkldnn::memory(dst_memory_pd, static_cast(output_data)); + auto dst_memory = mkldnn::memory(forward_pd->dst_primitive_desc(), + static_cast(output_data)); auto workspace_memory = insert_to_context( key_workspace_memory, dev_ctx, forward_pd->workspace_primitive_desc()); run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); - out->set_mkldnn_prim_desc(dst_memory_pd); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } else { auto forward_pd = mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; @@ -124,12 +128,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { src_memory_pd, static_cast(const_cast(input_data))}; auto workspace_memory = mkldnn::memory{forward_pd.workspace_primitive_desc()}; - auto dst_memory_pd = forward_pd.dst_primitive_desc(); auto dst_memory = mkldnn::memory(forward_pd.dst_primitive_desc(), static_cast(output_data)); run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); - out->set_mkldnn_prim_desc(dst_memory_pd); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(dst_memory)); } } }; diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 0ce552219458859e147ba207c94270bf84a1fe75..dc1176f0848b93dd6872f676c3a71dab4f3455fd 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -158,14 +158,6 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { auto softmax_p = handler.AcquireSoftmax(softmax_dst_memory_p, softmax_src_memory_p); - // We cannot use softmax_dst_memory_p to get prim desc as - // it contains flattened dims (2D) while output tensor can - // have 2,3,4+ dims - auto 
output_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(output->dims()), - mkldnn::memory::format::blocked); - output->set_mkldnn_prim_desc(output_mem_pd); - std::vector pipeline{ *(static_cast(softmax_p.get()))}; stream(stream::kind::eager).submit(pipeline).wait(); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index aef5b7d4311adfedb3db157f17506c3a2c76fbf6..6f64157b64e2f6247db8b49dc94cd10bfb6e861f 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -106,12 +106,12 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { memory::desc(dst_tz, memory::data_type::f32, memory::format::any); auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_mpd); - auto dst_mem_pd = sum_pd.dst_primitive_desc(); + std::shared_ptr dst_mem; if (in_place) { - dst_mem.reset(new memory(dst_mem_pd)); + dst_mem.reset(new memory(sum_pd.dst_primitive_desc())); } else { - dst_mem.reset(new memory(dst_mem_pd, output_data)); + dst_mem.reset(new memory(sum_pd.dst_primitive_desc(), output_data)); } std::vector inputs; for (size_t i = 0; i < srcs_mem.size(); ++i) { @@ -136,7 +136,8 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { if (in_place) pipeline.push_back(reorder_prim); stream(stream::kind::eager).submit(pipeline).wait(); - output->set_mkldnn_prim_desc(dst_mem_pd); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(output_format); } else { // Fallback to naive version // TODO(@mozga-intel) Add MKLDNN SelectedRows & LoDTensorArray support SumKernel reference_kernel; diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index 4debc7ca5ec90d6cc781d10e817e9ed8650f12aa..95cee806ac451235a8fb03567e6057e10aa56427 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -52,7 +52,7 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn_engine, key); auto transpose_src_memory_p = handler.AcquireSrcMemory( - input->get_mkldnn_prim_desc(), platform::to_void_cast(input_data)); + input->format(), platform::to_void_cast(input_data)); auto transpose_dst_memory_p = handler.AcquireDstMemory(output, ctx.GetPlace()); auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, @@ -62,14 +62,8 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { pipeline.push_back(*transpose_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - // Transpose did change logical dimensions of Tensor, but reorder does not. - // Reorder does change only physical layout eg. 
format , strides - // so we need to create new primitive descriptor with changed logical layout - // so it match output shape - auto output_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(output->dims()), - mkldnn::memory::format::blocked); - output->set_mkldnn_prim_desc(output_mem_pd); + output->set_layout(DataLayout::kNCHW); + output->set_format(mkldnn::memory::format::format_undef); } }; @@ -134,9 +128,8 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { platform::TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, dev_ctx, mkldnn_engine, key); - auto transpose_src_memory_p = - handler.AcquireSrcMemory(out_grad->get_mkldnn_prim_desc(), - platform::to_void_cast(out_grad_data)); + auto transpose_src_memory_p = handler.AcquireSrcMemory( + out_grad->format(), platform::to_void_cast(out_grad_data)); auto transpose_dst_memory_p = handler.AcquireDstMemory(x_grad, ctx.GetPlace()); auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, @@ -145,15 +138,6 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector pipeline; pipeline.push_back(*transpose_p); mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); - - // Transpose did change logical dimensions of Tensor, but reorder does not. - // Reorder does change only physical layout eg. format , strides - // so we need to create new primitive descriptor with changed logical layout - // so it match output shape - auto x_grad_mem_pd = paddle::platform::create_prim_desc_from_dims( - paddle::framework::vectorize2int(x_grad->dims()), - mkldnn::memory::format::blocked); - x_grad->set_mkldnn_prim_desc(x_grad_mem_pd); } }; diff --git a/paddle/fluid/operators/multiplex_op.cc b/paddle/fluid/operators/multiplex_op.cc index 1801f2915e09b5ac6ee1ee27726e66d26c9c6a8f..7cb213e89958e017c62d7cded261570307d3e64b 100644 --- a/paddle/fluid/operators/multiplex_op.cc +++ b/paddle/fluid/operators/multiplex_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
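All of the MKL-DNN kernel changes above (elementwise_add, activation, batch_norm, concat, conv, conv_transpose, gaussian_random, lrn, softmax, sum and transpose) share one idiom: instead of attaching an mkldnn primitive descriptor to the tensor via set_mkldnn_prim_desc(), the kernel now records the layout and the concrete memory format on the output explicitly, and tensors about to be consumed by non-MKL-DNN kernels are reset to format_undef (as in data_layout_transform.cc and the transpose kernels) so a stale format cannot be misread. The recurring two calls, wrapped here in an illustrative helper that is not itself part of the diff:

// Illustrative helper only; the diff writes these two calls inline per kernel.
static void SetMKLDNNOutput(paddle::framework::Tensor* out,
                            const mkldnn::memory& dst_memory) {
  out->set_layout(paddle::framework::DataLayout::kMKLDNN);
  out->set_format(paddle::platform::GetMKLDNNFormat(dst_memory));
}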
*/ #include "paddle/fluid/operators/multiplex_op.h" +#include +#include namespace paddle { namespace operators { @@ -111,28 +113,47 @@ class MultiplexGradOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(!ctx->Inputs("X").empty(), "Input(X) should not be null."); - PADDLE_ENFORCE(!ctx->Outputs(framework::GradVarName("X")).empty(), - "Output(X@Grad) should not be null."); + auto& dxs = ctx->Outputs(framework::GradVarName("X")); + PADDLE_ENFORCE(!dxs.empty(), "Output(X@Grad) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); - ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); + auto dout_dim = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputsDim(framework::GradVarName("X"), + std::vector(dxs.size(), dout_dim)); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.MultiInput("X")[0]->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); + } +}; + +class MultiplexGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("multiplex_grad"); + op->SetInput("Ids", Input("Ids")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X", false)); + op->SetAttrMap(Attrs()); + return op; } }; } // namespace operators } // namespace paddle + namespace ops = paddle::operators; REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::MultiplexGradDescMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OP_CPU_KERNEL( multiplex, diff --git a/paddle/fluid/operators/multiplex_op.cu b/paddle/fluid/operators/multiplex_op.cu index 2f8a602f3c5c0a7c262235f99943ce336e20a7b4..1ef54ecc732f3d2098ed51d955f8feed4cb1a821 100644 --- a/paddle/fluid/operators/multiplex_op.cu +++ b/paddle/fluid/operators/multiplex_op.cu @@ -53,20 +53,25 @@ class MultiplexGradGPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto ins = ctx.MultiInput("X"); auto* ids = ctx.Input("Ids"); auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); + + size_t idx = -1UL; for (size_t i = 0; i < d_ins.size(); i++) { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); t.device(*ctx.template device_context().eigen_device()) = t.constant(static_cast(0)); + + idx = i; } } - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; + if (idx == -1UL) return; + + auto rows = d_ins[idx]->dims()[0]; + auto cols = d_ins[idx]->numel() / rows; // copy index to cpu Tensor index_t_cpu; TensorCopySync(*ids, platform::CPUPlace(), &index_t_cpu); diff --git a/paddle/fluid/operators/multiplex_op.h b/paddle/fluid/operators/multiplex_op.h index 87de000971941c39ee84e1bca46e2cd18e262fd8..44d6cc84a6493a326257d96f19b43c83c62f7b31 100644 --- a/paddle/fluid/operators/multiplex_op.h +++ 
b/paddle/fluid/operators/multiplex_op.h @@ -52,20 +52,25 @@ class MultiplexGradCPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const { auto* d_out = ctx.Input(framework::GradVarName("Out")); auto* ids = ctx.Input("Ids"); - auto ins = ctx.MultiInput("X"); auto d_ins = ctx.MultiOutput(framework::GradVarName("X")); + + size_t idx = -1UL; for (size_t i = 0; i < d_ins.size(); i++) { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); t.device(*ctx.template device_context().eigen_device()) = t.constant(static_cast(0)); + + idx = i; } } - auto rows = ins[0]->dims()[0]; - auto cols = ins[0]->numel() / rows; + if (idx == -1UL) return; + + auto rows = d_ins[idx]->dims()[0]; + auto cols = d_ins[idx]->numel() / rows; auto* index = ids->data(); platform::CPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc index d4b631a6f5bf9332f4ed1d1a4bda529fbb6ada0a..c28106d31273cb54e3974d186296644272d2014c 100644 --- a/paddle/fluid/operators/pad_op.cc +++ b/paddle/fluid/operators/pad_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/pad_op.h" +#include namespace paddle { namespace operators { @@ -29,7 +30,7 @@ class PadOp : public framework::OperatorWithKernel { "Output(Out) of PadOp should not be null."); auto x_dim = ctx->GetInputDim("X"); - auto paddings = ctx->Attrs().Get>("paddings"); + auto& paddings = ctx->Attrs().Get>("paddings"); PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()), "Size of paddings should be equal to 2 * dimension size " "of input tensor."); @@ -99,13 +100,20 @@ class PadOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); - auto x_dims = ctx->GetInputDim("X"); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto& paddings = ctx->Attrs().Get>("paddings"); + for (int i = 0; i < dout_dims.size(); ++i) { + dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]); + } + auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); + auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); + auto& paddings = ctx->Attrs().Get>("paddings"); + for (int i = 0; i < dout_dims.size(); ++i) { + dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]); + } + ctx->SetOutputDim(x_grad_name, dout_dims); } } }; @@ -117,7 +125,6 @@ class PadOpGradMaker : public framework::SingleGradOpDescMaker { protected: std::unique_ptr Apply() const override { auto* bind = new framework::OpDesc(); - bind->SetInput("X", Input("X")); bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); bind->SetOutput(framework::GradVarName("X"), InputGrad("X")); bind->SetAttrMap(Attrs()); diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index 78989582b7a0da5b7ff326cea1606df9993bed4c..dce9108eb17d76cfdf1c1b2313d975fd9fbdf9a7 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language 
governing permissions and limitations under the License. */ #include "paddle/fluid/operators/psroi_pool_op.h" +#include namespace paddle { namespace operators { @@ -154,12 +155,29 @@ class PSROIPoolGradOp : public framework::OperatorWithKernel { } }; +class PSROIPoolGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("psroi_pool_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::PSROIPoolGradDescMaker); REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); REGISTER_OP_CPU_KERNEL( psroi_pool, diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 313cf01541dd88a0f4f8bf54fe4436984c2cbcf8..45daa6b955639e3695211c1032869c743ede9b2c 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/rank_loss_op.h" +#include #include namespace paddle { @@ -116,6 +117,25 @@ class RankLossGradOp : public framework::OperatorWithKernel { } }; +class RankLossGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("rank_loss_grad"); + op->SetInput("Label", Input("Label")); + op->SetInput("Left", Input("Left")); + op->SetInput("Right", Input("Right")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("Left"), InputGrad("Left")); + op->SetOutput(framework::GradVarName("Right"), InputGrad("Right")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index 6857b5ed9dbccb06a71063c3da9045e1f79ef6f6..7bb10ce063109dbd8520430d2b32ac9370ef8d25 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/roi_align_op.h" +#include namespace paddle { namespace operators { @@ -147,12 +148,29 @@ Thus avoid the misaligned problem. 
} }; +class ROIAlignGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("roi_align_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(roi_align, ops::ROIAlignOp, ops::ROIAlignOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ROIAlignGradDescMaker); REGISTER_OPERATOR(roi_align_grad, ops::ROIAlignGradOp); REGISTER_OP_CPU_KERNEL( roi_align, diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index e46d92d6fc3a9830535a8bb07824b26b92a5dbde..cfac7e09e123c43204454adacb87a7c3c158690e 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/roi_pool_op.h" +#include namespace paddle { namespace operators { @@ -158,12 +159,30 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn } }; +class ROIPoolGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("roi_pool_grad"); + op->SetInput("X", Input("X")); + op->SetInput("ROIs", Input("ROIs")); + op->SetInput("Argmax", Output("Argmax")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ROIPoolGradDescMaker); REGISTER_OPERATOR(roi_pool_grad, ops::ROIPoolGradOp); REGISTER_OP_CPU_KERNEL( roi_pool, diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index ad418d51bcdb0e9e7959961bdf344a80f85c3f17..8e0e3bd6054018852b242d1dba5c250394ed81ce 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
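# Editor's note (illustrative sketch, not part of the patch): the multiplex_op.h change
# further above takes the row/column counts from the first gradient output that is
# actually requested, instead of from the forward inputs X, and returns early when no
# input needs a gradient. The same logic expressed in NumPy:
import numpy as np

def multiplex_grad(d_out, ids, num_inputs, needs_grad):
    # zero-initialize only the requested gradients; remember one of them for shapes
    d_ins = [np.zeros_like(d_out) if needs_grad[i] else None for i in range(num_inputs)]
    first = next((i for i, g in enumerate(d_ins) if g is not None), None)
    if first is None:          # nothing to compute, mirrors `if (idx == -1UL) return;`
        return d_ins
    rows = d_ins[first].shape[0]
    for r in range(rows):      # route each row of dOut back to the input selected by ids
        k = int(ids[r])
        if d_ins[k] is not None:
            d_ins[k][r, :] = d_out[r, :]
    return d_ins

grads = multiplex_grad(np.ones((4, 3), dtype='float32'),
                       np.array([0, 1, 0, 1]), num_inputs=2, needs_grad=[True, False])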
*/ #include "paddle/fluid/operators/scatter_op.h" +#include #include "paddle/fluid/framework/ddim.h" namespace paddle { @@ -63,14 +64,16 @@ class ScatterGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { ctx->SetOutputDim(framework::GradVarName("Updates"), ctx->GetInputDim("Updates")); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->SetOutputDim(framework::GradVarName("X"), + ctx->GetInputDim(framework::GradVarName("Out"))); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType(ctx.Input("X")->type(), - ctx.device_context()); + return framework::OpKernelType( + ctx.Input(framework::GradVarName("Out"))->type(), + ctx.device_context()); } }; @@ -95,12 +98,34 @@ $$ } }; +class ScatterGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("scatter_grad"); + op->SetInput("Ids", Input("Ids")); + op->SetInput("Updates", Input("Updates")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Updates"), InputGrad("Updates")); + op->SetAttrMap(Attrs()); + return op; + } +}; + +DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(ScatterGradNoNeedBufferVarsInference, + "Updates"); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(scatter, ops::ScatterOp, ops::ScatterOpMaker, - paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp); + ops::ScatterGradDescMaker); +REGISTER_OPERATOR(scatter_grad, ops::ScatterGradOp, + ops::ScatterGradNoNeedBufferVarsInference); REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel); REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel); diff --git a/paddle/fluid/operators/shuffle_channel_op.cc b/paddle/fluid/operators/shuffle_channel_op.cc index 9349912e090f2ad3248923c87b50c8d72b0d84d1..26355e58615454c8e9aea1d6a5405368e6006e87 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cc +++ b/paddle/fluid/operators/shuffle_channel_op.cc @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. 
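# Editor's note (illustrative sketch, not part of the patch): the scatter_op.cc change
# above takes X@GRAD's shape and dtype from Out@GRAD and marks "Updates" as a
# no-need-buffer input, because the backward math only ever reads dOut. A NumPy sketch
# of that math, assuming the overwrite-mode scatter semantics used here:
import numpy as np

x = np.zeros((4, 3), dtype='float32')
ids = np.array([1, 3])
updates = np.random.rand(2, 3).astype('float32')

out = x.copy()
out[ids] = updates              # forward: Out = scatter(X, Ids, Updates)

d_out = np.ones_like(out)       # some upstream gradient
d_x = d_out.copy()              # dX needs only dOut's values and shape, never X
d_updates = d_out[ids]          # dUpdates = gather(dOut, Ids); only Updates' shape matters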
*/ #include "paddle/fluid/operators/shuffle_channel_op.h" +#include namespace paddle { namespace operators { @@ -91,13 +92,28 @@ class ShuffleChannelGradOp : public framework::OperatorWithKernel { } }; +class ShuffleChannelGradDescMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + std::unique_ptr op(new framework::OpDesc()); + op->SetType("shuffle_channel_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Out"), OutputGrad("Out")); + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetAttrMap(Attrs()); + return op; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(shuffle_channel, ops::ShuffleChannelOp, - ops::ShuffleChannelOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::ShuffleChannelOpMaker, ops::ShuffleChannelGradDescMaker); REGISTER_OPERATOR(shuffle_channel_grad, ops::ShuffleChannelGradOp); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 4fa6774f028bef901f6e11f2d3dafe52a10a548e..ecaad4ec070fe60a522839e0718c424a441dec0b 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include #include "paddle/fluid/framework/data_layout_transform.h" @@ -39,45 +40,6 @@ class MKLDNNHandler { return this->AcquireMemory(md, ptr, "@user_src_mem_p"); } - // TODO(jczaja): extract common part and make AcquireMemory - std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::primitive_desc& mpd, void* ptr) { - auto local_key = key_ + "@user_src_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - " find mem primitive in device context"); - if (mem_p == nullptr) { - mem_p = std::make_shared(mpd, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. So we check consistency - is_reusing_ = true; - } - return mem_p; - } - - std::shared_ptr AcquireWeightsMemory( - const mkldnn::memory::primitive_desc& mpd, void* ptr) { - auto local_key = key_ + "@user_weights_mem_p"; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), - " find mem primitive in device context"); - if (mem_p == nullptr) { - mem_p = std::make_shared(mpd, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - // Mark that reusing happenned. All primitives from operator instance - // should be reused or none of them. 
So we check consistency - is_reusing_ = true; - } - return mem_p; - } - std::shared_ptr AcquireWeightsMemory( const mkldnn::memory::desc& md, void* ptr, user_function custom_func = {}) { @@ -315,7 +277,37 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), dims_(dims), - axis_(axis) {} + axis_(axis), + logical_axis_(dims.size(), 0) {} + + std::shared_ptr AcquireSrcMemory( + const mkldnn::memory::format& fmt, void* ptr) { + auto local_key = key_ + "@user_src_mem_p"; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + PADDLE_ENFORCE((mem_p != nullptr) || (is_reusing_ == false), + " find mem primitive in device context"); + if (mem_p == nullptr) { + // Make memory descriptor using input format, unless it + // cannot be trusted (nchw) then make up memory fmt manually + for (size_t i = 0; i < logical_axis_.size(); ++i) { + logical_axis_[i] = i; + } + auto src_md = fmt != mkldnn::memory::format::nchw + ? platform::MKLDNNMemDesc( + dims_, platform::MKLDNNGetDataType(), fmt) + : Axis2MemoryDesc(dims_, logical_axis_); + mem_p = std::make_shared( + mkldnn::memory::primitive_desc{src_md, engine_}, ptr); + dev_ctx_.SetBlob(local_key, mem_p); + } else { + mem_p->set_data_handle(ptr); + // Mark that reusing happenned. All primitives from operator instance + // should be reused or none of them. So we check consistency + is_reusing_ = true; + } + return mem_p; + } std::shared_ptr AcquireDstMemory(framework::Tensor* output, platform::Place place) { @@ -400,6 +392,7 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { private: std::vector dims_; std::vector axis_; + std::vector logical_axis_; }; template diff --git a/paddle/fluid/platform/mkldnn_utils.h b/paddle/fluid/platform/mkldnn_utils.h deleted file mode 100644 index 8c511f97d12cfe299ad5629eff1871e8d156c850..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/mkldnn_utils.h +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include - -namespace paddle { -namespace platform { - -inline mkldnn::memory::primitive_desc create_prim_desc_from_dims( - const std::vector& ltz, mkldnn::memory::format fmt, - mkldnn::memory::data_type data_type = mkldnn::memory::data_type::f32) { - mkldnn_memory_desc_t mem_fmt; - - mem_fmt.primitive_kind = mkldnn_memory; - mem_fmt.ndims = ltz.size(); - for (unsigned int i = 0; i < ltz.size(); ++i) { - mem_fmt.dims[i] = ltz[i]; // logical dimensions (nchw format, - // regardless physical layout) - } - mem_fmt.data_type = static_cast(data_type); - mem_fmt.format = static_cast(fmt); - - unsigned int total_stride = 1; - for (int i = ltz.size() - 1; i >= 0; --i) { - mem_fmt.layout_desc.blocking.padding_dims[i] = - ltz[i]; // logical dimensions (nchw format, regardless physical - // layout) - mem_fmt.layout_desc.blocking.block_dims[i] = 1; - mem_fmt.layout_desc.blocking.offset_padding_to_data[i] = 0; // no offset - mem_fmt.layout_desc.blocking.strides[0][i] = total_stride; - mem_fmt.layout_desc.blocking.strides[1][i] = 1; - total_stride *= ltz[i]; - } - mem_fmt.layout_desc.blocking.offset_padding = 0; // no initial offset - - auto& pool = platform::DeviceContextPool::Instance(); - auto place = paddle::platform::CPUPlace(); - auto* dev_ctx = dynamic_cast(pool.Get(place)); - auto& cpu_engine = dev_ctx->GetEngine(); - return mkldnn::memory::primitive_desc(mem_fmt, cpu_engine); -} - -inline mkldnn::memory::primitive_desc create_prim_desc_from_format( - const std::vector& ltz, const mkldnn::memory::format format, - const mkldnn::memory::data_type data_type) { - auto md = mkldnn::memory::desc({ltz}, data_type, format); - auto& pool = platform::DeviceContextPool::Instance(); - auto place = paddle::platform::CPUPlace(); - auto dev_ctx = dynamic_cast(pool.Get(place)); - PADDLE_ENFORCE_NOT_NULL(dev_ctx, "Could not get valid device"); - auto& cpu_engine = dev_ctx->GetEngine(); - return mkldnn::memory::primitive_desc(md, cpu_engine); -} - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/temporary_allocator.cc b/paddle/fluid/platform/temporary_allocator.cc index ddde7baf4cf3b44ac5d8a22fcc98acef50294575..d489ed5368ed95a1a0a8b0d6759310501cd49fcd 100644 --- a/paddle/fluid/platform/temporary_allocator.cc +++ b/paddle/fluid/platform/temporary_allocator.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include +#include #include "paddle/fluid/memory/allocation/allocator_facade.h" DEFINE_int64(limit_of_tmp_allocation, -1, @@ -30,31 +31,38 @@ namespace paddle { namespace platform { namespace alloc = memory::allocation; +TemporaryAllocation::TemporaryAllocation( + alloc::AllocationPtr &&underlying_allocation) + : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), + underlying_allocation->place()), + underlying_allocation_(std::move(underlying_allocation)) {} + TemporaryAllocator::TemporaryAllocator(platform::Place place) : place_(place) { - temp_mem_map_.reset(new std::multimap()); + temp_mem_map_.reset(new std::multimap()); } bool TemporaryAllocator::IsAllocThreadSafe() const { return true; } void TemporaryAllocator::Release(const std::function &callback) { - std::unique_ptr> t_allocations; + std::unique_ptr> t_allocations; { std::unique_lock lock(mtx_); callback(); t_allocations.swap(temp_mem_map_); - temp_mem_map_.reset(new std::multimap()); + temp_mem_map_.reset(new std::multimap()); wait_delete_mem_ = 0; } - alloc::AllocationDeleter deleter; for (auto tmp : *t_allocations) { VLOG(10) << "Delete 
temporary allocation " << tmp.second->ptr() << " size: " << tmp.second->size(); - deleter(tmp.second); + delete tmp.second; } } -void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { +void TemporaryAllocator::Free(alloc::Allocation *allocation) { + auto *temp_allocation = dynamic_cast(allocation); + PADDLE_ENFORCE_NOT_NULL(temp_allocation); if (platform::is_gpu_place(temp_allocation->place())) { PADDLE_ENFORCE(platform::is_same_place(temp_allocation->place(), place_), "The place should be the same."); @@ -78,7 +86,7 @@ void TemporaryAllocator::FreeImpl(alloc::Allocation *temp_allocation) { } VLOG(10) << "Delete temporary allocation " << temp_allocation->ptr() << " size: " << temp_allocation->size(); - alloc::AllocationDeleter()(temp_allocation); + delete temp_allocation; } size_t TemporaryAllocator::TemporaryAllocationQueueSize() { @@ -113,9 +121,11 @@ alloc::Allocation *TemporaryAllocator::AllocateImpl( } // If not find the the available allocation, get allocation from // AllocatorFacadeInstance. - auto temp_mem = alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); + auto raw_allocation = + alloc::AllocatorFacade::Instance().Alloc(place_, size, attr); + auto temp_mem = new TemporaryAllocation(std::move(raw_allocation)); VLOG(10) << "Alloc temporary allocation: " << temp_mem->ptr() << ": " << size; - return temp_mem.release(); + return temp_mem; } } // namespace platform diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h index 912d45eaf17fe8c05840995275dd3e2e688b38ef..f8a43b889d58d5e027aac8e08324cf51b7d82913 100644 --- a/paddle/fluid/platform/temporary_allocator.h +++ b/paddle/fluid/platform/temporary_allocator.h @@ -23,6 +23,14 @@ namespace paddle { namespace platform { +class TemporaryAllocation : public memory::allocation::Allocation { + public: + explicit TemporaryAllocation( + memory::allocation::AllocationPtr &&underlying_allocation); + + memory::allocation::AllocationPtr underlying_allocation_; +}; + /*! \brief the TemporaryAllocator is used to alloc the temporary allocation * which used by CUDA's async operation. * @@ -49,7 +57,7 @@ class TemporaryAllocator : public memory::allocation::Allocator { void SetCallback(const std::function &callback); protected: - void FreeImpl(memory::allocation::Allocation *allocation) override; + void Free(memory::allocation::Allocation *allocation) override; memory::allocation::Allocation *AllocateImpl( size_t size, memory::allocation::Allocator::Attr attr) override; @@ -58,8 +66,8 @@ class TemporaryAllocator : public memory::allocation::Allocator { platform::Place place_; // When the allocation is not held by any variable, it should be placed // to temp_mem_map immediately. - std::unique_ptr> - temp_mem_map_{nullptr}; + std::unique_ptr> temp_mem_map_{ + nullptr}; std::mutex mtx_; size_t wait_delete_mem_{0}; std::function callback_; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 93e75543bfb3f8a02960aa282890891981589eaa..adc6f5de59cbf65d1c2bcecc2d6085f797d12e7a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -328,7 +328,6 @@ PYBIND11_MODULE(core, m) { [](Tensor &self, paddle::platform::CUDAPinnedPlace &place) { self.mutable_data(place); }) - .def("_clear", &Tensor::clear) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) .def("set", PyCPUTensorSetFromArray) @@ -1287,6 +1286,15 @@ All parameter, weight, gradient are variables in Paddle. 
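# Editor's note (illustrative sketch, not part of the patch): the pybind.cc hunk just
# below exposes a new BuildStrategy property, fuse_all_optimizer_ops, which must be set
# before the strategy is finalized. A minimal way to use it from Python (the tiny network
# here is only for illustration):
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[8], dtype='float32')
y = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.reduce_mean(y)
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

build_strategy = fluid.BuildStrategy()
build_strategy.fuse_all_optimizer_ops = True   # raises if the strategy is already finalized
compiled = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(
    loss_name=loss.name, build_strategy=build_strategy)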
it will save GPU memory and may make the execution faster. This options is only available in GPU devices. Default False)DOC") + .def_property("fuse_all_optimizer_ops", + [](const BuildStrategy &self) { + return self.fuse_all_optimizer_ops_; + }, + [](BuildStrategy &self, bool b) { + PADDLE_ENFORCE(!self.IsFinalized(), + "BuildStrategy is finlaized."); + self.fuse_all_optimizer_ops_ = b; + }) .def_property( "sync_batch_norm", [](const BuildStrategy &self) { return self.sync_batch_norm_; }, diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 66b768665b6d0b97b4ca1470020132bfc9576bbb..16bb3771f2e9bcc07028ef2039fed8691f9aab97 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -105,12 +105,14 @@ void Printf(const char* fmt, const Args&... args) { Fprintf(std::cout, fmt, args...); } -inline std::string HumanReadableSize(double f_size) { +template +std::string HumanReadableSize(T size) { size_t i = 0; + double f_size = static_cast(size); double orig = f_size; const std::vector units( {"B", "kB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"}); - while (f_size >= 1024) { + while (f_size > 1024) { f_size /= 1024; i++; } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 18f01ca1374a24cec3bf882d347596dd38f4fd21..24c8a6934fe355b2de388e7f90b6e40d4871f0d8 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -34,7 +34,7 @@ from . import io from . import evaluator from . import initializer from . import layers -from . import imperative +from . import dygraph from . import contrib from . import nets from . import optimizer @@ -71,7 +71,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'initializer', 'layers', 'contrib', - 'imperative', + 'dygraph', 'transpiler', 'nets', 'optimizer', diff --git a/python/paddle/fluid/contrib/model_stat.py b/python/paddle/fluid/contrib/model_stat.py new file mode 100644 index 0000000000000000000000000000000000000000..0d974c8d9685840c79de17f297fcba00b01a6c35 --- /dev/null +++ b/python/paddle/fluid/contrib/model_stat.py @@ -0,0 +1,194 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Example: + >>from paddle.fluid.contrib.model_stat import summary + >>main_program = ... + >>summary(main_program) + +-----+------------+----------------+----------------+---------+------------+ + | No. | TYPE | INPUT | OUTPUT | PARAMs | FLOPs | + +-----+------------+----------------+----------------+---------+------------+ + | 0 | conv2d | (3, 200, 200) | (64, 100, 100) | 9408 | 188160000 | + | 1 | batch_norm | (64, 100, 100) | (64, 100, 100) | 256 | 640000 | + | 2 | relu | (64, 100, 100) | (64, 100, 100) | 0 | 640000 | + | 3 | pool2d | (64, 100, 100) | (64, 50, 50) | 0 | 1440000 | + ... 
+ | 176 | conv2d | (512, 7, 7) | (512, 7, 7) | 2359296 | 231211008 | + | 177 | relu | (512, 7, 7) | (512, 7, 7) | 0 | 25088 | + | 178 | conv2d | (512, 7, 7) | (2048, 7, 7) | 1048576 | 102760448 | + | 179 | relu | (2048, 7, 7) | (2048, 7, 7) | 0 | 100352 | + | 180 | pool2d | (2048, 7, 7) | (2048, 1, 1) | 0 | 100352 | + +-----+------------+----------------+----------------+---------+------------+ + Total PARAMs: 48017344(0.0480G) + Total FLOPs: 11692747751(11.69G) +''' +from collections import OrderedDict +from prettytable import PrettyTable + + +def summary(main_prog): + ''' + It can summary model's PARAMS, FLOPs until now. + It support common operator like conv, fc, pool, relu, sigmoid, bn etc. + Args: + main_prog: main program + Returns: + print summary on terminal + ''' + collected_ops_list = [] + for one_b in main_prog.blocks: + block_vars = one_b.vars + for one_op in one_b.ops: + op_info = OrderedDict() + spf_res = _summary_model(block_vars, one_op) + if spf_res is None: + continue + # TODO: get the operator name + op_info['type'] = one_op.type + op_info['input_shape'] = spf_res[0][1:] + op_info['out_shape'] = spf_res[1][1:] + op_info['PARAMs'] = spf_res[2] + op_info['FLOPs'] = spf_res[3] + collected_ops_list.append(op_info) + + summary_table, total = _format_summary(collected_ops_list) + _print_summary(summary_table, total) + + +def _summary_model(block_vars, one_op): + ''' + Compute operator's params and flops. + Args: + block_vars: all vars of one block + one_op: one operator to count + Returns: + in_data_shape: one operator's input data shape + out_data_shape: one operator's output data shape + params: one operator's PARAMs + flops: : one operator's FLOPs + ''' + if one_op.type in ['conv2d', 'depthwise_conv2d']: + k_arg_shape = block_vars[one_op.input("Filter")[0]].shape + in_data_shape = block_vars[one_op.input("Input")[0]].shape + out_data_shape = block_vars[one_op.output("Output")[0]].shape + c_out, c_in, k_h, k_w = k_arg_shape + _, c_out_, h_out, w_out = out_data_shape + assert c_out == c_out_, 'shape error!' 
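# Editor's note (standalone check, not part of the patch): the params/FLOPs formulas
# applied just below can be verified against row 0 of the example table in the module
# docstring above, which implies a 7x7, bias-free convolution:
c_in, c_out, k_h, k_w, groups = 3, 64, 7, 7, 1
h_out, w_out = 100, 100
kernel_ops = k_h * k_w * (c_in / groups)          # 147.0
params = c_out * kernel_ops                       # 9408
flops = 2 * h_out * w_out * c_out * kernel_ops    # 188160000: mul and add counted separately
print(int(params), int(flops))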
+ k_groups = one_op.attr("groups") + kernel_ops = k_h * k_w * (c_in / k_groups) + bias_ops = 0 if one_op.input("Bias") == [] else 1 + params = c_out * (kernel_ops + bias_ops) + flops = h_out * w_out * c_out * (kernel_ops + bias_ops) + # base nvidia paper, include mul and add + flops = 2 * flops + + elif one_op.type == 'pool2d': + in_data_shape = block_vars[one_op.input("X")[0]].shape + out_data_shape = block_vars[one_op.output("Out")[0]].shape + _, c_out, h_out, w_out = out_data_shape + k_size = one_op.attr("ksize") + params = 0 + flops = h_out * w_out * c_out * (k_size[0] * k_size[1]) + + elif one_op.type == 'mul': + k_arg_shape = block_vars[one_op.input("Y")[0]].shape + in_data_shape = block_vars[one_op.input("X")[0]].shape + out_data_shape = block_vars[one_op.output("Out")[0]].shape + # TODO: fc has mul ops + # add attr to mul op, tell us whether it belongs to 'fc' + # this's not the best way + if 'fc' not in one_op.output("Out")[0]: + return None + k_in, k_out = k_arg_shape + # bias in sum op + params = k_in * k_out + 1 + flops = k_in * k_out + + elif one_op.type in ['sigmoid', 'tanh', 'relu', 'leaky_relu', 'prelu']: + in_data_shape = block_vars[one_op.input("X")[0]].shape + out_data_shape = block_vars[one_op.output("Out")[0]].shape + params = 0 + if one_op.type == 'prelu': + params = 1 + flops = 1 + for one_dim in in_data_shape: + flops *= one_dim + + elif one_op.type == 'batch_norm': + in_data_shape = block_vars[one_op.input("X")[0]].shape + out_data_shape = block_vars[one_op.output("Y")[0]].shape + _, c_in, h_out, w_out = in_data_shape + # gamma, beta + params = c_in * 2 + # compute mean and std + flops = h_out * w_out * c_in * 2 + + else: + return None + + return in_data_shape, out_data_shape, params, flops + + +def _format_summary(collected_ops_list): + ''' + Format summary report. + Args: + collected_ops_list: the collected operator with summary + Returns: + summary_table: summary report format + total: sum param and flops + ''' + summary_table = PrettyTable( + ["No.", "TYPE", "INPUT", "OUTPUT", "PARAMs", "FLOPs"]) + summary_table.align = 'r' + + total = {} + total_params = [] + total_flops = [] + for i, one_op in enumerate(collected_ops_list): + # notice the order + table_row = [ + i, + one_op['type'], + one_op['input_shape'], + one_op['out_shape'], + int(one_op['PARAMs']), + int(one_op['FLOPs']), + ] + summary_table.add_row(table_row) + total_params.append(int(one_op['PARAMs'])) + total_flops.append(int(one_op['FLOPs'])) + + total['params'] = total_params + total['flops'] = total_flops + + return summary_table, total + + +def _print_summary(summary_table, total): + ''' + Print all the summary on terminal. 
+ Args: + summary_table: summary report format + total: sum param and flops + ''' + parmas = total['params'] + flops = total['flops'] + print(summary_table) + print('Total PARAMs: {}({:.4f}M)'.format( + sum(parmas), sum(parmas) / (10**6))) + print('Total FLOPs: {}({:.2f}G)'.format(sum(flops), sum(flops) / 10**9)) + print( + "Notice: \n now supported ops include [Conv, DepthwiseConv, FC(mul), BatchNorm, Pool, Activation(sigmoid, tanh, relu, leaky_relu, prelu)]" + ) diff --git a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py index 7388ecd3b096fc05d1420b904f2d65d805c3fc53..e7f5f0d6a2185521549abe7af7b6be2b0b7d90fb 100644 --- a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py +++ b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py @@ -204,6 +204,10 @@ class GraphWrapper(object): """ super(GraphWrapper, self).__init__() self.program = Program() if program is None else program + self.persistables = {} + for var in self.program.list_vars(): + if var.persistable: + self.persistables[var.name] = var self.compiled_graph = None self.in_nodes = OrderedDict(in_nodes) self.out_nodes = OrderedDict(out_nodes) @@ -467,7 +471,12 @@ class GraphWrapper(object): path(str): The path to save the persistables. exe(framework.Executor): The executor used to save the persistables. """ - io.save_persistables(exe.exe, path, main_program=self.program) + # update persistables from program + for var in self.program.list_vars(): + if var.persistable and var.name not in self.persistables: + self.persistables[var.name] = var + + io.save_vars(exe.exe, path, vars=self.persistables.values()) def load_persistables(self, path, exe): """ @@ -481,7 +490,7 @@ class GraphWrapper(object): return os.path.exists(os.path.join(path, var.name)) io.load_vars( - exe.exe, path, main_program=self.program, predicate=if_exist) + exe.exe, path, vars=self.persistables.values(), predicate=if_exist) def update_param_shape(self, scope): """ diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index ab3bd8bd182c7e933c58e2ba2f3548f2d001cbdb..3809e327943832571a1bde6a53a0a6e7fbd13bdd 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -26,6 +26,17 @@ __all__ = [ ] +def _init_var_node(var_node, value, scope, place): + assert isinstance(value, + np.ndarray), 'The type of value should be numpy array.' + assert scope is not None, \ + 'The scope cannot be set None.' + assert place is not None, \ + 'The place cannot be set None.' + tensor = scope.var(var_node.name()).get_tensor() + tensor.set(value, place) + + class QuantizationTransformPass(object): def __init__(self, scope=None, @@ -88,14 +99,14 @@ class QuantizationTransformPass(object): assert activation_quantize_type != 'channel_wise_abs_max', "The activation quantization type does not support 'channel_wise_abs_max'." if activation_quantize_type not in quant_type: raise ValueError( - "Unknown activation_quantize_type : '%s'. It can only be ", - "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", - str(activation_quantize_type)) + "Unknown activation_quantize_type : '%s'. It can only be " + "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'." % + (str(activation_quantize_type))) if weight_quantize_type not in quant_type: raise ValueError( - "Unknown weight_quantize_type: '%s'. 
It can only be ", - "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", - str(weight_quantize_type)) + "Unknown weight_quantize_type: '%s'. It can only be " + "'abs_max' or 'channel_wise_abs_max' or 'range_abs_max' or 'moving_average_abs_max'." + % (str(weight_quantize_type))) self._activation_quantize_type = activation_quantize_type self._weight_quantize_type = weight_quantize_type @@ -121,8 +132,6 @@ class QuantizationTransformPass(object): """ assert isinstance(graph, IrGraph), 'graph must be the instance of IrGraph.' - #sequential_execution = core.get_pass('sequential_execution_pass') - #sequential_execution.apply(graph.graph) self._is_test = graph.is_test() # marked the variable which has been dequantized. dequantized_vars = collections.OrderedDict() @@ -203,9 +212,12 @@ class QuantizationTransformPass(object): var_type=core.VarDesc.VarType.LOD_TENSOR, shape=[1], var_dtype=core.VarDesc.VarType.INT64) - self._init_var_node( - global_step_in, np.zeros( - [1], dtype='int64')) + _init_var_node( + global_step_in, + np.zeros( + [1], dtype='int64'), + self._scope, + self._place) global_step_out = graph.create_var_node_from_desc( global_step_in.var()) # The attribute of `op_role` is needed by ParallelExecutor. @@ -284,7 +296,12 @@ class QuantizationTransformPass(object): var_dtype=var_node.dtype()) data_type = 'float64' if var_node.dtype( ) == core.VarDesc.VarType.FP64 else 'float32' - self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type)) + _init_var_node( + scale_in_node, + np.array( + [0.001], dtype=data_type), + self._scope, + self._place) scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) inputs = {'X': var_node, 'InScale': scale_in_node} @@ -299,9 +316,13 @@ class QuantizationTransformPass(object): var_dtype=var_node.dtype()) data_type = 'float64' if var_node.dtype( ) == core.VarDesc.VarType.FP64 else 'float32' - self._init_var_node( - scales_node, np.zeros( - [self._window_size], dtype=data_type)) + _init_var_node( + scales_node, + np.zeros( + [self._window_size], dtype=data_type), + self._scope, + self._place) + inputs['Iter'] = self._global_step outputs['OutScales'] = scales_node attrs = { @@ -343,7 +364,12 @@ class QuantizationTransformPass(object): var_dtype=var_node.dtype()) data_type = 'float64' if var_node.dtype( ) == core.VarDesc.VarType.FP64 else 'float32' - self._init_var_node(scale_in_node, np.array([0.001], dtype=data_type)) + _init_var_node( + scale_in_node, + np.array( + [0.001], dtype=data_type), + self._scope, + self._place) scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) ins = {'X': var_node, 'InScale': scale_in_node} @@ -356,13 +382,23 @@ class QuantizationTransformPass(object): shape=[1]) data_type = 'float64' if var_node.dtype( ) == core.VarDesc.VarType.FP64 else 'float32' - self._init_var_node(scale_in_node, np.ones([1], dtype=data_type)) + _init_var_node( + scale_in_node, + np.ones( + [1], dtype=data_type), + self._scope, + self._place) accum_in_node = graph.create_persistable_node( name=unique_name.generate('accum'), var_type=core.VarDesc.VarType.LOD_TENSOR, var_dtype=var_node.dtype(), shape=[1]) - self._init_var_node(accum_in_node, np.ones([1], dtype=data_type)) + _init_var_node( + accum_in_node, + np.ones( + [1], dtype=data_type), + self._scope, + self._place) state_out_node = graph.create_var_node_from_desc(state_in_node.var( )) accum_out_node = graph.create_var_node_from_desc(accum_in_node.var( @@ -482,16 +518,6 @@ class QuantizationTransformPass(object): 
graph.link_to(dequant_op_node, dequant_var_node) return dequant_var_node - def _init_var_node(self, var_node, value): - assert isinstance( - value, np.ndarray), 'The type of value should be numpy array.' - assert self._scope is not None, \ - 'The scope cannot be set None when activation_quantize_type equals to range_abs_max.' - assert self._place is not None, \ - 'The place cannot be set None when activation_quantize_type equals to range_abs_max.' - tensor = self._scope.var(var_node.name()).get_tensor() - tensor.set(value, self._place) - def _quantized_var_name(self, var_name): """ Return quantized variable name for the input `var_name`. @@ -594,8 +620,8 @@ class QuantizationFreezePass(object): self._weight_bits) self._restore_var(input_arg_name, quantized_param_v) else: - scale_v = self._to_node(op_node.outputs, - op_node.output('OutScale')[0]) + scale_v = graph._find_node_by_name( + op_node.outputs, op_node.output('OutScale')[0]) self._var_scale_map[input_arg_name] = scale_v ops = graph.all_op_nodes() @@ -627,8 +653,8 @@ class QuantizationFreezePass(object): return graph def _remove_fake_quant_and_dequant_op(self, graph, op_node): - k = self._to_node(op_node.outputs, op_node.output('Out')[0]) - v = self._to_node(op_node.inputs, op_node.input('X')[0]) + k = graph._find_node_by_name(op_node.outputs, op_node.output('Out')[0]) + v = graph._find_node_by_name(op_node.inputs, op_node.input('X')[0]) if v.node not in self._op_input_rename_map: self._op_input_rename_map[k.node] = v else: @@ -663,8 +689,8 @@ class QuantizationFreezePass(object): raise ValueError("Only support one output, but op %s has" " more than one output." % (op_node.name())) - output_var_node = self._to_node(op_node.outputs, - op_node.output_arg_names()[0]) + output_var_node = graph._find_node_by_name( + op_node.outputs, op_node.output_arg_names()[0]) weight_scale_node = graph.create_persistable_node( name=unique_name.generate('channel_scale'), var_type=core.VarDesc.VarType.LOD_TENSOR, @@ -672,7 +698,9 @@ class QuantizationFreezePass(object): var_dtype=output_var_node.dtype()) data_type = 'float64' if output_var_node.dtype( ) == core.VarDesc.VarType.FP64 else 'float32' - self._init_var_node(weight_scale_node, channel_scale.astype(data_type)) + _init_var_node(weight_scale_node, + channel_scale.astype(data_type), self._scope, + self._place) dequant_var_node = graph.create_var_node( name=self._dequantized_var_name(output_var_node.name()), var_type=output_var_node.type(), @@ -724,8 +752,8 @@ class QuantizationFreezePass(object): raise ValueError("Only support one output, but op %s has" " more than one output." % (op_node.name())) - output_var_node = self._to_node(op_node.outputs, - op_node.output_arg_names()[0]) + output_var_node = graph._find_node_by_name( + op_node.outputs, op_node.output_arg_names()[0]) dequant_var_node = graph.create_var_node( name=self._dequantized_var_name(output_var_node.name()), var_type=output_var_node.type(), @@ -746,24 +774,6 @@ class QuantizationFreezePass(object): self._op_output_rename_map[output_var_node.node] = dequant_var_node return dequant_var_node - def _init_var_node(self, var_node, value): - assert isinstance( - value, np.ndarray), 'The type of value should be numpy array.' - assert self._scope is not None, \ - 'The scope cannot be set None when activation_quantize_type equals to range_abs_max.' - assert self._place is not None, \ - 'The place cannot be set None when activation_quantize_type equals to range_abs_max.' 
- tensor = self._scope.var(var_node.name()).get_tensor() - tensor.set(value, self._place) - - def _to_node(self, nodes, node_name): - target_node = None - for n in nodes: - if n.name() == node_name: - target_node = n - assert target_node is not None, "Cannot find the target node in the giving set." - return target_node - def _load_var(self, name): return np.array(self._scope.find_var(name).get_tensor()) diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py index c4b02166abf020ed561a989fa97b28c9fd44223e..a22b6da020510838dc82fe7af87ab62db6e874ef 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py @@ -20,7 +20,7 @@ from .... import io from .... import core from ....compiler import CompiledProgram from ....compiler import BuildStrategy -from ....framework import IrGraph +from ....framework import IrGraph, Variable, Program from ..core.strategy import Strategy from .quantization_pass import * @@ -45,13 +45,14 @@ class QuantizationStrategy(Strategy): activation_bits=8, weight_bits=8, activation_quantize_type='abs_max', + weight_quantize_type='abs_max', save_in_nodes=None, save_out_nodes=None): """ Args: start_epoch(int): The 'on_epoch_begin' function will be called in start_epoch. default: 0 end_epoch(int): The 'on_epoch_end' function will be called in end_epoch. default: 0 - float_model_save_path(str): The path to save model with float weights. + float_model_save_path(str): The path to save model with float weights. None means it doesn't save float model. defalut: None. mobile_model_save_path(str): The path to save model for paddle-mobile execution. None means it doesn't save mobile model. defalut: None. @@ -66,9 +67,11 @@ class QuantizationStrategy(Strategy): dynamically each step in both training and testing period. If use 'range_abs_max', a static quantization scale will be calculated during training and used in inference. - save_in_nodes(list): A list of variable names used to prune graph + weight_quantize_type (str): quantization type for weights, support 'abs_max' and 'channel_wise_abs_max'. + The 'range_abs_max' usually is not used for weight, since weights are fixed once the model is well trained. + save_in_nodes(list): A list of variable names used to prune graph for saving inference model. - save_out_nodes(list): A list of variable names used to prune graph + save_out_nodes(list): A list of variable names used to prune graph for saving inference model. """ @@ -81,43 +84,80 @@ class QuantizationStrategy(Strategy): self.activation_bits = activation_bits self.weight_bits = weight_bits self.activation_quantize_type = activation_quantize_type + self.weight_quantize_type = weight_quantize_type self.save_out_nodes = save_out_nodes self.save_in_nodes = save_in_nodes + def on_compression_begin(self, context): + """ + Restore graph when the compressoin task is inited from checkpoint. + """ + # It is inited from checkpoint and has missed start epoch. + if context.epoch_id != 0 and context.epoch_id > self.start_epoch: + _logger.info("Restore quantization task from checkpoint") + self._modify_graph_for_quantization(context) + _logger.info("Finish restoring quantization task from checkpoint") + + def _modify_graph_for_quantization(self, context): + """ + Insert fake_quantize_op and fake_dequantize_op before trainging and testing. 
+ """ + train_ir_graph = IrGraph( + core.Graph(context.optimize_graph.program.clone().desc), + for_test=False) + test_ir_graph = IrGraph( + core.Graph(context.eval_graph.program.clone().desc), for_test=True) + transform_pass = QuantizationTransformPass( + scope=context.scope, + place=context.place, + weight_bits=self.weight_bits, + activation_bits=self.activation_bits, + activation_quantize_type=self.activation_quantize_type, + weight_quantize_type=self.weight_quantize_type) + transform_pass.apply(train_ir_graph) + transform_pass.apply(test_ir_graph) + # Put persistables created by transform_pass into context.optimize_graph.persistables + # for saving checkpoint. + program_persistables = set() + for var in context.optimize_graph.program.list_vars(): + if var.persistable: + program_persistables.add(var.name) + + program = Program() + for var_node in train_ir_graph.all_persistable_nodes(): + if var_node.name() not in program_persistables: + var_desc = var_node.var() + var = program.global_block().create_var( + name=var_node.name(), + shape=var_desc.shape(), + dtype=var_desc.dtype(), + type=var_desc.type(), + lod_level=var_desc.lod_level()) + context.optimize_graph.persistables[var.name] = var + + build_strategy = BuildStrategy() + build_strategy.enable_inplace = False + build_strategy.memory_optimize = False + # for quantization training + context.optimize_graph.compiled_graph = CompiledProgram( + train_ir_graph.graph).with_data_parallel( + loss_name=context.optimize_graph.out_nodes['loss'], + build_strategy=build_strategy) + # for evaluation. And program compiled from ir graph must be with data parallel. + context.eval_graph.compiled_graph = CompiledProgram( + test_ir_graph.graph).with_data_parallel( + build_strategy=build_strategy) + # for saving inference model after training + context.put('quantization_test_ir_graph_backup', test_ir_graph) + def on_epoch_begin(self, context): """ Insert fake_quantize_op and fake_dequantize_op before trainging and testing. """ - super(QuantizationStrategy, self).on_compression_begin(context) + super(QuantizationStrategy, self).on_epoch_begin(context) if self.start_epoch == context.epoch_id: _logger.info('QuantizationStrategy::on_epoch_begin') - train_ir_graph = IrGraph( - core.Graph(context.optimize_graph.program.desc), for_test=False) - test_ir_graph = IrGraph( - core.Graph(context.eval_graph.program.desc), for_test=True) - transform_pass = QuantizationTransformPass( - scope=context.scope, - place=context.place, - weight_bits=self.weight_bits, - activation_bits=self.activation_bits, - activation_quantize_type=self.activation_quantize_type) - transform_pass.apply(train_ir_graph) - transform_pass.apply(test_ir_graph) - - build_strategy = BuildStrategy() - build_strategy.enable_inplace = False - build_strategy.memory_optimize = False - # for quantization training - context.optimize_graph.compiled_graph = CompiledProgram( - train_ir_graph.graph).with_data_parallel( - loss_name=context.optimize_graph.out_nodes['loss'], - build_strategy=build_strategy) - # for evaluation. And program compiled from ir graph must be with data parallel. 
- context.eval_graph.compiled_graph = CompiledProgram( - test_ir_graph.graph).with_data_parallel( - build_strategy=build_strategy) - # for saving inference model after training - context.put('quantization_test_ir_graph_backup', test_ir_graph) + self._modify_graph_for_quantization(context) _logger.info('Finish QuantizationStrategy::on_epoch_begin') def on_epoch_end(self, context): @@ -134,7 +174,8 @@ class QuantizationStrategy(Strategy): scope=context.scope, place=context.place, weight_bits=self.weight_bits, - activation_bits=self.activation_bits) + activation_bits=self.activation_bits, + weight_quantize_type=self.weight_quantize_type) freeze_pass.apply(test_ir_graph) # for other strategies diff --git a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml index f29eb53f88d22d87b61f82279b676af5ec1ef497..a3a5a724fbfcac41ed4ab286caac184c2fe104ad 100644 --- a/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml +++ b/python/paddle/fluid/contrib/slim/tests/quantization/compress.yaml @@ -35,6 +35,8 @@ strategies: start_epoch: 0 end_epoch: 0 float_model_save_path: './output/float' + mobile_model_save_path: './output/mobile' + int8_model_save_path: './output/int8' weight_bits: 8 activation_bits: 8 weight_quantize_type: 'abs_max' diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py index c7feca0b82606cdba9a05fb6de821aa6d347d4e6..e896f8bb423a642bada043e3e578033d3bfdea90 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py @@ -256,8 +256,6 @@ class TestQuantizationFreezePass(unittest.TestCase): place=place, activation_quantize_type=activation_quant_type, weight_quantize_type=weight_quant_type) - #transform_pass = QuantizationTransformPass( - # scope=scope, place=place, activation_quantize_type=activation_quant_type) transform_pass.apply(main_graph) transform_pass.apply(test_graph) dev_name = '_gpu_' if use_cuda else '_cpu_' @@ -315,7 +313,6 @@ class TestQuantizationFreezePass(unittest.TestCase): # Freeze graph for inference, but the weight of fc/conv is still float type. 
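# Editor's note (illustrative sketch, not part of the patch): with the weight_quantize_type
# argument added to QuantizationStrategy above (and mirrored in the compress.yaml hunk
# above), a strategy can be configured roughly as follows; the import path and the
# save_in_nodes/save_out_nodes names are assumptions for illustration only:
from paddle.fluid.contrib.slim.quantization.quantization_strategy import QuantizationStrategy

strategy = QuantizationStrategy(
    start_epoch=0,
    end_epoch=0,
    float_model_save_path='./output/float',
    mobile_model_save_path='./output/mobile',
    int8_model_save_path='./output/int8',
    weight_bits=8,
    activation_bits=8,
    activation_quantize_type='abs_max',
    weight_quantize_type='abs_max',   # new argument; 'channel_wise_abs_max' is also supported
    save_in_nodes=['image'],          # hypothetical variable names
    save_out_nodes=['prediction'])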
freeze_pass = QuantizationFreezePass( scope=scope, place=place, weight_quantize_type=weight_quant_type) - #freeze_pass = QuantizationFreezePass(scope=scope, place=place) freeze_pass.apply(test_graph) if not for_ci: marked_nodes = set() diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/dygraph/__init__.py similarity index 100% rename from python/paddle/fluid/imperative/__init__.py rename to python/paddle/fluid/dygraph/__init__.py diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/dygraph/base.py similarity index 88% rename from python/paddle/fluid/imperative/base.py rename to python/paddle/fluid/dygraph/base.py index 097cd2be35b01aced30486b874f202381c4d9962..d55dbbb9c72cb887e169849c3a3e32a13c202a7b 100644 --- a/python/paddle/fluid/imperative/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -22,7 +22,7 @@ __all__ = ['enabled', 'guard', 'to_variable'] def enabled(): - return framework._in_imperative_mode() + return framework._in_dygraph_mode() @signature_safe_contextmanager @@ -39,14 +39,14 @@ def guard(place=None): with framework.program_guard(train, startup): with framework.unique_name.guard(): - with framework._imperative_guard(tracer): - with framework._imperative_place_guard(place): + with framework._dygraph_guard(tracer): + with framework._dygraph_place_guard(place): yield def to_variable(value, block=None, name=None): if isinstance(value, np.ndarray): - assert enabled(), "to_variable could only be called in imperative mode" + assert enabled(), "to_variable could only be called in dygraph mode" if not block: block = framework.default_main_program().current_block() diff --git a/python/paddle/fluid/imperative/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py similarity index 93% rename from python/paddle/fluid/imperative/checkpoint.py rename to python/paddle/fluid/dygraph/checkpoint.py index 37c43f29d2ae9214058238e4f834dbbcd9e42df1..f992ae0576c81ed98a3e9f7a446b0c2e808622ea 100644 --- a/python/paddle/fluid/imperative/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -68,7 +68,7 @@ def save_persistables(vardict, dirname, filename=None): dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden, init_cell) param_path = "./my_paddle_model" - fluid.imperative.checkpoint.save_persistables(ptb_model.state_dict(), dirname=param_path, + fluid.dygraph.save_persistables(ptb_model.state_dict(), dirname=param_path, layer=ptb_model) """ if isinstance(vardict, collections.OrderedDict): @@ -97,17 +97,17 @@ def load_persistables(vardict, dirname, filename=None): Examples: .. 
code-block:: python - my_layer = layer(fluid.imperative.Layer) + my_layer = layer(fluid.dygraph.Layer) param_path = "./my_paddle_model" - param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.parameters(), param_path) + param_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path) param_1 = param_dict['PtbModel_0.w_1'] or: - my_layer = layer(fluid.imperative.Layer) + my_layer = layer(fluid.dygraph.Layer) param_path = "./my_paddle_model" filename = "model.file" - param_dict = fluid.imperative.checkpoint.load_persistables(my_layer.state_dict(), param_path, + param_dict = fluid.dygraph.load_persistables(my_layer.state_dict(), param_path, filename=filename) param_1 = param_dict['PtbModel_0.w_1'] diff --git a/python/paddle/fluid/imperative/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py similarity index 99% rename from python/paddle/fluid/imperative/layer_object_helper.py rename to python/paddle/fluid/dygraph/layer_object_helper.py index 3d4426e8cdfe79a6fa2d6452e7cb3ab0a458c0bc..c56652e103ce93bf5459b30b66c7b1f04e7c14d0 100644 --- a/python/paddle/fluid/imperative/layer_object_helper.py +++ b/python/paddle/fluid/dygraph/layer_object_helper.py @@ -16,7 +16,7 @@ from __future__ import print_function import copy import six -from ..framework import Parameter, _in_imperative_mode +from ..framework import Parameter, _in_dygraph_mode from ..param_attr import ParamAttr from .. import core from six.moves import zip diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/dygraph/layers.py similarity index 99% rename from python/paddle/fluid/imperative/layers.py rename to python/paddle/fluid/dygraph/layers.py index e64667f7f467d0d0a3c07d14ce22c3f231e82eb6..014ee41f4c5aa280fb5b366d8f1704290cc067d4 100644 --- a/python/paddle/fluid/imperative/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -283,7 +283,7 @@ class PyLayer(core.PyLayer): @classmethod def __call__(cls, *inputs): - tracer = framework._imperative_tracer() + tracer = framework._dygraph_tracer() block = framework.default_main_program().current_block() ivar_inputs = [x._ivar for x in inputs] diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/dygraph/nn.py similarity index 99% rename from python/paddle/fluid/imperative/nn.py rename to python/paddle/fluid/dygraph/nn.py index 9856276b20b7affb548847d359463451bb238518..8925381119272d7462562c0952d3e157f78f25af 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -133,7 +133,7 @@ class Conv2D(layers.Layer): outputs={'Out': [pre_act]}, attrs={'axis': 1}) - # Currently, we don't support inplace in imperative mode + # Currently, we don't support inplace in dygraph mode return self._helper.append_activation(pre_act, act=self._act) @@ -265,7 +265,7 @@ class FC(layers.Layer): attrs={'axis': self._num_flatten_dims}) else: pre_activation = pre_bias - # Currently, we don't support inplace in imperative mode + # Currently, we don't support inplace in dygraph mode return self._helper.append_activation(pre_activation, act=self._act) @@ -387,7 +387,7 @@ class BatchNorm(layers.Layer): "use_global_stats": self._use_global_stats }) - # Currently, we don't support inplace in imperative mode + # Currently, we don't support inplace in dygraph mode return self._helper.append_activation(batch_norm_out, self._act) @@ -426,7 +426,7 @@ class Embedding(layers.Layer): dict_size = len(dataset.ids) input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') - embedding = 
fluid.imperative.Embedding(size=[dict_size, 16]) + embedding = fluid.dygraph.Embedding(size=[dict_size, 16]) fc = embedding(input) """ diff --git a/python/paddle/fluid/imperative/profiler.py b/python/paddle/fluid/dygraph/profiler.py similarity index 100% rename from python/paddle/fluid/imperative/profiler.py rename to python/paddle/fluid/dygraph/profiler.py diff --git a/python/paddle/fluid/imperative/tracer.py b/python/paddle/fluid/dygraph/tracer.py similarity index 95% rename from python/paddle/fluid/imperative/tracer.py rename to python/paddle/fluid/dygraph/tracer.py index 28c8586813410f7349da7943a966eaa9cc3816d2..94e212b139b2b375aa9f5252d396e90235ba33c1 100644 --- a/python/paddle/fluid/imperative/tracer.py +++ b/python/paddle/fluid/dygraph/tracer.py @@ -24,12 +24,12 @@ __all__ = ['Tracer'] def release_op(op): - del framework._imperative_tracer()._ops[op._trace_id] + del framework._dygraph_tracer()._ops[op._trace_id] class Tracer(core.Tracer): """ - Python wrapper of imperative tracer + Python wrapper of dygraph tracer """ def __init__(self, block): diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4a5301b436e4eb4c634749e8be736fe43c958348..a49fafa97da45adc25ba7de6d2e5ff19f1a87fc4 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -75,20 +75,20 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() -_imperative_tracer_ = None -_imperative_current_expected_place_ = None +_dygraph_tracer_ = None +_dygraph_current_expected_place_ = None -def _in_imperative_mode(): - return _imperative_tracer_ is not None +def _in_dygraph_mode(): + return _dygraph_tracer_ is not None -def _imperative_tracer(): - return _imperative_tracer_ +def _dygraph_tracer(): + return _dygraph_tracer_ def _current_expected_place(): - return _imperative_current_expected_place_ + return _dygraph_current_expected_place_ def _cpu_num(): @@ -396,7 +396,7 @@ class Variable(object): if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) - if _in_imperative_mode(): + if _in_dygraph_mode(): # record vars in tracer rather than blocks self._ivar = kwargs.get("ivar", None) if not self._ivar: @@ -406,7 +406,7 @@ class Variable(object): _current_expected_place(), stop_gradient, True if persistable else False) if persistable: - _imperative_tracer().trace_var(name, self) + _dygraph_tracer().trace_var(name, self) else: self.error_clip = error_clip @@ -515,8 +515,8 @@ class Variable(object): Returns: str: The debug string. """ - if _in_imperative_mode(): - # TODO(panyx0718): add more imperative debug info. + if _in_dygraph_mode(): + # TODO(panyx0718): add more dygraph debug info. 
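# Editor's note (illustrative sketch, not part of the patch): this part of the change
# renames the imperative package and its helpers to dygraph, e.g.
#   fluid.imperative.guard()          -> fluid.dygraph.guard()
#   framework._in_imperative_mode()   -> framework._in_dygraph_mode()
#   framework._imperative_tracer()    -> framework._dygraph_tracer()
# Minimal usage with the new names:
import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    x = fluid.dygraph.to_variable(np.ones([2, 2], dtype='float32'))
    print(x.shape)   # runs eagerly, no Executor involved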
return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype, self.shape) @@ -548,42 +548,42 @@ class Variable(object): @property def _stop_gradient(self): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.stop_gradient else: return self.stop_gradient @_stop_gradient.setter def _stop_gradient(self, s): - if _in_imperative_mode(): + if _in_dygraph_mode(): self._ivar.stop_gradient = s else: self.stop_gradient = s @property def persistable(self): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.persistable else: return self.desc.persistable() @persistable.setter def persistable(self, p): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.persistable else: self.desc.set_persistable(p) @property def name(self): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.name else: return cpt.to_text(self.desc.name()) @name.setter def name(self, new_name): - if _in_imperative_mode(): + if _in_dygraph_mode(): self._ivar.name = new_name else: self.desc.set_name(new_name) @@ -591,26 +591,26 @@ class Variable(object): @property def shape(self): # convert to tuple, make it as same as numpy API. - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.shape else: return tuple(self.desc.shape()) @property def dtype(self): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.dtype else: return self.desc.dtype() @property def lod_level(self): - # TODO(minqiyang): Support lod_level in imperative mode + # TODO(minqiyang): Support lod_level in dygraph mode return self.desc.lod_level() @property def type(self): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self._ivar.dtype else: return self.desc.type() @@ -918,7 +918,7 @@ class Operator(object): inputs=None, outputs=None, attrs=None): - if _in_imperative_mode(): + if _in_dygraph_mode(): if type is None: raise ValueError( "`type` to initialized an Operator can not be None.") @@ -1037,7 +1037,7 @@ class Operator(object): for arg in out_args: out_arg_names.append(cpt.to_text(arg.name)) # TODO(minqiyang): could we remove variable's op in static mode? - if not _in_imperative_mode(): + if not _in_dygraph_mode(): arg.op = self self.desc.set_output(out_proto.name, out_arg_names) @@ -1083,7 +1083,7 @@ class Operator(object): @property def type(self): - if _in_imperative_mode(): + if _in_dygraph_mode(): return self.iop.type else: return self.desc.type() @@ -1626,7 +1626,7 @@ class Block(object): Returns: Operator: the append Operator. """ - if _in_imperative_mode(): + if _in_dygraph_mode(): op = Operator( block=self, desc=None, @@ -1638,9 +1638,8 @@ class Block(object): # record ops in tracer rather than blocks # # TODO(minqiyang): add op stop_gradient support in static mode too. - # currently, we only support stop_gradient in imperative mode. - _imperative_tracer().trace_op(op, - kwargs.get("stop_gradient", False)) + # currently, we only support stop_gradient in dygraph mode. 
+ _dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False)) else: op_desc = self.desc.append_op() op = Operator( @@ -1699,7 +1698,7 @@ class Block(object): return self.ops[start:end] def _prepend_op(self, *args, **kwargs): - if _in_imperative_mode(): + if _in_dygraph_mode(): op = Operator( self, None, @@ -1707,8 +1706,7 @@ class Block(object): inputs=kwargs.get("inputs", None), outputs=kwargs.get("outputs", None), attrs=kwargs.get("attrs", None)) - _imperative_tracer().trace_op(op, - kwargs.get("stop_gradient", False)) + _dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False)) else: op_desc = self.desc._prepend_op() op = Operator( @@ -2347,40 +2345,6 @@ class IrGraph(object): """ return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()} - def _find_var_node(self, key): - """ - Get a variable node by the `key` from this graph. The key - can be a node name or a node id. - - WARNS: - There are some nodes may have the same name. So, be - cautious about using this method when you find the - target var node by its name. - - Args: - key(str|int): The str type denotes that the target variable node's name. - And the int type denotes that the target variable node's id. - - Raises: - ValueError: If this graph doesn't have a variable with the giving name or id. - - Returns: - IrVarNode: the variable node with the giving name or id. - """ - target_var_node = None - var_nodes = self.all_var_nodes() - if isinstance(key, six.string_types): - for var_node in var_nodes: - if var_node.name() == key: - target_var_node = var_node - elif isinstance(key, int): - for var_node in var_nodes: - if var_node.id() == key: - target_var_node = var_node - if target_var_node is None: - raise ValueError("var_node %s not in this graph" % key) - return target_var_node - def create_persistable_node(self, name, var_type, shape, var_dtype): """ Create a persistable variable node in the graph. In IrGraph, @@ -2525,14 +2489,6 @@ class IrGraph(object): core.graph_safe_remove_nodes(self.graph, original_nodes) def resolve_hazard(self): - def _to_node(nodes, node_name): - target_node = None - for n in nodes: - if n.name() == node_name: - target_node = n - assert target_node is not None, "Cannot find the target node in the giving set." - return target_node - ordered_nodes = core.topology_sort(self.graph) var_nodes = dict() for node in ordered_nodes: @@ -2540,16 +2496,17 @@ class IrGraph(object): for each_var_name in node.op().input_arg_names(): if each_var_name not in var_nodes: var_nodes[each_var_name] = [ - _to_node(node.inputs, each_var_name) + self._find_node_by_name(node.inputs, each_var_name) ] for each_var_name in node.op().output_arg_names(): if each_var_name not in var_nodes: var_nodes[each_var_name] = [ - _to_node(node.outputs, each_var_name) + self._find_node_by_name(node.outputs, each_var_name) ] else: var_nodes[each_var_name].append( - _to_node(node.outputs, each_var_name)) + self._find_node_by_name(node.outputs, + each_var_name)) self.graph.resolve_hazard(var_nodes) def has_circle(self): @@ -2662,6 +2619,17 @@ class IrGraph(object): program = Program._construct_from_desc(desc) return program + def _find_node_by_name(self, nodes, node_name): + """ + Find a node in the giving nodes set by the name. + """ + target_node = None + for n in nodes: + if n.name() == node_name: + target_node = n + assert target_node is not None, "Cannot find the target node in the giving set." 
+ return target_node + def _update_desc_attr(self, desc, name, val): """ Update the value of desc's attribute by attribute's name. @@ -3541,22 +3509,22 @@ def _get_var(name, program=None): @signature_safe_contextmanager -def _imperative_guard(tracer): - global _imperative_tracer_ - tmp_trace = _imperative_tracer_ - _imperative_tracer_ = tracer +def _dygraph_guard(tracer): + global _dygraph_tracer_ + tmp_trace = _dygraph_tracer_ + _dygraph_tracer_ = tracer yield - _imperative_tracer_ = tmp_trace + _dygraph_tracer_ = tmp_trace @signature_safe_contextmanager -def _imperative_place_guard(place): - global _imperative_current_expected_place_ - tmp_place = _imperative_current_expected_place_ - _imperative_current_expected_place_ = place +def _dygraph_place_guard(place): + global _dygraph_current_expected_place_ + tmp_place = _dygraph_current_expected_place_ + _dygraph_current_expected_place_ = place yield - _imperative_current_expected_place_ = tmp_place + _dygraph_current_expected_place_ = tmp_place diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 8358bb1aba98d8f5699cbda27e657ba6c470d333..6aff93dceaf5cfd299bdc9f68246ed579f248f3c 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -165,7 +165,7 @@ class ConstantInitializer(Initializer): 'force_cpu': self._force_cpu or force_init_on_cpu() }, stop_gradient=True) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -245,7 +245,7 @@ class UniformInitializer(Initializer): attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -324,7 +324,7 @@ class NormalInitializer(Initializer): outputs={"Out": var}, attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -403,7 +403,7 @@ class TruncatedNormalInitializer(Initializer): outputs={"Out": var}, attrs={"in_dtype": out_var.dtype, "out_dtype": var.dtype}) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -509,7 +509,7 @@ class XavierInitializer(Initializer): "seed": self._seed }, stop_gradient=True) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -610,7 +610,7 @@ class MSRAInitializer(Initializer): "seed": self._seed }, stop_gradient=True) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -709,7 +709,7 @@ class BilinearInitializer(Initializer): 'shape': list(shape), value_name: values }) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op @@ -768,7 +768,7 @@ class NumpyArrayInitializer(Initializer): value_name: values }, stop_gradient=True) - if not framework._in_imperative_mode(): + if not framework._in_dygraph_mode(): var.op = op return op diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 3569a8bc357daf9408e8ae3eb53ad9d2942cfeaa..3cdd05533f703ac27333daab7ada0c26392a24f5 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -17,7 +17,7 @@ from .param_attr import ParamAttr from .initializer import Constant from . import layers from . import backward -from .imperative import Layer, nn +from .dygraph import Layer, nn from . import executor from . 
import core diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index a85ef3c13f845959200d26391f6c95923a11c6ed..7eb912645e5077d35a2d11d7d09a033d28345e15 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,7 +17,7 @@ from __future__ import print_function import copy import six -from .framework import Parameter, dtype_is_floating, _in_imperative_mode +from .framework import Parameter, dtype_is_floating, _in_dygraph_mode from . import unique_name from paddle.fluid.initializer import Constant, Xavier from .param_attr import ParamAttr @@ -30,9 +30,9 @@ class LayerHelper(LayerHelperBase): def __init__(self, layer_type, **kwargs): self.kwargs = kwargs name = self.kwargs.get('name', None) - # TODO(panyx0718, minqiyang): imperative mode + # TODO(panyx0718, minqiyang): dygraph mode # can not use both `layer_type` and `name`. Deprecate LayerHelper - # and write a Helper for imperative mode. + # and write a Helper for dygraph mode. if name is None: self.kwargs['name'] = unique_name.generate(layer_type) diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index a68160d797bcaca8cff849c560960d6a8823de53..869a5f54e9cdf5740c5e216917d92880d7d61e2d 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -17,7 +17,7 @@ from __future__ import print_function import copy import numpy as np -from .framework import Variable, default_main_program, default_startup_program, _in_imperative_mode, _current_expected_place +from .framework import Variable, default_main_program, default_startup_program, _in_dygraph_mode, _current_expected_place from . import unique_name from .param_attr import ParamAttr, WeightNormParamAttr from . import core @@ -54,8 +54,8 @@ class LayerHelperBase(object): Return Variable construct from value """ if isinstance(value, np.ndarray): - assert _in_imperative_mode( - ), "to_variable could only be called in imperative mode" + assert _in_dygraph_mode( + ), "to_variable could only be called in dygraph mode" if not block: block = default_main_program().current_block() @@ -302,8 +302,8 @@ class LayerHelperBase(object): param = self._create_weight_normalize(attr, shape, dtype) WeightNormParamAttr.params_with_weight_norm.append(param) return param - if _in_imperative_mode(): - # In imperative mode, we want the returned parameter to be + if _in_dygraph_mode(): + # In dygraph mode, we want the returned parameter to be # initialized so that it can be used imperatively. 
return self.main_program.global_block().create_parameter( dtype=dtype, @@ -370,7 +370,7 @@ class LayerHelperBase(object): initializer: initializer to use """ assert isinstance(var, Variable) - if _in_imperative_mode(): + if _in_dygraph_mode(): initializer(var, var.block) else: self.startup_program.global_block().create_var( diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index f2413f603304f8262476ca3ae2b820c89d009c3d..f02496506c6f0ce37d135625aafaa405c88eb8cb 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -23,8 +23,8 @@ import os import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant, NumpyArrayInitializer -from ..framework import Variable, OpProtoHolder, _in_imperative_mode -from ..imperative import base +from ..framework import Variable, OpProtoHolder, _in_dygraph_mode +from ..dygraph import base from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .tensor import concat, assign @@ -32,7 +32,7 @@ from . import utils from .. import unique_name from functools import reduce from .. import core -from ..imperative import layers +from ..dygraph import layers __all__ = [ 'fc', @@ -296,7 +296,6 @@ def fc(input, data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32") fc = fluid.layers.fc(input=[data_1, data_2], size=1000, act="tanh") """ - helper = LayerHelper("fc", **locals()) dtype = helper.input_dtype() @@ -3279,6 +3278,8 @@ def layer_norm(input, >>> dtype='float32') >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1) """ + assert _in_dygraph_mode( + ) is not True, "please use FC instead of fc in dygraph mode!" helper = LayerHelper('layer_norm', **locals()) dtype = helper.input_dtype() @@ -5866,11 +5867,49 @@ def multiplex(inputs, index): """ ${comment} - >>> import paddle.fluid as fluid - >>> x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32') - >>> x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32') - >>> index = fluid.layers.data(name='index', shape=[1], dtype='int32') - >>> out = fluid.layers.multiplex(inputs=[x1, x2], index=index) + For example: + + .. code-block:: text + + case 1: + + Given: + + X = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]], + [[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]], + [[2,0,3,4], [2,1,7,8], [2,2,4,2], [2,3,3,4]], + [[3,0,3,4], [3,1,7,8], [3,2,4,2], [3,3,3,4]]] + + index = [3,0,1,2] + + out:[[3 0 3 4] // X[3,0] (3 = index[i], 0 = i); i=0 + [0 1 3 4] // X[0,1] (0 = index[i], 1 = i); i=1 + [1 2 4 2] // X[1,2] (1 = index[i], 2 = i); i=2 + [2 3 3 4]] // X[2,3] (2 = index[i], 3 = i); i=3 + + case 2: + + Given: + + X = [[[0,0,3,4], [0,1,3,4], [0,2,4,4], [0,3,3,4]], + [[1,0,3,4], [1,1,7,8], [1,2,4,2], [1,3,3,4]]] + + index = [1,0,0,0] + + out:[[1 0 3 4] // X[1,0] (1 = index[i], 0 = i); i=0 + [0 1 3 4] // X[0,1] (0 = index[i], 1 = i); i=1 + [0 2 4 4] // X[0,2] (0 = index[i], 2 = i); i=2 + [0 3 3 4]] // X[0,3] (0 = index[i], 3 = i); i=3 + + Examples: + + .. code-block:: python + + import paddle.fluid as fluid + x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32') + x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32') + index = fluid.layers.data(name='index', shape=[1], dtype='int32') + out = fluid.layers.multiplex(inputs=[x1, x2], index=index) Args: inputs (list): ${x_comment}.
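The annotated cases above all follow the same selection rule: row i of the output is row i of the input tensor picked by index[i]. A minimal NumPy sketch of that rule, as an illustration only (multiplex_ref is a hypothetical helper name, not part of this patch):

    import numpy as np

    def multiplex_ref(inputs, index):
        # Row i of the output is row i of the input selected by index[i].
        out = np.empty_like(inputs[0])
        for i in range(index.shape[0]):
            out[i] = inputs[int(index[i])][i]
        return out

    x1 = np.array([[0, 0, 3, 4], [0, 1, 3, 4], [0, 2, 4, 4], [0, 3, 3, 4]], dtype=np.float32)
    x2 = np.array([[1, 0, 3, 4], [1, 1, 7, 8], [1, 2, 4, 2], [1, 3, 3, 4]], dtype=np.float32)
    index = np.array([1, 0, 0, 0], dtype=np.int32)
    print(multiplex_ref([x1, x2], index))
    # rows come from x2[0], x1[1], x1[2], x1[3], matching case 2 above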
@@ -6405,8 +6444,8 @@ def squeeze(input, axes, name=None): x = layers.data(name='x', shape=[5, 1, 10]) y = layers.sequeeze(input=x, axes=[1]) """ - assert not _in_imperative_mode(), ( - "squeeze layer is not supported in imperative mode yet.") + assert not _in_dygraph_mode(), ( + "squeeze layer is not supported in dygraph mode yet.") helper = LayerHelper("squeeze", **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -9144,7 +9183,7 @@ def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) y = helper.kwargs.get('y', None) - if _in_imperative_mode(): + if _in_dygraph_mode(): x = base.to_variable(x) y = base.to_variable(y) diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index ef90638c721810e618ce4760e83e1a63b86c2325..80450119f44e93aae4b483983484ea18be5b2035 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -20,7 +20,6 @@ from ..framework import convert_np_dtype_to_dtype_ from ..framework import Variable from ..initializer import Constant, force_init_on_cpu from ..core import VarDesc -from ..imperative import base as imperative_base from .layer_function_generator import templatedoc import numpy diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index e21f303a3e07fe176920cd0650fb96f600dd4743..479c0b0a4abef23b9aed646eb34a476e443016d5 100644 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -30,7 +30,6 @@ from .initializer import Constant from .layer_helper import LayerHelper from .layers import ops from .regularizer import append_regularization_ops -from .imperative import base as imperative_base from paddle.fluid import core from paddle.fluid.layers import tensor from functools import reduce @@ -169,7 +168,7 @@ class Optimizer(object): name = self._name + "_" + name if (name in self._accumulators and param.name in self._accumulators[name]): - if framework._in_imperative_mode(): + if framework._in_dygraph_mode(): return self._accumulators[name][param.name] raise Exception("Accumulator {} already exists for parameter {}". 
format(name, param.name)) @@ -396,11 +395,11 @@ class Optimizer(object): """ self._dtype = loss.dtype optimize_ops = [] - if framework._in_imperative_mode(): + if framework._in_dygraph_mode(): if parameter_list is not None: parameters = parameter_list else: - parameters = framework._imperative_tracer().all_parameters() + parameters = framework._dygraph_tracer().all_parameters() params_grads = [] for param in parameters: diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index b84ce2b3aeab7963f8de85eb09ff6e085e52c198..6b8622b6f26f6102e5ee02716f30a847ed9a2fed 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -262,14 +262,14 @@ class OpTest(unittest.TestCase): if isinstance(value, tuple): data = value[0] lod = value[1] - v = fluid.imperative.base.to_variable(value=data) + v = fluid.dygraph.base.to_variable(value=data) v._ivar.value().get_tensor().set_recursive_sequence_lengths(lod) return v else: - return fluid.imperative.base.to_variable(value) + return fluid.dygraph.base.to_variable(value) - def _calc_imperative_output(self, place, parallel=False, no_check_set=None): - with fluid.imperative.base.guard(place=place): + def _calc_dygraph_output(self, place, parallel=False, no_check_set=None): + with fluid.dygraph.base.guard(place=place): block = fluid.default_main_program().global_block() # prepare input variable @@ -316,7 +316,7 @@ class OpTest(unittest.TestCase): return outputs - def _calc_output(self, place, parallel=False, no_check_set=None): + def _calc_output(self, place, parallel=False, no_check_set=None, loss=None): program = Program() block = program.global_block() self._append_ops(block) @@ -329,8 +329,14 @@ class OpTest(unittest.TestCase): use_cuda = False if isinstance(place, fluid.CUDAPlace(0)): use_cuda = True - executor = fluid.ParallelExecutor( - use_cuda=use_cuda, loss_name=loss.name, main_program=program) + if loss: + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, + loss_name=loss.name, + main_program=program) + else: + executor = fluid.ParallelExecutor( + use_cuda=use_cuda, main_program=program) else: executor = Executor(place) @@ -364,9 +370,9 @@ class OpTest(unittest.TestCase): atol, no_check_set=None, equal_nan=False, - check_imperative=False): - if check_imperative: - imperative_outs = self._calc_imperative_output( + check_dygraph=False): + if check_dygraph: + dygraph_outs = self._calc_dygraph_output( place, no_check_set=no_check_set) outs, fetch_list = self._calc_output(place, no_check_set=no_check_set) @@ -393,8 +399,8 @@ class OpTest(unittest.TestCase): type(sub_out)) for item in sub_out: sub_out_name, expect = item[0], item[1] - if check_imperative: - imperative_actual = imperative_outs[sub_out_name][0] + if check_dygraph: + imperative_actual = dygraph_outs[sub_out_name][0] imperative_actual_t = np.array( imperative_actual._ivar.value().get_tensor()) idx = find_actual(sub_out_name, fetch_list) @@ -407,7 +413,7 @@ class OpTest(unittest.TestCase): actual_t, expect_t, atol=atol, equal_nan=equal_nan), "Output (" + sub_out_name + ") has diff at " + str(place)) - if check_imperative: + if check_dygraph: self.assertTrue( np.allclose( imperative_actual_t, @@ -415,21 +421,21 @@ class OpTest(unittest.TestCase): atol=atol, equal_nan=equal_nan), "Output (" + sub_out_name + ") has diff at " + - str(place) + " in imperative mode") + str(place) + " in dygraph mode") if isinstance(expect, tuple): self.assertListEqual( 
actual.recursive_sequence_lengths(), expect[1], "Output (" + sub_out_name + ") has different lod at " + str(place)) - if check_imperative: + if check_dygraph: self.assertListEqual( imperative_actual._ivar.value().get_tensor() .recursive_sequence_lengths(), expect[1], "Output (" + out_name + ") has different lod at " + - str(place) + " in imperative mode") + str(place) + " in dygraph mode") else: - if check_imperative: - imperative_actual = imperative_outs[out_name][0] + if check_dygraph: + imperative_actual = dygraph_outs[out_name][0] imperative_actual_t = np.array( imperative_actual._ivar.value().get_tensor()) idx = find_actual(out_name, fetch_list) @@ -443,7 +449,7 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t) + " in class " + self.__class__.__name__) - if check_imperative: + if check_dygraph: self.assertTrue( np.allclose( imperative_actual_t, @@ -458,12 +464,12 @@ class OpTest(unittest.TestCase): self.assertListEqual(actual.recursive_sequence_lengths(), expect[1], "Output (" + out_name + ") has different lod at " + str(place)) - if check_imperative: + if check_dygraph: self.assertListEqual( imperative_actual._ivar.value().get_tensor() .recursive_sequence_lengths(), expect[1], "Output (" + out_name + ") has different lod at " + - str(place) + " in imperative mode") + str(place) + " in dygraph mode") def _get_places(self): if self.dtype == np.float16: @@ -490,11 +496,11 @@ class OpTest(unittest.TestCase): atol=1e-5, no_check_set=None, equal_nan=False, - check_imperative=False): + check_dygraph=False): places = self._get_places() for place in places: self.check_output_with_place(place, atol, no_check_set, equal_nan, - check_imperative) + check_dygraph) def check_output_customized(self, checker): places = self._get_places() diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 61fd9af1275865f2d03e759199c219b36d3a0b5b..18ed02a72275437fa6106e57c0383e17647d9700 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -43,6 +43,7 @@ class TestParallelExecutorBase(unittest.TestCase): use_ir_memory_optimize=True, enable_inplace=True, fuse_elewise_add_act_ops=False, + fuse_all_optimizer_ops=False, fuse_all_reduce_ops=False, fuse_relu_depthwise_conv=False, optimizer=fluid.optimizer.Adam, @@ -81,6 +82,7 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops build_strategy.fuse_relu_depthwise_conv = fuse_relu_depthwise_conv build_strategy.memory_optimize = False if memory_opt else use_ir_memory_optimize + build_strategy.fuse_all_optimizer_ops = fuse_all_optimizer_ops build_strategy.fuse_all_reduce_ops = fuse_all_reduce_ops # python memory optimization is conflict with inplace pass. # Use ir graph memory optimization after inplace pass is the correct way. 
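The fuse_all_optimizer_ops switch that the test base above now threads into its BuildStrategy is an ordinary strategy flag. A minimal sketch of how such a flag would be set when compiling a program, assuming a toy network on CPU (this snippet is illustrative and not taken from the patch itself):

    import paddle.fluid as fluid
    from paddle.fluid import compiler

    main_program, startup = fluid.Program(), fluid.Program()
    with fluid.program_guard(main_program, startup):
        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        prediction = fluid.layers.fc(img, size=10, act='softmax')
        loss = fluid.layers.mean(
            fluid.layers.cross_entropy(input=prediction, label=label))
        fluid.optimizer.SGD(learning_rate=1e-3).minimize(loss)

    build_strategy = fluid.BuildStrategy()
    # Flag exercised by TestParallelExecutorBase above; when True, the
    # per-parameter optimizer ops are fused before execution, which the new
    # tests compare against an unfused run.
    build_strategy.fuse_all_optimizer_ops = True

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(startup)
    train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)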
diff --git a/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py b/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py index 9d5fe114bad2b2bae73cf18e17ebd7af288a91da..29eb0166b771bbea5509de8b7714bc4608a07cd1 100644 --- a/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py +++ b/python/paddle/fluid/tests/unittests/test_alloc_continuous_space_op.py @@ -16,8 +16,10 @@ from __future__ import print_function import unittest import numpy as np - from op_test import OpTest +from paddle.fluid import core + +alignment = 256 class TestAllocContinuousSpace(OpTest): @@ -29,11 +31,11 @@ class TestAllocContinuousSpace(OpTest): self.constant = attrs["constant"] self.set_constant = attrs["set_constant"] self.Inputs = self.init_input() - self.FusedOutput = self.init_output(self.Inputs, self.set_constant, - self.constant) + self.Outputs, self.FusedOutput = self.init_output( + self.Inputs, self.set_constant, self.constant) self.inputs = {'Input': self.Inputs} self.attrs = attrs - self.outputs = {'Output': self.Inputs, 'FusedOutput': self.FusedOutput} + self.outputs = {'Output': self.Outputs, 'FusedOutput': self.FusedOutput} def init_dtype(self): self.dtype = np.float32 @@ -52,14 +54,31 @@ class TestAllocContinuousSpace(OpTest): return {"copy_data": True, "set_constant": False, "constant": 0.0} def init_output(self, input_list, set_constant, constant): - inputs = [input[1].flatten() for input in input_list] - output = np.concatenate(inputs) + inputs = [] + outputs = input_list + + for input in input_list: + length = len(input[1].flatten()) + aligned_len = (length + alignment) / alignment * alignment + out = np.zeros(int(aligned_len)) + out[0:length] = input[1].flatten() + inputs.append(out) + + alloc_continuous_space_var = np.concatenate([input for input in inputs]) if set_constant: - output = np.ones((len(output))) * constant - return output + alloc_continuous_space_var = np.ones( + (len(alloc_continuous_space_var))) * constant + outputs = [(out[0], + np.ones(out[1].shape).astype(self.dtype) * constant) + for out in outputs] + return outputs, alloc_continuous_space_var def test_check_output(self): - self.check_output() + if core.is_compiled_with_cuda(): + self.check_output_with_place( + place=core.CUDAPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5) class TestAllocContinuousSpace2(TestAllocContinuousSpace): @@ -67,7 +86,11 @@ class TestAllocContinuousSpace2(TestAllocContinuousSpace): return {"copy_data": False, "set_constant": True, "constant": 0.5} def test_check_output(self): - self.check_output(no_check_set=["Output"]) + if core.is_compiled_with_cuda(): + self.check_output_with_place( + place=core.CUDAPlace(0), + no_check_set=["FusedOutput"], + atol=1e-5) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index b12aaea3219cb81e8fa0e7584120db510fb7b62c..9cb88d4a8553f3b750f6cf3b24115b4d188ed1d6 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -18,7 +18,7 @@ import numpy as np import paddle.fluid as fluid -class L1(fluid.imperative.Layer): +class L1(fluid.dygraph.Layer): def __init__(self, prefix): super(L1, self).__init__(prefix) self._param_attr = fluid.ParamAttr( @@ -32,7 +32,7 @@ class L1(fluid.imperative.Layer): return self.w1 + self.w2 -class L2(fluid.imperative.Layer): +class L2(fluid.dygraph.Layer): def __init__(self, prefix): super(L2, self).__init__(prefix) 
self.layer1 = L1(self.full_name()) @@ -42,7 +42,7 @@ class L2(fluid.imperative.Layer): return self.layer1() + self.layer2() -class L3(fluid.imperative.Layer): +class L3(fluid.dygraph.Layer): def __init__(self, prefix): super(L3, self).__init__(prefix) self.layer1 = L2(self.full_name()) @@ -54,7 +54,7 @@ class L3(fluid.imperative.Layer): class TestBaseLayer(unittest.TestCase): def test_one_level(self): - with fluid.imperative.guard(): + with fluid.dygraph.guard(): l = L1('test_one_level') ret = l() self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0") @@ -62,7 +62,7 @@ class TestBaseLayer(unittest.TestCase): self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2]))) def test_three_level(self): - with fluid.imperative.guard(): + with fluid.dygraph.guard(): l = L3('test_three_level') names = [p.name for p in l.parameters()] ret = l() diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py new file mode 100644 index 0000000000000000000000000000000000000000..93e67deaf3c9f7fe17296049137fbbe00374c6f1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py @@ -0,0 +1,135 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from parallel_executor_test_base import TestParallelExecutorBase +import paddle.fluid as fluid +import paddle.fluid.core as core +import numpy as np +import paddle +import paddle.dataset.mnist as mnist +import unittest +import os + + +def simple_fc_net(use_feed): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + hidden = img + for _ in range(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='relu', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def fc_with_batchnorm(use_feed): + img = fluid.layers.data(name='image', shape=[784], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + + hidden = img + for _ in range(2): + hidden = fluid.layers.fc( + hidden, + size=200, + act='relu', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + hidden = fluid.layers.batch_norm(input=hidden) + + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +class TestFuseAdamOps(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + os.environ['CPU_NUM'] = str(4) + + def _init_data(self, random=True): + np.random.seed(5) + if random: + img = np.random.random(size=[32, 784]).astype(np.float32) + else: + img = np.ones(shape=[32, 784], dtype='float32') + label = np.ones(shape=[32, 1], dtype='int64') + return img, label + + def _compare_fused_optimizer_ops(self, + model, + use_cuda, + random_data=True, + optimizer=fluid.optimizer.Adam): + if use_cuda and not core.is_compiled_with_cuda(): + return + img, label = self._init_data(random_data) + not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + fuse_all_optimizer_ops=False, + memory_opt=False, # avoid the gradient's name changed in Python side. + optimizer=optimizer) + fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( + model, + feed_dict={"image": img, + "label": label}, + use_cuda=use_cuda, + fuse_all_optimizer_ops=True, + memory_opt=False, # avoid the gradient's name changed in Python side. 
+ optimizer=optimizer) + + for loss in zip(not_fuse_op_first_loss, fuse_op_first_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss): + self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) + + def test_simple_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops(simple_fc_net, True) + self._compare_fused_optimizer_ops(simple_fc_net, False) + + def test_batchnorm_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops(fc_with_batchnorm, True) + # self._compare_fused_optimizer_ops(fc_with_batchnorm, False) + + +class TestFuseSGDOps(TestFuseAdamOps): + def sgd_optimizer(self, learning_rate=1e-4): + return fluid.optimizer.SGD(learning_rate=learning_rate) + + def test_simple_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops( + simple_fc_net, True, optimizer=self.sgd_optimizer) + self._compare_fused_optimizer_ops( + simple_fc_net, False, optimizer=self.sgd_optimizer) + + def test_batchnorm_fc_with_fuse_op(self): + self._compare_fused_optimizer_ops( + fc_with_batchnorm, True, optimizer=self.sgd_optimizer) + self._compare_fused_optimizer_ops( + fc_with_batchnorm, False, optimizer=self.sgd_optimizer) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gru_op.py b/python/paddle/fluid/tests/unittests/test_gru_op.py index 848c9a4952aebcf93fd7bf12f7bc4cd15c7a8b28..c66d59aceb05dfbf9beac809ff13841a77953695 100644 --- a/python/paddle/fluid/tests/unittests/test_gru_op.py +++ b/python/paddle/fluid/tests/unittests/test_gru_op.py @@ -156,7 +156,7 @@ class TestGRUOp(OpTest): } def test_check_output(self): - self.check_output(atol=1e-8, check_imperative=True) + self.check_output(atol=1e-8, check_dygraph=True) def test_check_grad(self): self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden']) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 4c44195a3d42a1a2a4a072b0513f212b22269c31..13f2d662178c7e1474ec43fdeadf7046516eb8e5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -18,11 +18,11 @@ import numpy as np import paddle.fluid as fluid from paddle.fluid import core -from paddle.fluid.imperative.nn import FC +from paddle.fluid.dygraph.nn import FC from test_imperative_base import new_program_scope -class MyLayer(fluid.imperative.Layer): +class MyLayer(fluid.dygraph.Layer): def __init__(self, name_scope): super(MyLayer, self).__init__(name_scope) @@ -34,7 +34,7 @@ class MyLayer(fluid.imperative.Layer): return [x] -class MyPyLayer(fluid.imperative.PyLayer): +class MyPyLayer(fluid.dygraph.PyLayer): def __init__(self): super(MyPyLayer, self).__init__() @@ -48,7 +48,7 @@ class MyPyLayer(fluid.imperative.PyLayer): return np.array(dout) * (1 - np.square(np.array(out))) -class MLP(fluid.imperative.Layer): +class MLP(fluid.dygraph.Layer): def __init__(self, name_scope): super(MLP, self).__init__(name_scope) self._fc1 = FC(self.full_name(), @@ -71,7 +71,7 @@ class MLP(fluid.imperative.Layer): return x -class SimpleRNNCell(fluid.imperative.Layer): +class SimpleRNNCell(fluid.dygraph.Layer): def __init__(self, name_scope, step_input_size, hidden_size, output_size, param_attr): super(SimpleRNNCell, self).__init__(name_scope) @@ -159,7 +159,7 @@ class SimpleRNNCell(fluid.imperative.Layer): return reduce_out, hidden -class SimpleRNN(fluid.imperative.Layer): +class SimpleRNN(fluid.dygraph.Layer): def 
__init__(self, name_scope): super(SimpleRNN, self).__init__(name_scope) self.seq_len = 4 @@ -194,10 +194,10 @@ class SimpleRNN(fluid.imperative.Layer): class TestImperative(unittest.TestCase): def test_sum_op(self): x = np.ones([2, 2], np.float32) - with fluid.imperative.guard(): + with fluid.dygraph.guard(): inputs = [] for _ in range(10): - inputs.append(fluid.imperative.base.to_variable(x)) + inputs.append(fluid.dygraph.base.to_variable(x)) ret = fluid.layers.sums(inputs) loss = fluid.layers.reduce_sum(ret) loss._backward() @@ -205,17 +205,17 @@ class TestImperative(unittest.TestCase): self.assertTrue(np.allclose(inputs[0]._gradient(), x)) def test_layer(self): - with fluid.imperative.guard(): + with fluid.dygraph.guard(): cl = core.Layer() cl.forward([]) - l = fluid.imperative.Layer("l") + l = fluid.dygraph.Layer("l") self.assertRaises(NotImplementedError, l.forward, []) def test_pylayer_func_id(self): - with fluid.imperative.guard(): + with fluid.dygraph.guard(): - class PyLayer1(fluid.imperative.PyLayer): + class PyLayer1(fluid.dygraph.PyLayer): def __init__(self): super(PyLayer1, self).__init__() @@ -227,7 +227,7 @@ class TestImperative(unittest.TestCase): def backward(input): return input - class PyLayer2(fluid.imperative.PyLayer): + class PyLayer2(fluid.dygraph.PyLayer): def __init__(self): super(PyLayer2, self).__init__() @@ -241,21 +241,21 @@ class TestImperative(unittest.TestCase): py_layer_1 = PyLayer1() py_layer_2 = PyLayer2() - py_layer_1(fluid.imperative.base.to_variable(np.ones([2, 2]))) - py_layer_2(fluid.imperative.base.to_variable(np.ones([2, 2]))) + py_layer_1(fluid.dygraph.base.to_variable(np.ones([2, 2]))) + py_layer_2(fluid.dygraph.base.to_variable(np.ones([2, 2]))) id = py_layer_1.forward_id self.assertGreater(id, 0) self.assertEqual(py_layer_1.backward_id, id + 1) self.assertEqual(py_layer_2.forward_id, id + 2) self.assertEqual(py_layer_2.backward_id, id + 3) - py_layer_1(fluid.imperative.base.to_variable(np.ones([2, 2]))) + py_layer_1(fluid.dygraph.base.to_variable(np.ones([2, 2]))) self.assertEqual(py_layer_1.forward_id, id) def test_pylayer(self): np_inp = np.ones([2, 2], np.float32) - with fluid.imperative.guard(): + with fluid.dygraph.guard(): my_py_layer = MyPyLayer() - var_inp = fluid.imperative.base.to_variable(np_inp) + var_inp = fluid.dygraph.base.to_variable(np_inp) outs = my_py_layer(var_inp) dy_out = np.sum(outs[0]._numpy()) outs[0]._backward() @@ -282,8 +282,8 @@ class TestImperative(unittest.TestCase): def test_layer_in_out(self): np_inp = np.array([1.0, 2.0, -1.0], dtype=np.float32) - with fluid.imperative.guard(): - var_inp = fluid.imperative.base.to_variable(np_inp) + with fluid.dygraph.guard(): + var_inp = fluid.dygraph.base.to_variable(np_inp) l = MyLayer("my_layer") x = l(var_inp)[0] self.assertIsNotNone(x) @@ -310,8 +310,8 @@ class TestImperative(unittest.TestCase): def test_mlp(self): np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - with fluid.imperative.guard(): - var_inp = fluid.imperative.base.to_variable(np_inp) + with fluid.dygraph.guard(): + var_inp = fluid.dygraph.base.to_variable(np_inp) mlp = MLP("mlp") out = mlp(var_inp) dy_out = out._numpy() @@ -353,8 +353,8 @@ class TestImperative(unittest.TestCase): [10.0, 11.0, 12.0]]) np_inp = np_inp.reshape((1, 4, 3)) np_inp = np_inp.astype(np.float32) - with fluid.imperative.guard(): - var_inp = fluid.imperative.base.to_variable(np_inp) + with fluid.dygraph.guard(): + var_inp = fluid.dygraph.base.to_variable(np_inp) var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3]) 
simple_rnn = SimpleRNN("simple_rnn") outs, pre_hiddens = simple_rnn.forward(var_inp) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py index 62c25f734598e35b7c668d1ec1b89b5c57449f73..a92b7d62fa598a3ec9b53bade2805cc033f4b9d9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py @@ -18,11 +18,11 @@ import numpy as np import paddle import paddle.fluid as fluid from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC +from paddle.fluid.dygraph.base import to_variable -class SimpleImgConvPool(fluid.imperative.Layer): +class SimpleImgConvPool(fluid.dygraph.Layer): def __init__(self, name_scope, num_channels, @@ -71,7 +71,7 @@ class SimpleImgConvPool(fluid.imperative.Layer): return x -class MNIST(fluid.imperative.Layer): +class MNIST(fluid.dygraph.Layer): def __init__(self, name_scope): super(MNIST, self).__init__(name_scope) @@ -98,12 +98,12 @@ class MNIST(fluid.imperative.Layer): return x -class TestImperativeCheckpoint(unittest.TestCase): +class TestDygraphCheckpoint(unittest.TestCase): def save_load_persistables(self): seed = 90 epoch_num = 1 - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed @@ -135,14 +135,14 @@ class TestImperativeCheckpoint(unittest.TestCase): avg_loss._backward() sgd.minimize(avg_loss) - fluid.imperative.save_persistables(mnist, "save_dir") + fluid.dygraph.save_persistables(mnist, "save_dir") mnist.clear_gradients() for param in mnist.parameters(): dy_param_init_value[param.name] = param._numpy() mnist.load_dict( - fluid.imperative.load_persistables(mnist, "save_dir")) + fluid.dygraph.load_persistables(mnist, "save_dir")) restore = mnist.parameters() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index ac123ee8db26ac23bbf9454e399a592a28c91c32..ccebd4a54727f383bd4e46ff57bfdc9381577d05 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -22,7 +22,7 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core from test_imperative_base import new_program_scope -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.base import to_variable # Can use Amusic dataset as the DeepCF describes. 
DATA_PATH = os.environ.get('DATA_PATH', '') @@ -32,11 +32,11 @@ NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5)) NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1)) -class DMF(fluid.imperative.Layer): +class DMF(fluid.dygraph.Layer): def __init__(self, name_scope): super(DMF, self).__init__(name_scope) - self._user_latent = fluid.imperative.FC(self.full_name(), 256) - self._item_latent = fluid.imperative.FC(self.full_name(), 256) + self._user_latent = fluid.dygraph.FC(self.full_name(), 256) + self._item_latent = fluid.dygraph.FC(self.full_name(), 256) self._user_layers = [] self._item_layers = [] @@ -45,12 +45,12 @@ class DMF(fluid.imperative.Layer): self._user_layers.append( self.add_sublayer( 'user_layer_%d' % i, - fluid.imperative.FC( + fluid.dygraph.FC( self.full_name(), self._hid_sizes[i], act='relu'))) self._item_layers.append( self.add_sublayer( 'item_layer_%d' % i, - fluid.imperative.FC( + fluid.dygraph.FC( self.full_name(), self._hid_sizes[i], act='relu'))) def forward(self, users, items): @@ -63,18 +63,18 @@ class DMF(fluid.imperative.Layer): return fluid.layers.elementwise_mul(users, items) -class MLP(fluid.imperative.Layer): +class MLP(fluid.dygraph.Layer): def __init__(self, name_scope): super(MLP, self).__init__(name_scope) - self._user_latent = fluid.imperative.FC(self.full_name(), 256) - self._item_latent = fluid.imperative.FC(self.full_name(), 256) + self._user_latent = fluid.dygraph.FC(self.full_name(), 256) + self._item_latent = fluid.dygraph.FC(self.full_name(), 256) self._match_layers = [] self._hid_sizes = [128, 64] for i in range(len(self._hid_sizes)): self._match_layers.append( self.add_sublayer( 'match_layer_%d' % i, - fluid.imperative.FC( + fluid.dygraph.FC( self.full_name(), self._hid_sizes[i], act='relu'))) self._mat @@ -88,7 +88,7 @@ class MLP(fluid.imperative.Layer): return match_vec -class DeepCF(fluid.imperative.Layer): +class DeepCF(fluid.dygraph.Layer): def __init__(self, name_scope, num_users, num_items, matrix): super(DeepCF, self).__init__(name_scope) self._num_users = num_users @@ -103,7 +103,7 @@ class DeepCF(fluid.imperative.Layer): self._mlp = MLP(self.full_name()) self._dmf = DMF(self.full_name()) - self._match_fc = fluid.imperative.FC(self.full_name(), 1, act='sigmoid') + self._match_fc = fluid.dygraph.FC(self.full_name(), 1, act='sigmoid') def forward(self, users, items): # users_emb = self._user_emb(users) @@ -191,7 +191,7 @@ def load_data(DATA_PATH): np.expand_dims(labels_np, -1), num_users, num_items, matrix -class TestImperativeDeepCF(unittest.TestCase): +class TestDygraphDeepCF(unittest.TestCase): def test_deefcf(self): seed = 90 if DATA_PATH: @@ -237,7 +237,7 @@ class TestImperativeDeepCF(unittest.TestCase): fetch_list=[loss])[0] sys.stderr.write('static loss %s\n' % static_loss) - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index 6024fb5f816d10cedad36272e353704797526676..58faa1cb85af9cedb70f3a12244cfeb44e0f4f52 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -22,12 +22,12 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC +from paddle.fluid.dygraph.nn import Conv2D, 
Pool2D, FC from test_imperative_base import new_program_scope -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.base import to_variable -class Discriminator(fluid.imperative.Layer): +class Discriminator(fluid.dygraph.Layer): def __init__(self, name_scope): super(Discriminator, self).__init__(name_scope) self._fc1 = FC(self.full_name(), size=32, act='elu') @@ -38,7 +38,7 @@ class Discriminator(fluid.imperative.Layer): return self._fc2(x) -class Generator(fluid.imperative.Layer): +class Generator(fluid.dygraph.Layer): def __init__(self, name_scope): super(Generator, self).__init__(name_scope) self._fc1 = FC(self.full_name(), size=64, act='elu') @@ -51,7 +51,7 @@ class Generator(fluid.imperative.Layer): return self._fc3(x) -class TestImperativeGAN(unittest.TestCase): +class TestDygraphGAN(unittest.TestCase): def test_gan_float32(self): seed = 90 @@ -130,7 +130,7 @@ class TestImperativeGAN(unittest.TestCase): scope.find_var(param.name).get_tensor()) dy_params = dict() - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index 2086fab5c81e241d1a49386d8285289b14364dc8..a8fb9ecfe4be16b73ac2144259f25ed3859ece7e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -22,16 +22,16 @@ import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.optimizer import AdamOptimizer -from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC from test_imperative_base import new_program_scope -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.base import to_variable def gen_data(): pass -class GraphConv(fluid.imperative.Layer): +class GraphConv(fluid.dygraph.Layer): def __init__(self, name_scope, in_features, out_features): super(GraphConv, self).__init__(name_scope) @@ -50,7 +50,7 @@ class GraphConv(fluid.imperative.Layer): return fluid.layers.matmul(adj, support) + self.bias -class GCN(fluid.imperative.Layer): +class GCN(fluid.dygraph.Layer): def __init__(self, name_scope, num_hidden): super(GCN, self).__init__(name_scope) self.gc = GraphConv(self.full_name(), num_hidden, 32) @@ -61,7 +61,7 @@ class GCN(fluid.imperative.Layer): return self.gc2(x, adj) -class TestImperativeGNN(unittest.TestCase): +class TestDygraphGNN(unittest.TestCase): def test_gnn_float32(self): seed = 90 @@ -115,7 +115,7 @@ class TestImperativeGNN(unittest.TestCase): static_weight = np.array( scope.find_var(model.gc.weight.name).get_tensor()) - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 5b3c250501386a7854313218f5ea338281824252..829274afc7e17fb0b5f4d8200c5e1f7bbbe02393 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -23,12 +23,12 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.imperative.nn import Conv2D, Pool2D, 
FC -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC +from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope -class SimpleImgConvPool(fluid.imperative.Layer): +class SimpleImgConvPool(fluid.dygraph.Layer): def __init__(self, name_scope, num_channels, @@ -77,7 +77,7 @@ class SimpleImgConvPool(fluid.imperative.Layer): return x -class MNIST(fluid.imperative.Layer): +class MNIST(fluid.dygraph.Layer): def __init__(self, name_scope): super(MNIST, self).__init__(name_scope) @@ -104,11 +104,11 @@ class MNIST(fluid.imperative.Layer): return x -class TestImperativeMnist(unittest.TestCase): +class TestDygraphMnist(unittest.TestCase): def test_mnist_float32(self): seed = 90 epoch_num = 1 - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index 460ba65a48c863315cda4847aee1b4e2366bba96..998c675815ece9236c819bffc4a4b74d44ff790e 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -16,17 +16,17 @@ from __future__ import print_function import unittest import paddle.fluid as fluid -from paddle.fluid.imperative.nn import Embedding +from paddle.fluid.dygraph.nn import Embedding import paddle.fluid.framework as framework from paddle.fluid.optimizer import SGDOptimizer -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.base import to_variable from test_imperative_base import new_program_scope import numpy as np import six from paddle.fluid.backward import append_backward -class SimpleLSTMRNN(fluid.imperative.Layer): +class SimpleLSTMRNN(fluid.dygraph.Layer): def __init__(self, name_scope, hidden_size, @@ -131,7 +131,7 @@ class SimpleLSTMRNN(fluid.imperative.Layer): return real_res, last_hidden, last_cell -class PtbModel(fluid.imperative.Layer): +class PtbModel(fluid.dygraph.Layer): def __init__(self, name_scope, hidden_size, @@ -214,7 +214,7 @@ class PtbModel(fluid.imperative.Layer): return loss, last_hidden, last_cell -class TestImperativePtbRnn(unittest.TestCase): +class TestDygraphPtbRnn(unittest.TestCase): def test_ptb_rnn_cpu_float32(self): seed = 90 hidden_size = 10 @@ -224,7 +224,7 @@ class TestImperativePtbRnn(unittest.TestCase): init_scale = 0.1 batch_size = 4 - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed # TODO: marsyang1993 Change seed to diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index ab9298890bf69774fd842ec202d833be0a57f7ad..1d786d584632769e4318bcdeb24ef7ef8ea18597 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -21,8 +21,8 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.imperative.nn import Conv2D, Pool2D, BatchNorm, FC -from paddle.fluid.imperative.base import to_variable +from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC +from paddle.fluid.dygraph.base import to_variable from test_imperative_base import 
new_program_scope batch_size = 8 @@ -57,7 +57,7 @@ def optimizer_setting(params): lr = [] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] optimizer = fluid.optimizer.SGD(learning_rate=0.01) - # TODO(minqiyang): Add learning rate scheduler support to imperative mode + # TODO(minqiyang): Add learning rate scheduler support to dygraph mode # optimizer = fluid.optimizer.Momentum( # learning_rate=params["lr"], # learning_rate=fluid.layers.piecewise_decay( @@ -68,7 +68,7 @@ def optimizer_setting(params): return optimizer -class ConvBNLayer(fluid.imperative.Layer): +class ConvBNLayer(fluid.dygraph.Layer): def __init__(self, name_scope, num_channels, @@ -99,7 +99,7 @@ class ConvBNLayer(fluid.imperative.Layer): return y -class BottleneckBlock(fluid.imperative.Layer): +class BottleneckBlock(fluid.dygraph.Layer): def __init__(self, name_scope, num_channels, @@ -156,7 +156,7 @@ class BottleneckBlock(fluid.imperative.Layer): return layer_helper.append_activation(y) -class ResNet(fluid.imperative.Layer): +class ResNet(fluid.dygraph.Layer): def __init__(self, name_scope, layers=50, class_dim=102): super(ResNet, self).__init__(name_scope) @@ -226,13 +226,13 @@ class ResNet(fluid.imperative.Layer): return y -class TestImperativeResnet(unittest.TestCase): +class TestDygraphResnet(unittest.TestCase): def test_resnet_float32(self): seed = 90 batch_size = train_parameters["batch_size"] batch_num = 20 - with fluid.imperative.guard(): + with fluid.dygraph.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py index b06d3e8894072943b06456340f928cda260763c3..3bdf3349730b0c9916449cfe0658d5a3c88834ed 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import paddle.fluid as fluid -from paddle.fluid.imperative import Embedding, LayerNorm, FC, to_variable, Layer, guard +from paddle.fluid.dygraph import Embedding, LayerNorm, FC, to_variable, Layer, guard from test_imperative_base import new_program_scope from paddle.fluid import core import numpy as np @@ -623,7 +623,7 @@ class PrepareEncoderDecoderLayer(Layer): initializer=fluid.initializer.NumpyArrayInitializer(pos_inp), trainable=False)) - # use in imperative_mode to fit different length batch + # use in dygraph_mode to fit different length batch # self._pos_emb._w = to_variable( # position_encoding_init(self._src_max_len, self._src_emb_dim)) @@ -946,7 +946,7 @@ class TransFormer(Layer): return sum_cost, avg_cost, predict, token_num -class TestImperativeTransformer(unittest.TestCase): +class TestDygraphTransformer(unittest.TestCase): def test_transformer_float32(self): seed = 90 with guard(): diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 7fd9617cc7687a5a553ed22cfed560aef8058496..90487d4ef22cd47c5e503bebf40c7ac8adfd83e1 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -29,8 +29,8 @@ from paddle.fluid import core from paddle.fluid.initializer import Constant import paddle.fluid.layers as layers from test_imperative_base import new_program_scope -from paddle.fluid.imperative import nn -from paddle.fluid.imperative import base +from 
paddle.fluid.dygraph import nn +from paddle.fluid.dygraph import base class LayerTest(unittest.TestCase): @@ -68,7 +68,7 @@ class LayerTest(unittest.TestCase): @contextlib.contextmanager def dynamic_graph(self, force_to_use_cpu=False): - with fluid.imperative.guard( + with fluid.dygraph.guard( self._get_place(force_to_use_cpu=force_to_use_cpu)): fluid.default_startup_program().random_seed = self.seed fluid.default_main_program().random_seed = self.seed diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index ba63213a410b8b2579b6842c5a6ecd720c7957b3..6671a2def3cccd2acd76025e73486b06b4bb1471 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -61,6 +61,11 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, param_attr=fluid.ParamAttr( name=embedding_name, trainable=False)) for x in word_input ] + # TODO(zcd): if the parameter is not trainable, the + # parameter's gradient should not generated. + for emb_layer in emb_layers: + emb_layer.stop_gradient = True + emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) @@ -113,60 +118,62 @@ class TestCRFModel(unittest.TestCase): os.environ['CPU_NUM'] = str(4) main = fluid.Program() startup = fluid.Program() - with fluid.program_guard(main, startup): - word = fluid.layers.data( - name='word_data', shape=[1], dtype='int64', lod_level=1) - predicate = fluid.layers.data( - name='verb_data', shape=[1], dtype='int64', lod_level=1) - ctx_n2 = fluid.layers.data( - name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) - ctx_n1 = fluid.layers.data( - name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) - ctx_0 = fluid.layers.data( - name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) - ctx_p1 = fluid.layers.data( - name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) - ctx_p2 = fluid.layers.data( - name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) - mark = fluid.layers.data( - name='mark_data', shape=[1], dtype='int64', lod_level=1) - - feature_out = db_lstm(**locals()) - target = fluid.layers.data( - name='target', shape=[1], dtype='int64', lod_level=1) - crf_cost = fluid.layers.linear_chain_crf( - input=feature_out, - label=target, - param_attr=fluid.ParamAttr( - name='crfw', learning_rate=1e-1)) - avg_cost = fluid.layers.mean(crf_cost) - - sgd_optimizer = fluid.optimizer.SGD( - learning_rate=fluid.layers.exponential_decay( - learning_rate=0.01, - decay_steps=100000, - decay_rate=0.5, - staircase=True)) - sgd_optimizer.minimize(avg_cost) - - train_data = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.conll05.test(), buf_size=8192), - batch_size=16) - - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup) - - train_cp = compiler.CompiledProgram(main).with_data_parallel( - loss_name=avg_cost.name, build_strategy=build_strategy) - - feeder = fluid.DataFeeder( - feed_list=[ - word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, - mark, target - ], - place=fluid.CPUPlace()) + scope = fluid.Scope() + with fluid.scope_guard(scope): + with fluid.program_guard(main, startup): + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', 
@@ -113,60 +118,62 @@ class TestCRFModel(unittest.TestCase):
         os.environ['CPU_NUM'] = str(4)
         main = fluid.Program()
         startup = fluid.Program()
-        with fluid.program_guard(main, startup):
-            word = fluid.layers.data(
-                name='word_data', shape=[1], dtype='int64', lod_level=1)
-            predicate = fluid.layers.data(
-                name='verb_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_n2 = fluid.layers.data(
-                name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_n1 = fluid.layers.data(
-                name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_0 = fluid.layers.data(
-                name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_p1 = fluid.layers.data(
-                name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
-            ctx_p2 = fluid.layers.data(
-                name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
-            mark = fluid.layers.data(
-                name='mark_data', shape=[1], dtype='int64', lod_level=1)
-
-            feature_out = db_lstm(**locals())
-            target = fluid.layers.data(
-                name='target', shape=[1], dtype='int64', lod_level=1)
-            crf_cost = fluid.layers.linear_chain_crf(
-                input=feature_out,
-                label=target,
-                param_attr=fluid.ParamAttr(
-                    name='crfw', learning_rate=1e-1))
-            avg_cost = fluid.layers.mean(crf_cost)
-
-            sgd_optimizer = fluid.optimizer.SGD(
-                learning_rate=fluid.layers.exponential_decay(
-                    learning_rate=0.01,
-                    decay_steps=100000,
-                    decay_rate=0.5,
-                    staircase=True))
-            sgd_optimizer.minimize(avg_cost)
-
-            train_data = paddle.batch(
-                paddle.reader.shuffle(
-                    paddle.dataset.conll05.test(), buf_size=8192),
-                batch_size=16)
-
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-            exe = fluid.Executor(place)
-            exe.run(startup)
-
-            train_cp = compiler.CompiledProgram(main).with_data_parallel(
-                loss_name=avg_cost.name, build_strategy=build_strategy)
-
-            feeder = fluid.DataFeeder(
-                feed_list=[
-                    word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
-                    mark, target
-                ],
-                place=fluid.CPUPlace())
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(main, startup):
+                word = fluid.layers.data(
+                    name='word_data', shape=[1], dtype='int64', lod_level=1)
+                predicate = fluid.layers.data(
+                    name='verb_data', shape=[1], dtype='int64', lod_level=1)
+                ctx_n2 = fluid.layers.data(
+                    name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+                ctx_n1 = fluid.layers.data(
+                    name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+                ctx_0 = fluid.layers.data(
+                    name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+                ctx_p1 = fluid.layers.data(
+                    name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+                ctx_p2 = fluid.layers.data(
+                    name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+                mark = fluid.layers.data(
+                    name='mark_data', shape=[1], dtype='int64', lod_level=1)
+
+                feature_out = db_lstm(**locals())
+                target = fluid.layers.data(
+                    name='target', shape=[1], dtype='int64', lod_level=1)
+                crf_cost = fluid.layers.linear_chain_crf(
+                    input=feature_out,
+                    label=target,
+                    param_attr=fluid.ParamAttr(
+                        name='crfw', learning_rate=1e-1))
+                avg_cost = fluid.layers.mean(crf_cost)
+
+                sgd_optimizer = fluid.optimizer.SGD(
+                    learning_rate=fluid.layers.exponential_decay(
+                        learning_rate=0.01,
+                        decay_steps=100000,
+                        decay_rate=0.5,
+                        staircase=True))
+                sgd_optimizer.minimize(avg_cost)
+
+                train_data = paddle.batch(
+                    paddle.reader.shuffle(
+                        paddle.dataset.conll05.test(), buf_size=8192),
+                    batch_size=16)
+
+                place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+                exe = fluid.Executor(place)
+                exe.run(startup)
+
+                train_cp = compiler.CompiledProgram(main).with_data_parallel(
+                    loss_name=avg_cost.name, build_strategy=build_strategy)
+
+                feeder = fluid.DataFeeder(
+                    feed_list=[
+                        word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
+                        mark, target
+                    ],
+                    place=fluid.CPUPlace())

         data = train_data()
         for i in range(10):
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
index 17f8f5a0b4f753aabe8af3f97c2018cd2cf54dc1..d0eca7d6dfbdf03828125508c798a9bd31f8bbd6 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_dry_run.py
@@ -41,14 +41,15 @@ class TestBase(unittest.TestCase):
                 fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace())
             exe.run(startup_prog)

-            for _ in six.moves.xrange(iter):
-                exe_strategy = fluid.ExecutionStrategy()
-                exe_strategy._dry_run = True
-                exe_strategy.use_experimental_executor = use_experimental_executor
-                train_cp = compiler.CompiledProgram(main_prog).with_data_parallel(
-                    loss_name=loss.name, exec_strategy=exe_strategy)
-                for _ in six.moves.xrange(iter_per_pe):
-                    exe.run(train_cp)
+            exe_strategy = fluid.ExecutionStrategy()
+            exe_strategy._dry_run = True
+            exe_strategy.use_experimental_executor = use_experimental_executor
+            train_cp = compiler.CompiledProgram(
+                main_prog).with_data_parallel(
+                    loss_name=loss.name, exec_strategy=exe_strategy)
+            for _ in six.moves.xrange(iter):
+                for _ in six.moves.xrange(iter_per_pe):
+                    exe.run(train_cp)


 class TestMNISTDryRun(TestBase):
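Note (not part of the patch): the dry-run test now builds the ExecutionStrategy and CompiledProgram once and only repeats exe.run, instead of recompiling inside the outer loop. A condensed sketch of that pattern, assuming main_prog, startup_prog, loss and use_gpu are prepared as in the test:

    import paddle.fluid as fluid
    from paddle.fluid import compiler

    exe = fluid.Executor(fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace())
    exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy._dry_run = True  # skip the actual kernel computation
    compiled = compiler.CompiledProgram(main_prog).with_data_parallel(
        loss_name=loss.name, exec_strategy=exec_strategy)

    for _ in range(4):         # outer iterations
        for _ in range(2):     # runs per iteration
            exe.run(compiled)  # reuse the same compiled program every time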
diff --git a/python/paddle/fluid/tests/unittests/test_variable.py b/python/paddle/fluid/tests/unittests/test_variable.py
index 076ee3baf96ab3c16f3ed9a3b9a15e2eb2aaed77..601da5839015efd81ea302e1cae65ba3c7bb22fc 100644
--- a/python/paddle/fluid/tests/unittests/test_variable.py
+++ b/python/paddle/fluid/tests/unittests/test_variable.py
@@ -19,7 +19,6 @@ from paddle.fluid.framework import default_main_program, Program, convert_np_dty
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import numpy as np
-from test_imperative_base import new_program_scope


 class TestVariable(unittest.TestCase):
@@ -153,7 +152,7 @@ class TestVariableImperative(unittest.TestCase):
         self.assertEqual([1, 1, 100], nw.shape)

     def test_slice(self):
-        with fluid.imperative.guard():
+        with fluid.dygraph.guard():
             self._test_slice()
diff --git a/python/setup.py.in b/python/setup.py.in
index 9f87f5644fc969f3f55fd08689f3e2bbaf36dc39..68f96273a23c725d1643e8e7397bc970411dd191 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -102,7 +102,7 @@ packages=['paddle',
           'paddle.reader',
           'paddle.distributed',
           'paddle.fluid',
-          'paddle.fluid.imperative',
+          'paddle.fluid.dygraph',
           'paddle.fluid.proto',
           'paddle.fluid.proto.profiler',
           'paddle.fluid.distributed',
diff --git a/tools/print_signatures.py b/tools/print_signatures.py
index d32b247342cc0c37b7bcff7b676cb47a4f429dfd..6a262529b5cac7e596e65d23de6cc4b5d720cacb 100644
--- a/tools/print_signatures.py
+++ b/tools/print_signatures.py
@@ -28,7 +28,7 @@ import hashlib

 member_dict = collections.OrderedDict()

-experimental_namespace = {"paddle.fluid.imperative"}
+experimental_namespace = {"paddle.fluid.dygraph"}


 def md5(doc):
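Note (not part of the patch): tools/print_signatures.py only changes which namespace is skipped as experimental; the md5(doc) helper that follows it is what produces the per-API docstring digests recorded in API.spec. A hypothetical equivalent, assuming the digest is a plain MD5 hexdigest of the docstring text (the real function body lies outside this hunk):

    import hashlib

    def doc_digest(doc):
        # Hypothetical stand-in for md5(doc): hash the docstring text.
        hasher = hashlib.md5()
        hasher.update(str(doc).encode('utf-8'))
        return hasher.hexdigest()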