diff --git a/CMakeLists.txt b/CMakeLists.txt index 66dcef0013efb486b532f9ae17e9ae2040dc9e38..d6aa8f1b85c9c4c1a9ccd5b7d5f5607f9db39bc6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF) option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) +option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) @@ -261,6 +262,12 @@ if (WITH_PROFILER) add_definitions(-DWITH_GPERFTOOLS) endif() +if (WITH_JEMALLOC) + find_package(JeMalloc REQUIRED) + include_directories(${JEMALLOC_INCLUDE_DIR}) + add_definitions(-DWITH_JEMALLOC) +endif() + include(generic) # simplify cmake module include(package) # set paddle packages include(ccache) # set ccache for compilation @@ -290,7 +297,7 @@ if(WITH_PSLIB) list(APPEND EXTERNAL_LIBS pslib_brpc) list(APPEND EXTERNAL_LIBS libmct) endif(WITH_PSLIB) - + if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/cmake/FindJeMalloc.cmake b/cmake/FindJeMalloc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7911f77c4c35b5cf0fa47ff98282986eef974832 --- /dev/null +++ b/cmake/FindJeMalloc.cmake @@ -0,0 +1,21 @@ +# - Find JeMalloc library +# Find the native JeMalloc includes and library +# +# JEMALLOC_INCLUDE_DIR - where to find jemalloc.h, etc. +# JEMALLOC_LIBRARIES - List of libraries when using jemalloc. +# JEMALLOC_FOUND - True if jemalloc found. + +find_path(JEMALLOC_INCLUDE_DIR + NAMES jemalloc/jemalloc.h + HINTS ${JEMALLOC_ROOT_DIR}/include) + +find_library(JEMALLOC_LIBRARIES + NAMES jemalloc + HINTS ${JEMALLOC_ROOT_DIR}/lib) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(jemalloc DEFAULT_MSG JEMALLOC_LIBRARIES JEMALLOC_INCLUDE_DIR) + +mark_as_advanced( + JEMALLOC_LIBRARIES + JEMALLOC_INCLUDE_DIR) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 4ee2fdcf2db6bfa373f814ee4c0ab4d708486ea8..e3d856fb30d8103f50ebcb6dc16153c8ed2a97a6 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -134,6 +134,7 @@ if(WITH_GPU) message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF") set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE) endif() + add_definitions(-DWITH_ANAKIN) endif() if(WITH_ANAKIN) # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 5be7be64137be57f078739e5f287dd4bb0dcbd4f..10ecdf0ea873718a23ece8fa97faa3728652c188 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -2,7 +2,7 @@ if(NOT WITH_GPU) return() endif() -set(paddle_known_gpu_archs "30 35 50 52 60 61 70") +set(paddle_known_gpu_archs "30 35 50 52 60 61 70 75") set(paddle_known_gpu_archs7 "30 35 50 52") set(paddle_known_gpu_archs8 "30 35 50 52 60 61") @@ -59,7 +59,7 @@ endfunction() # select_nvcc_arch_flags(out_variable) function(select_nvcc_arch_flags out_variable) # List of arch names - set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual") + set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "All" "Manual") set(archs_name_default "All") if(NOT CMAKE_CROSSCOMPILING) list(APPEND archs_names "Auto") @@ -93,6 +93,8 @@ function(select_nvcc_arch_flags out_variable) set(cuda_arch_bin "60 61") elseif(${CUDA_ARCH_NAME} STREQUAL "Volta") set(cuda_arch_bin "70") + elseif(${CUDA_ARCH_NAME} STREQUAL "Turing") + set(cuda_arch_bin "75") elseif(${CUDA_ARCH_NAME} STREQUAL "All") set(cuda_arch_bin ${paddle_known_gpu_archs}) elseif(${CUDA_ARCH_NAME} STREQUAL "Auto") diff --git a/cmake/external/ngraph.cmake b/cmake/external/ngraph.cmake index 9da657b7d78f2287ae253b48c5e18d7eb43abbaa..799d9c309f329f5f10364d794a7964ec3b02eeb4 100644 --- a/cmake/external/ngraph.cmake +++ b/cmake/external/ngraph.cmake @@ -37,7 +37,7 @@ INCLUDE(GNUInstallDirs) INCLUDE(ExternalProject) SET(NGRAPH_PROJECT "extern_ngraph") -SET(NGRAPH_GIT_TAG "v0.10.1") +SET(NGRAPH_GIT_TAG "08851c2c45fcf9fa9c74871dd3dbc3fe38f37cc9") SET(NGRAPH_SOURCES_DIR ${THIRD_PARTY_PATH}/ngraph) SET(NGRAPH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/ngraph) SET(NGRAPH_INC_DIR ${NGRAPH_INSTALL_DIR}/include) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c6fe2e970d3e02985e3f2b8d5df6a7358beed514..4e31392b9898f7af3457b1a70a0ab5b8053f70c9 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -115,6 +115,10 @@ function(common_link TARGET_NAME) if (WITH_PROFILER) target_link_libraries(${TARGET_NAME} gperftools::profiler) endif() + + if (WITH_JEMALLOC) + target_link_libraries(${TARGET_NAME} ${JEMALLOC_LIBRARIES}) + endif() endfunction() @@ -228,7 +232,7 @@ function(merge_static_libs TARGET_NAME) # Get the file names of the libraries to be merged set(libfiles ${libfiles} $) endforeach() - # msvc will put libarary in directory of "/Release/xxxlib" by default + # msvc will put libarary in directory of "/Release/xxxlib" by default # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" add_custom_command(TARGET ${TARGET_NAME} POST_BUILD COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}" diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 2ef90bf481bf6a9b58a1dd2da8965782d68722df..a167511160d074c13ca1dca36b4f2c5eeea4bb93 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -184,7 +184,7 @@ endif() target_link_libraries(executor garbage_collector) cc_library(parallel_executor SRCS parallel_executor.cc DEPS - threaded_ssa_graph_executor scope_buffered_ssa_graph_executor + threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor variable_helper) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 63a68ba3a5c289be7c2d352717fe5911539df8a7..179aa145284ed62c2c96669499b277df45ea8066 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -77,6 +77,8 @@ cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUT cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) +cc_library(parallel_ssa_graph_executor SRCS parallel_ssa_graph_executor.cc DEPS threaded_ssa_graph_executor) + cc_test(broadcast_op_test SRCS broadcast_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory device_context broadcast_op_handle) cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index de7c845884d4922f7e277db3fab7deb92af5751c..a24e3d3e487e488f0d0c59809a0adc9f9524cc6e 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -19,6 +19,13 @@ #include "paddle/fluid/framework/details/variable_visitor.h" #include "paddle/fluid/platform/profiler.h" +// asynchronous nccl allreduce or synchronous issue: +// https://github.com/PaddlePaddle/Paddle/issues/15049 +DEFINE_bool( + sync_nccl_allreduce, false, + "If set true, will call `cudaStreamSynchronize(nccl_stream)`" + "after allreduce, this mode can get better performance in some scenarios."); + namespace paddle { namespace framework { namespace details { @@ -48,100 +55,104 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, void AllReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); -// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, -// this is a distributed or inter-process call, find a better way. + WaitInputVarGenerated(); + auto in_var_handles = DynamicCast(this->Inputs()); + auto out_var_handles = DynamicCast(this->Outputs()); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), places_.size(), + "The NoDummyInputSize should be equal to the number of places."); + PADDLE_ENFORCE_EQ( + in_var_handles.size(), out_var_handles.size(), + "The NoDummyInputSize and NoDummyOutputSize should be equal."); + + std::vector lod_tensors; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto *s = local_scopes_[i]; + auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); + auto &lod_tensor = + local_scope.FindVar(in_var_handles[i]->name_)->Get(); + lod_tensors.emplace_back(&lod_tensor); + PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, + "The name of input and output should be equal."); + } + + if (platform::is_gpu_place(lod_tensors[0]->place())) { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - if (NoDummyInputSize() == 1 && - local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) { -#else - if (NoDummyInputSize() == 1) { -#endif - return; // No need to all reduce when GPU count = 1; - } else { - // Wait input done - WaitInputVarGenerated(); - auto in_var_handles = DynamicCast(this->Inputs()); - auto out_var_handles = DynamicCast(this->Outputs()); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), places_.size(), - "The NoDummyInputSize should be equal to the number of places."); - PADDLE_ENFORCE_EQ( - in_var_handles.size(), out_var_handles.size(), - "The NoDummyInputSize and NoDummyOutputSize should be equal."); - - std::vector lod_tensors; + PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); + int dtype = -1; + size_t numel = 0; + std::vector> all_reduce_calls; for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto *s = local_scopes_[i]; - auto &local_scope = *s->FindVar(kLocalExecScopeName)->Get(); - auto &lod_tensor = - local_scope.FindVar(in_var_handles[i]->name_)->Get(); - lod_tensors.emplace_back(&lod_tensor); - PADDLE_ENFORCE_EQ(in_var_handles[i]->name_, out_var_handles[i]->name_, - "The name of input and output should be equal."); - } + auto &p = places_[i]; + auto &lod_tensor = *lod_tensors[i]; + void *buffer = const_cast(lod_tensor.data()); - if (platform::is_gpu_place(lod_tensors[0]->place())) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); - int dtype = -1; - size_t numel = 0; - std::vector> all_reduce_calls; - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &p = places_[i]; - auto &lod_tensor = *lod_tensors[i]; - void *buffer = const_cast(lod_tensor.data()); - - if (dtype == -1) { - dtype = platform::ToNCCLDataType(lod_tensor.type()); - } + if (dtype == -1) { + dtype = platform::ToNCCLDataType(lod_tensor.type()); + } + + if (numel == 0) { + numel = static_cast(lod_tensor.numel()); + } - if (numel == 0) { - numel = static_cast(lod_tensor.numel()); + int dev_id = boost::get(p).device; + auto &nccl_ctx = nccl_ctxs_->at(dev_id); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + buffer, buffer, numel, static_cast(dtype), ncclSum, + comm, stream)); + }); + } + + this->RunAndRecordEvent([&] { + if (all_reduce_calls.size() == 1UL) { + // Do not use NCCLGroup when manage NCCL by per thread per device + all_reduce_calls[0](); + } else { + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); } + } + }); + if (FLAGS_sync_nccl_allreduce) { + for (auto &p : places_) { int dev_id = boost::get(p).device; auto &nccl_ctx = nccl_ctxs_->at(dev_id); auto stream = nccl_ctx.stream(); - auto comm = nccl_ctx.comm_; - all_reduce_calls.emplace_back([=] { - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( - buffer, buffer, numel, static_cast(dtype), - ncclSum, comm, stream)); - }); + cudaStreamSynchronize(stream); } - this->RunAndRecordEvent([&] { - platform::NCCLGroupGuard guard; - for (auto &call : all_reduce_calls) { - call(); - } - }); + } + #else - PADDLE_THROW("Not compiled with CUDA"); + PADDLE_THROW("Not compiled with CUDA"); #endif - } else { // Special handle CPU only Operator's gradient. Like CRF - auto &trg = *this->local_scopes_[0] - ->FindVar(kLocalExecScopeName) - ->Get() - ->FindVar(out_var_handles[0]->name_) - ->GetMutable(); - - // Reduce All Tensor to trg in CPU - ReduceLoDTensor func(lod_tensors, &trg); - VisitDataType(lod_tensors[0]->type(), func); - - for (size_t i = 1; i < local_scopes_.size(); ++i) { - auto &scope = - *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); - auto &p = places_[i]; - auto *var = scope.FindVar(out_var_handles[i]->name_); - auto *dev_ctx = dev_ctxes_.at(p); - - RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { - auto &tensor_gpu = *var->GetMutable(); - auto &tensor_cpu = trg; - TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu); - }); - } + } else { // Special handle CPU only Operator's gradient. Like CRF + auto &trg = *this->local_scopes_[0] + ->FindVar(kLocalExecScopeName) + ->Get() + ->FindVar(out_var_handles[0]->name_) + ->GetMutable(); + + // Reduce All Tensor to trg in CPU + ReduceLoDTensor func(lod_tensors, &trg); + VisitDataType(lod_tensors[0]->type(), func); + + for (size_t i = 1; i < local_scopes_.size(); ++i) { + auto &scope = + *local_scopes_[i]->FindVar(kLocalExecScopeName)->Get(); + auto &p = places_[i]; + auto *var = scope.FindVar(out_var_handles[i]->name_); + auto *dev_ctx = dev_ctxes_.at(p); + + RunAndRecordEvent(p, [&trg, var, dev_ctx, p] { + auto &tensor_gpu = *var->GetMutable(); + auto &tensor_cpu = trg; + TensorCopy(tensor_cpu, p, *dev_ctx, &tensor_gpu); + }); } } } diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 7edbe596beee5d3daa754d863b844bd6b78cf45d..a68b69e0264e2f202dd41b56faf2f589118a3a53 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/details/memory_reuse_types.h" -#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" +#include "paddle/fluid/framework/details/multi_devices_graph_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h" @@ -31,7 +31,11 @@ namespace framework { namespace details { static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { - return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1); + // Should fix the allreduce op order if scheduling + // them in multiple threads or processes to avoid hang. + return (!strategy.enable_sequential_execution_ && + strategy.num_trainers_ > 1) || + strategy.enable_parallel_graph_; } class ParallelExecutorPassBuilder : public ir::PassBuilder { @@ -82,12 +86,8 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { if (strategy.memory_optimize_) { auto analysis_var_pass = AppendPass("analysis_var_pass"); } - // Convert graph to run on multi-devices. - auto multi_devices_pass = AppendPass("multi_devices_pass"); - multi_devices_pass->SetNotOwned("strategy", - &strategy_); - multi_devices_pass->Set("num_trainers", - new int(strategy_.num_trainers_)); + + AppendMultiDevPass(strategy); // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { @@ -113,6 +113,25 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } } + // Convert graph to run on multi-devices. + void AppendMultiDevPass(const BuildStrategy &strategy) { + ir::Pass *multi_devices_pass; + if (strategy_.is_distribution_) { + multi_devices_pass = AppendPass("dist_multi_devices_pass").get(); + } else { + if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + multi_devices_pass = + AppendPass("allreduce_mode_multi_devices_pass").get(); + } else if (strategy.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { + multi_devices_pass = AppendPass("reduce_mode_multi_devices_pass").get(); + } else { + PADDLE_THROW("Unknown reduce strategy."); + } + } + multi_devices_pass->SetNotOwned("strategy", + &strategy_); + } + private: BuildStrategy strategy_; }; @@ -129,9 +148,14 @@ std::shared_ptr BuildStrategy::CreatePassesFromStrategy( return pass_builder_; } +bool BuildStrategy::IsMultiDevPass(const std::string &pass_name) const { + return framework::details::MultiDevSSAGraphBuilder().count(pass_name) > 0; +} + std::unique_ptr BuildStrategy::Apply( const ProgramDesc &main_program, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, + const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { #else @@ -142,19 +166,23 @@ std::unique_ptr BuildStrategy::Apply( std::unique_ptr graph(new ir::Graph(main_program)); for (std::shared_ptr &pass : pass_builder_->AllPasses()) { - if (pass->Type() == "multi_devices_pass") { - pass->Erase("places"); - pass->SetNotOwned>("places", &places); - pass->Erase("loss_var_name"); - pass->SetNotOwned("loss_var_name", &loss_var_name); - pass->Erase("local_scopes"); - pass->SetNotOwned>("local_scopes", + if (IsMultiDevPass(pass->Type())) { + pass->Erase(kPlaces); + pass->SetNotOwned>(kPlaces, &places); + pass->Erase(kLossVarName); + pass->SetNotOwned(kLossVarName, &loss_var_name); + pass->Erase(kLocalScopes); + pass->SetNotOwned>(kLocalScopes, &local_scopes); + pass->Erase(kNRanks); + pass->Set(kNRanks, new size_t(nranks)); + #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif + } else if (pass->Type() == "analysis_var_pass") { const std::vector *all_op_descs = new std::vector(main_program.Block(0).AllOps()); @@ -195,7 +223,9 @@ std::unique_ptr BuildStrategy::Apply( USE_PASS(fuse_elewise_add_act_pass); USE_PASS(graph_viz_pass); USE_PASS(multi_batch_merge_pass); -USE_PASS(multi_devices_pass); +USE_PASS(reduce_mode_multi_devices_pass); +USE_PASS(allreduce_mode_multi_devices_pass); +USE_PASS(dist_multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); USE_PASS(analysis_var_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 11db184cb4efe349a340aceb4b7e1e3f4d4b24a5..15c2e01b6142571883c759efb1e26b609be9adb4 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -74,8 +74,6 @@ struct BuildStrategy { bool fuse_elewise_add_act_ops_{false}; - bool enable_data_balance_{false}; - bool memory_optimize_{false}; bool memory_early_delete_{false}; @@ -84,6 +82,10 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; + // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, + // num_trainers is 1, so the current fields of build_strategy doesn't tell if + // it's distributed model. + bool is_distribution_{false}; int num_trainers_{1}; int trainer_id_{0}; std::vector trainers_endpoints_; @@ -104,12 +106,15 @@ struct BuildStrategy { bool IsFinalized() const { return is_finalized_; } + bool IsMultiDevPass(const std::string &pass_name) const; + // Apply the passes built by the pass_builder_. The passes will be // applied to the Program and output an ir::Graph. std::unique_ptr Apply(const ProgramDesc &main_program, const std::vector &places, const std::string &loss_var_name, const std::vector &local_scopes, + const size_t &nranks, #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; @@ -117,6 +122,13 @@ struct BuildStrategy { const bool use_cuda) const; #endif + // If set true, ParallelExecutor would build the main_program into multiple + // graphs, + // each of the graphs would run with one device. This approach can achieve + // better performance + // on some scenarios. + mutable bool enable_parallel_graph_ = false; + private: mutable bool is_finalized_ = false; mutable std::shared_ptr pass_builder_; diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc index c8ea18804630fea4ada98062256730dbf4c24860..a4bb1e26d933946b7ca36196d1c0e8a0a4ec54e2 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_check_pass.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include +#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -21,68 +21,78 @@ namespace paddle { namespace framework { namespace details { -bool SSAGraghBuilderWithChecker::IsValidGraph(const ir::Graph *graph) const { - std::unordered_map pending_ops; - std::unordered_set pending_vars; - std::unordered_set ready_vars; - std::unordered_set ready_ops; +class SSAGraghBuilderWithChecker : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override { + PADDLE_ENFORCE(IsValidGraph(graph.get())); + return graph; + } - auto insert_pending_var = [&](VarHandleBase *var) { - pending_vars.insert(var); - if (var->GeneratedOp() == nullptr) { - ready_vars.emplace(var); - } - }; + bool IsValidGraph(const ir::Graph *graph) const { + std::unordered_map pending_ops; + std::unordered_set pending_vars; + std::unordered_set ready_vars; + std::unordered_set ready_ops; - for (auto &var_map : graph->Get(kGraphVars)) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - insert_pending_var(version_pair); + auto insert_pending_var = [&](VarHandleBase *var) { + pending_vars.insert(var); + if (var->GeneratedOp() == nullptr) { + ready_vars.emplace(var); } - } - } + }; - for (auto &var : graph->Get(kGraphDepVars)) { - insert_pending_var(var); - } + for (auto &var_map : graph->Get(kGraphVars)) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + insert_pending_var(version_pair); + } + } + } - for (OpHandleBase *op : ir::FilterByNodeWrapper(*graph)) { - if (op->Inputs().empty()) { - ready_ops.insert(op); - } else { - pending_ops.insert({op, op->NoDupInputSize()}); + for (auto &var : graph->Get(kGraphDepVars)) { + insert_pending_var(var); } - } - auto run_all_ops = [&](std::unordered_set &set) { - for (auto *op : set) { - for (auto out : op->Outputs()) { - ready_vars.emplace(out); + for (OpHandleBase *op : ir::FilterByNodeWrapper(*graph)) { + if (op->Inputs().empty()) { + ready_ops.insert(op); + } else { + pending_ops.insert({op, op->NoDupInputSize()}); } } - set.clear(); - }; - while (!pending_vars.empty()) { - run_all_ops(ready_ops); + auto run_all_ops = [&](std::unordered_set &set) { + for (auto *op : set) { + for (auto out : op->Outputs()) { + ready_vars.emplace(out); + } + } + set.clear(); + }; - if (ready_vars.empty()) { - return false; - } + while (!pending_vars.empty()) { + run_all_ops(ready_ops); - for (auto ready_var : ready_vars) { - pending_vars.erase(ready_var); - for (auto *op : ready_var->PendingOps()) { - auto &deps = --pending_ops[op]; - if (deps == 0) { - ready_ops.insert(op); + if (ready_vars.empty()) { + return false; + } + + for (auto ready_var : ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->PendingOps()) { + auto &deps = --pending_ops[op]; + if (deps == 0) { + ready_ops.insert(op); + } } } + ready_vars.clear(); } - ready_vars.clear(); + return true; } - return true; -} +}; + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_check_pass.h b/paddle/fluid/framework/details/multi_devices_graph_check_pass.h deleted file mode 100644 index 1e2b1867c376956d7d2dac465c13e2f3f64ba7eb..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/multi_devices_graph_check_pass.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/details/multi_devices_helper.h" - -#include - -namespace paddle { -namespace framework { -namespace details { - -class SSAGraghBuilderWithChecker : public ir::Pass { - protected: - std::unique_ptr ApplyImpl( - std::unique_ptr graph) const override { - PADDLE_ENFORCE(IsValidGraph(graph.get())); - return graph; - } - - bool IsValidGraph(const ir::Graph* graph) const; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 5b9a81811728b7e6c5314738920fd4b5e503ab5c..d91993bd4f8c04539cd189a4145350498911c513 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -134,15 +134,8 @@ void AddOutputToLeafOps(ir::Graph *graph) { } } // namespace -static const char kLossVarName[] = "loss_var_name"; -static const char kPlaces[] = "places"; -static const char kLocalScopes[] = "local_scopes"; -static const char kStrategy[] = "strategy"; -static const char kNumTrainers[] = "num_trainers"; - -void MultiDevSSAGraphBuilder::Init() const { +void MultiDevSSAGraphBuilderBase::Init() const { all_vars_.clear(); - balance_vars_.clear(); loss_var_name_ = Get(kLossVarName); places_ = Get>(kPlaces); @@ -151,31 +144,16 @@ void MultiDevSSAGraphBuilder::Init() const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_ = &Get("nccl_ctxs"); #endif - - balance_vars_.resize(places_.size(), 0); - - if (strategy_.enable_data_balance_ && places_.size() == 1) { - LOG(WARNING) << "It is no need to enable data balance when there is only " - "one place. enable_data_balance is set to False."; - strategy_.enable_data_balance_ = false; - } } -std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( +std::unique_ptr MultiDevSSAGraphBuilderBase::ApplyImpl( std::unique_ptr graph) const { Init(); - // Give the topology sort order and rebuild the graph structure. - std::vector sorted_ops = ir::TopologySortOperations(*graph); - - if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) { - sorted_ops = SortForReduceMode(sorted_ops); - } + std::vector sorted_ops = SortOperations(*graph); auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; - int num_trainers = Get(kNumTrainers); - for (auto &node : nodes) { if (node->IsVar() && node->Var()) { all_vars_.emplace(node->Name(), node->Var()); @@ -187,146 +165,61 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( result.Set(kGraphDepVars, new GraphDepVars); result.Set(kGraphOps, new GraphOps); - std::vector> bcast_var_name_set; - bcast_var_name_set.resize(places_.size()); - bool is_forwarding = true; - bool is_dist_train = false; - - std::unordered_map sharded_var_device; + bool insert_collection_ops = NeedCollectiveOps(); for (ir::Node *node : sorted_ops) { - if (OpHaveRole(*node, OpRole::kRPC)) { - int op_dev_id = CreateRPCOp(&result, node, &sharded_var_device); - PADDLE_ENFORCE(op_dev_id != -1, - "Can not schedule the RPC operator to the right place."); - if (node->Op()->Type() == "recv") { - auto recv_vars_attr = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] - if (recv_vars_attr[0].find(".block") == std::string::npos) { - bcast_var_name_set[op_dev_id].emplace(recv_vars_attr[0]); - } - } - is_dist_train = true; - } else if (OpHaveRole(*node, OpRole::kDist)) { - int op_dev_id = CreateDistTrainOp(&result, node, &sharded_var_device); - if (node->Op()->Type() == "concat") { - auto origin_param_name = node->Op()->OutputArgumentNames()[0]; - bcast_var_name_set[op_dev_id].emplace(origin_param_name); - } - } else if (IsScaleLossOp(node)) { - // user can customize loss@grad if not use_default_grad_scale_ - if (strategy_.gradient_scale_ != - BuildStrategy::GradientScaleStrategy::kCustomized) { - // TODO(paddle-dev): Why is there no input for this op_handle? - auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; - auto out_dtype = all_vars_.at(loss_grad_name)->GetDataType(); - CreateScaleLossGradOp(&result, loss_grad_name, node->outputs[0], - out_dtype); - } - // This assumes the backward generating code will ensure IsScaleLossOp - // is true only for the op that scale the final scalar loss. - // It also assumes backward op will always follow the forward op in - // the block. - is_forwarding = false; + if (DealWithSpecialOp(&result, node)) { + continue; } else { - int op_dev_id = GetOpDeviceID(node, sharded_var_device); - if (op_dev_id != -1) { // This op only runs on one specific device. - CreateComputationalOp(&result, node, op_dev_id); - for (ir::Node *n : node->outputs) { - sharded_var_device.emplace(n->Name(), op_dev_id); - } + // This op runs on all devices + if (IsScaleLossOp(node)) { + // user can customize loss@grad if not use_default_grad_scale_ + InsertScaleLossGradOp(&result, node); + // This assumes the backward generating code will ensure IsScaleLossOp + // is true only for the op that scale the final scalar loss. + // It also assumes backward op will always follow the forward op in + // the block. + is_forwarding = false; } else { - // This op runs on all devices, and its output may have parameter's - // gradients. - // TODO(paddle-dev): Why is so special about "read" op? - if (node->Op()->Type() == "read" && strategy_.enable_data_balance_) { - node->Op()->SetAttr("throw_eof_exp", false); - CreateComputationalOps(&result, node, places_.size()); - const auto &data_var_names = node->Op()->Output("Out"); - InsertDataBalanceOp(&result, data_var_names); - } else { - CreateComputationalOps(&result, node, places_.size()); - } + CreateComputationalOps(&result, node, places_.size()); + } - if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { + // Insert collection ops + if (!is_forwarding && insert_collection_ops) { + try { bool is_bk_op = static_cast(boost::get(node->Op()->GetAttr( OpProtoAndCheckerMaker::OpRoleAttrName())) & static_cast(OpRole::kBackward)); if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. - try { - auto backward_vars = boost::get>( - node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); - - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &p_name = backward_vars[i]; - auto &g_name = backward_vars[i + 1]; - VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; - size_t cur_device_id = -1; - switch (strategy_.reduce_) { - case BuildStrategy::ReduceStrategy::kReduce: - cur_device_id = GetAppropriateDeviceID({g_name}); - CreateReduceOp(&result, g_name, cur_device_id); - sharded_var_device.emplace(g_name, cur_device_id); - if (!is_dist_train) { - bcast_var_name_set[cur_device_id].emplace(p_name); - } - break; - case BuildStrategy::ReduceStrategy::kAllReduce: - if (IsSparseGradient(g_name)) { - CreateReduceOp(&result, g_name, 0); - CreateBroadcastOp(&result, g_name, 0); - } else { - InsertAllReduceOp(&result, g_name); - } - break; - default: - LOG(FATAL) << "Unknown reduce strategy "; - break; - } - } - } catch (boost::bad_get e) { + auto backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &p_name = backward_vars[i]; + auto &g_name = backward_vars[i + 1]; + VLOG(10) << "Bcast " << g_name << " for parameter " << p_name; + + InsertCollectiveOp(&result, p_name, g_name); } + } catch (boost::bad_get e) { } } } } - bool use_gpu = false; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - use_gpu = nccl_ctxs_ != nullptr; -#endif - // Insert broadcast operators principle: - // 1. Broadcast optimized parameters in Reduce strategy; - // 2. No need broadcast optimized parameters in AllReduce strategy because of - // the optimization sub-graph would be run on every GPU; - // 3. Allways broadcast received parameters in Distribute Training. - if ((use_gpu && - strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce) || - is_dist_train) { - if (strategy_.fuse_broadcast_op_) { - CreateFusedBroadcastOp(&result, bcast_var_name_set); - } else { - for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) { - auto &to_bcast_set = bcast_var_name_set[dev_id]; - for (auto &bcast_name : to_bcast_set) { - CreateBroadcastOp(&result, bcast_name, dev_id); - } - } - } - } + InsertPostprocessOps(&result); + /* Dependency graph has been constructed. However, there are still data hazards need to be handled. - */ + */ PolishGraphToSupportDataHazards(&result); /* @@ -337,67 +230,54 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( return graph; } -std::vector MultiDevSSAGraphBuilder::SortForReduceMode( - const std::vector &topo_ops) const { - std::unordered_map sharded_var_device; - std::vector sorted_ops; - std::unordered_map> delayed_op; - sorted_ops.reserve(topo_ops.size()); - - auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { - sharded_var_device.emplace(var_name, dev_id); - if (delayed_op.count(var_name)) { - auto &ops = delayed_op.at(var_name); - sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); - delayed_op.at(var_name).clear(); - } - }; +void MultiDevSSAGraphBuilderBase::InsertScaleLossGradOp( + ir::Graph *result, const ir::Node *node) const { + // user can customize loss@grad if not use_default_grad_scale_ + size_t loss_scale = 0; + switch (this->strategy_.gradient_scale_) { + case BuildStrategy::GradientScaleStrategy::kOne: + loss_scale = 1; + break; + case BuildStrategy::GradientScaleStrategy::kCoeffNumDevice: + loss_scale = Get(kNRanks); + break; + case BuildStrategy::GradientScaleStrategy::kCustomized: + loss_scale = 0; + break; + default: + LOG(FATAL) << "Unknown gradient scale strategy."; + break; + } + + if (loss_scale) { + // TODO(paddle-dev): Why is there no input for this op_handle? + auto loss_grad_name = node->Op()->OutputArgumentNames()[0]; + auto out_dtype = this->all_vars_.at(loss_grad_name)->GetDataType(); + this->CreateScaleLossGradOp(result, loss_grad_name, node->outputs[0], + loss_scale, out_dtype); + } +} - for (ir::Node *node : topo_ops) { - int op_dev_id = GetOpDeviceID(node, sharded_var_device, &delayed_op); - if (op_dev_id > -1) { - // This op only runs on one specific device. - sorted_ops.emplace_back(node); - for (ir::Node *n : node->outputs) { - insert_delayed_op(n->Name(), op_dev_id); - } - } else if (op_dev_id == -1) { - // This op runs on all devices, and its output may have parameter's - // gradients. - sorted_ops.emplace_back(node); - bool is_bk_op = - static_cast(boost::get(node->Op()->GetAttr( - OpProtoAndCheckerMaker::OpRoleAttrName())) & - static_cast(OpRole::kBackward)); - if (!is_bk_op) continue; - // Currently, we assume that once gradient is generated, it can be - // broadcast, and each gradient is only broadcast once. - std::vector backward_vars; - try { - backward_vars = - boost::get>(node->Op()->GetNullableAttr( - OpProtoAndCheckerMaker::OpRoleVarAttrName())); - } catch (boost::bad_get e) { - } - PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); +std::vector MultiDevSSAGraphBuilderBase::SortOperations( + const ir::Graph &graph) const { + return ir::TopologySortOperations(graph); +} - for (size_t i = 0; i < backward_vars.size(); i += 2) { - auto &g_name = backward_vars[i + 1]; - size_t cur_device_id = GetAppropriateDeviceID({g_name}); - insert_delayed_op(g_name, static_cast(cur_device_id)); - } - } else if (op_dev_id == -2) { - // The Op on which the Op depends has not yet been generated. - } - } +bool MultiDevSSAGraphBuilderBase::UseGPU() const { + bool use_gpu = false; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + use_gpu = nccl_ctxs_ != nullptr; +#endif + return use_gpu; +} - PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); - return sorted_ops; +bool MultiDevSSAGraphBuilderBase::NeedCollectiveOps() const { + return Get(kNRanks) > 1; } -void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, - ir::Node *node, - size_t place_id) const { +void MultiDevSSAGraphBuilderBase::CreateOpHandleIOs(ir::Graph *result, + ir::Node *node, + size_t place_id) const { auto p = places_[place_id]; auto *op_handle = result->Get(kGraphOps).back(); op_handle->SetDeviceContext(p, @@ -420,28 +300,7 @@ void MultiDevSSAGraphBuilder::CreateOpHandleIOs(ir::Graph *result, } } -size_t MultiDevSSAGraphBuilder::GetAppropriateDeviceID( - const std::vector &var_names) const { - int64_t numel_sum = 0; - for (auto var_name : var_names) { - if (all_vars_.find(var_name) == all_vars_.end()) continue; - auto var_desc = all_vars_.at(var_name); - PADDLE_ENFORCE_NOT_NULL(var_desc); - auto dim = framework::make_ddim(var_desc->GetShape()); - int64_t numel = framework::product(dim); - PADDLE_ENFORCE_GT(numel, 0); - numel_sum += numel; - } - - auto smallest = - std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); - size_t dev_id = - static_cast(std::distance(std::begin(balance_vars_), smallest)); - balance_vars_[dev_id] += numel_sum; - return dev_id; -} - -void MultiDevSSAGraphBuilder::SetCommunicationContext( +void MultiDevSSAGraphBuilderBase::SetCommunicationContext( OpHandleBase *op_handle, const platform::Place &p) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_ == nullptr) { @@ -454,9 +313,9 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( #endif } -void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, - const std::string &p_name, - size_t src_dev_id) const { +void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, + const std::string &p_name, + size_t src_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), @@ -484,7 +343,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, } } -void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( +void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) @@ -522,17 +381,17 @@ void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( } } -void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, - ir::Node *node, - int dev_id) const { +void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, + ir::Node *node, + int dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), local_scopes_[dev_id], places_[dev_id], dev_id)); CreateOpHandleIOs(result, node, dev_id); } -void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, - const std::string &og) const { +void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( + ir::Graph *result, const std::string &og) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), @@ -560,101 +419,15 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, } } -void MultiDevSSAGraphBuilder::InsertDataBalanceOp( - ir::Graph *result, const std::vector &datas) const { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( - result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), - local_scopes_, places_, nccl_ctxs_)); -#else - result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( - result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), - local_scopes_, places_)); -#endif - auto *op_handle = result->Get(kGraphOps).back(); - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - SetCommunicationContext(op_handle, p); - for (const std::string &d_name : datas) { - auto &vars = result->Get(kGraphVars)[i][d_name]; - PADDLE_ENFORCE(!vars.empty()); - op_handle->AddInput(vars.back()); - auto var = new VarHandle( - result->CreateEmptyNode(d_name, ir::Node::Type::kVariable), - vars.size(), i, d_name, p); - vars.emplace_back(var); - op_handle->AddOutput(var); - } - } -} - -int MultiDevSSAGraphBuilder::GetOpDeviceID( - ir::Node *node, - const std::unordered_map &sharded_var_device, - std::unordered_map> *delay_ops) const { - if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { - return -1; - } - - if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { - return -1; - } - - auto param_grad = boost::get>( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); - - if (dev_id == -1) { - (*delay_ops)[param_grad[1]].push_back(node); - return -2; - } - return dev_id; -} - -int MultiDevSSAGraphBuilder::GetOpDeviceID( - ir::Node *node, - const std::unordered_map &sharded_var_device) const { - if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { - return -1; - } - - if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { - return -1; - } - auto param_grad = boost::get>( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); - - PADDLE_ENFORCE_EQ(param_grad.size(), 2U); - int dev_id = GetVarDeviceID(param_grad[1], sharded_var_device); - PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", - node->Op()->Type(), param_grad[0], param_grad[1]); - return dev_id; -} - -int MultiDevSSAGraphBuilder::GetVarDeviceID( - const std::string &varname, - const std::unordered_map &sharded_var_device) const { - auto got = sharded_var_device.find(varname); - if (got == sharded_var_device.end()) { - auto pos = varname.find(framework::kNewGradSuffix); - if (pos != std::string::npos) { - got = sharded_var_device.find(varname.substr(0, pos)); - } - } - return got == sharded_var_device.end() ? -1 : got->second; -} - -void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( +void MultiDevSSAGraphBuilderBase::CreateScaleLossGradOp( ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node, proto::VarType::Type dtype) const { + ir::Node *out_var_node, size_t loss_scale, + proto::VarType::Type dtype) const { for (size_t i = 0; i < places_.size(); ++i) { - // Insert ScaleCost OpHandle auto *dev_ctx = platform::DeviceContextPool::Instance().Get(places_[i]); auto *op_handle = new ScaleLossGradOpHandle( result->CreateEmptyNode("scale_loss_grad", ir::Node::Type::kOperation), - local_scopes_.size(), local_scopes_[i], places_[i], dev_ctx, dtype); + loss_scale, local_scopes_[i], places_[i], dev_ctx, dtype); result->Get(kGraphOps).emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale @@ -668,9 +441,8 @@ void MultiDevSSAGraphBuilder::CreateScaleLossGradOp( } } -void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, - ir::Node *node, - size_t num_places) const { +void MultiDevSSAGraphBuilderBase::CreateComputationalOps( + ir::Graph *result, ir::Node *node, size_t num_places) const { for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; @@ -680,9 +452,9 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, } } -VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, - const std::string &og, - int dst_dev_id) const { +VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(ir::Graph *result, + const std::string &og, + int dst_dev_id) const { #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), @@ -711,51 +483,273 @@ VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, return var; } -int MultiDevSSAGraphBuilder::CreateDistTrainOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const { - int op_dev_id = -1; - std::vector input_var_names; - std::vector output_var_names; - for (ir::Node *input : node->inputs) { - input_var_names.push_back(input->Name()); +bool MultiDevSSAGraphBuilderBase::IsScaleLossOp(ir::Node *node) const { + return boost::get( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == + (static_cast(OpRole::kBackward) | + static_cast(OpRole::kLoss)) && + !loss_var_name_.empty(); // If loss_var is empty. This is test mode +} + +bool MultiDevSSAGraphBuilderBase::IsSparseGradient( + const std::string &og) const { + PADDLE_ENFORCE(all_vars_.count(og) != 0); + if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { + return true; } - for (ir::Node *output : node->outputs) { - output_var_names.push_back(output->Name()); + return false; +} + +void AllReduceSSAGraphBuilder::InsertCollectiveOp( + ir::Graph *result, const std::string &p_name, + const std::string &g_name) const { + if (IsSparseGradient(g_name)) { + CreateReduceOp(result, g_name, 0); + CreateBroadcastOp(result, g_name, 0); + } else { + CreateAllReduceOp(result, g_name); } +} - if (node->Op()->Type() == "split_byref" || - node->Op()->Type() == "split_selected_rows" || - node->Op()->Type() == "split_ids") { - // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); - if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { - op_dev_id = GetAppropriateDeviceID(input_var_names); - for (auto &varname : input_var_names) { - sharded_var_device->emplace(varname, op_dev_id); +int BalanceVarSSAGraphBuilder::GetVarDeviceID( + const std::string &varname) const { + auto got = sharded_var_device_.find(varname); + if (got == sharded_var_device_.end()) { + auto pos = varname.find(framework::kNewGradSuffix); + if (pos != std::string::npos) { + got = sharded_var_device_.find(varname.substr(0, pos)); + } + } + return got == sharded_var_device_.end() ? -1 : got->second; +} + +int BalanceVarSSAGraphBuilder::GetOpDeviceID(ir::Node *node) const { + if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) { + return -1; + } + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s, %s]", + node->Op()->Type(), param_grad[0], param_grad[1]); + return dev_id; +} + +size_t BalanceVarSSAGraphBuilder::GetAppropriateDeviceID( + const std::vector &var_names) const { + int64_t numel_sum = 0; + for (auto var_name : var_names) { + if (all_vars_.find(var_name) == all_vars_.end()) continue; + auto var_desc = all_vars_.at(var_name); + PADDLE_ENFORCE_NOT_NULL(var_desc); + auto dim = framework::make_ddim(var_desc->GetShape()); + int64_t numel = framework::product(dim); + PADDLE_ENFORCE_GT(numel, 0); + numel_sum += numel; + } + + auto smallest = + std::min_element(std::begin(balance_vars_), std::end(balance_vars_)); + size_t dev_id = + static_cast(std::distance(std::begin(balance_vars_), smallest)); + balance_vars_[dev_id] += numel_sum; + return dev_id; +} + +void BalanceVarSSAGraphBuilder::ResetState() const { + balance_vars_.clear(); + sharded_var_device_.clear(); + + balance_vars_.resize(places_.size(), 0); +} + +void ReduceSSAGraphBuilder::Init() const { + MultiDevSSAGraphBuilderBase::Init(); + ResetState(); +} + +void ReduceSSAGraphBuilder::ResetState() const { + BalanceVarSSAGraphBuilder::ResetState(); + bcast_var_name_set_.clear(); + bcast_var_name_set_.resize(places_.size()); +} + +void ReduceSSAGraphBuilder::InsertCollectiveOp( + ir::Graph *result, const std::string &p_name, + const std::string &g_name) const { + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(result, g_name, cur_device_id); + sharded_var_device_.emplace(g_name, cur_device_id); + bcast_var_name_set_[cur_device_id].emplace(p_name); +} + +bool ReduceSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = BalanceVarSSAGraphBuilder::GetOpDeviceID(node); + if (op_dev_id != -1) { + // This op only runs on one specific device. + CreateComputationalOp(result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + sharded_var_device_.emplace(n->Name(), op_dev_id); + } + return true; + } + return false; +} + +void ReduceSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { + if (UseGPU()) { + if (strategy_.fuse_broadcast_op_) { + CreateFusedBroadcastOp(result, bcast_var_name_set_); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set_[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(result, bcast_name, dev_id); + } } } - for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + } +} + +int ReduceSSAGraphBuilder::GetOpDeviceID( + ir::Node *node, + std::unordered_map> *delay_ops) const { + if (!OpHaveRole(*node, framework::OpRole::kOptimize)) { + return -1; + } + + auto param_grad = boost::get>( + node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); + + PADDLE_ENFORCE_EQ(param_grad.size(), 2U); + int dev_id = GetVarDeviceID(param_grad[1]); + + if (dev_id == -1) { + (*delay_ops)[param_grad[1]].push_back(node); + return -2; + } + return dev_id; +} + +std::vector ReduceSSAGraphBuilder::SortOperations( + const ir::Graph &graph) const { + std::vector sorted_ops = ir::TopologySortOperations(graph); + return SortForReduceMode(sorted_ops); +} + +std::vector ReduceSSAGraphBuilder::SortForReduceMode( + const std::vector &topo_ops) const { + std::vector sorted_ops; + std::unordered_map> delayed_op; + sorted_ops.reserve(topo_ops.size()); + ResetState(); + + auto insert_delayed_op = [&](const std::string &var_name, int dev_id) { + sharded_var_device_.emplace(var_name, dev_id); + if (delayed_op.count(var_name)) { + auto &ops = delayed_op.at(var_name); + sorted_ops.insert(sorted_ops.end(), ops.begin(), ops.end()); + delayed_op.at(var_name).clear(); } - } else if (node->Op()->Type() == "concat") { - op_dev_id = GetVarDeviceID(input_var_names[0], *sharded_var_device); - for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + }; + + for (ir::Node *node : topo_ops) { + int op_dev_id = GetOpDeviceID(node, &delayed_op); + if (op_dev_id > -1) { + // This op only runs on one specific device. + sorted_ops.emplace_back(node); + for (ir::Node *n : node->outputs) { + insert_delayed_op(n->Name(), op_dev_id); + } + } else if (op_dev_id == -1) { + // This op runs on all devices, and its output may have parameter's + // gradients. + sorted_ops.emplace_back(node); + bool is_bk_op = + static_cast(boost::get(node->Op()->GetAttr( + OpProtoAndCheckerMaker::OpRoleAttrName())) & + static_cast(OpRole::kBackward)); + if (!is_bk_op) continue; + // Currently, we assume that once gradient is generated, it can be + // broadcast, and each gradient is only broadcast once. + std::vector backward_vars; + try { + backward_vars = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + } catch (boost::bad_get e) { + } + PADDLE_ENFORCE_EQ(backward_vars.size() % 2, 0); + + for (size_t i = 0; i < backward_vars.size(); i += 2) { + auto &g_name = backward_vars[i + 1]; + size_t cur_device_id = GetAppropriateDeviceID({g_name}); + insert_delayed_op(g_name, static_cast(cur_device_id)); + } + } else if (op_dev_id == -2) { + // The Op on which the Op depends has not yet been generated. } - } else { - LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); - PADDLE_THROW( - "the distribute training related op should be in [split_byref, " - "concat]."); } - PADDLE_ENFORCE(op_dev_id != -1, - "can not find right place for distributed op: %s", - node->Op()->Type()); + PADDLE_ENFORCE_EQ(sorted_ops.size(), topo_ops.size()); - CreateComputationalOp(result, node, op_dev_id); - return op_dev_id; + ResetState(); + return sorted_ops; +} + +void DistSSAGraphBuilder::Init() const { + MultiDevSSAGraphBuilderBase::Init(); + ResetState(); +} + +void DistSSAGraphBuilder::ResetState() const { + BalanceVarSSAGraphBuilder::ResetState(); + bcast_var_name_set_.clear(); + bcast_var_name_set_.resize(places_.size()); +} + +bool DistSSAGraphBuilder::DealWithSpecialOp(ir::Graph *result, + ir::Node *node) const { + bool insert_op = false; + if (OpHaveRole(*node, OpRole::kRPC)) { + int op_dev_id = CreateRPCOp(result, node); + PADDLE_ENFORCE(op_dev_id != -1, + "Can not schedule the RPC operator to the right place."); + if (node->Op()->Type() == "recv") { + auto recv_vars_attr = + boost::get>(node->Op()->GetNullableAttr( + OpProtoAndCheckerMaker::OpRoleVarAttrName())); + PADDLE_ENFORCE(recv_vars_attr.size() == 2UL); // [parameter, gradient] + if (recv_vars_attr[0].find(".block") == std::string::npos) { + bcast_var_name_set_[op_dev_id].emplace(recv_vars_attr[0]); + } + } + insert_op = true; + need_broadcast_var_ = true; + } else if (OpHaveRole(*node, OpRole::kDist)) { + int op_dev_id = CreateDistTrainOp(result, node); + if (node->Op()->Type() == "concat") { + auto origin_param_name = node->Op()->OutputArgumentNames()[0]; + bcast_var_name_set_[op_dev_id].emplace(origin_param_name); + } + insert_op = true; + } else { + int op_dev_id = GetOpDeviceID(node); + if (op_dev_id != -1) { // This op only runs on one specific device. + CreateComputationalOp(result, node, op_dev_id); + for (ir::Node *n : node->outputs) { + sharded_var_device_.emplace(n->Name(), op_dev_id); + } + insert_op = true; + } + } + return insert_op; } void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { @@ -774,13 +768,11 @@ void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) { } // Create RPC related op handles that connects its in ops and out ops. -int MultiDevSSAGraphBuilder::CreateRPCOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const { +int DistSSAGraphBuilder::CreateRPCOp(ir::Graph *result, ir::Node *node) const { int op_dev_id = -1; if (node->Op()->Type() == "send") { // TODO(paddle-dev): getting the first var is not safe. - op_dev_id = GetVarDeviceID(node->inputs[0]->Name(), *sharded_var_device); + op_dev_id = GetVarDeviceID(node->inputs[0]->Name()); PADDLE_ENFORCE(!ir::IsControlDepVar(*node->inputs[0]), "This hack no longer holds, please fix."); // the variable name which contains .block means it was splited by @@ -798,9 +790,9 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( VLOG(10) << "send grad " << input_var_names[0] << " origin " << send_param_grad[1] << " place: " << op_dev_id; for (auto &varname : input_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + sharded_var_device_.emplace(varname, op_dev_id); } - sharded_var_device->emplace(send_param_grad[1], op_dev_id); + sharded_var_device_.emplace(send_param_grad[1], op_dev_id); } } else if (node->Op()->Type() == "recv") { std::vector output_var_names; @@ -810,7 +802,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( auto recv_param_grad = boost::get>( node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName())); if (recv_param_grad.size() == 2U) { - op_dev_id = GetVarDeviceID(recv_param_grad[1], *sharded_var_device); + op_dev_id = GetVarDeviceID(recv_param_grad[1]); VLOG(10) << "recv param " << recv_param_grad[0] << " get grad place: " << recv_param_grad[1] << " place: " << op_dev_id; @@ -818,7 +810,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( op_dev_id = GetAppropriateDeviceID(output_var_names); } for (auto &varname : output_var_names) { - sharded_var_device->emplace(varname, op_dev_id); + sharded_var_device_.emplace(varname, op_dev_id); } } else { // send_barrier, fetch_barrier will run on place 0; @@ -845,7 +837,7 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( for (ir::Node *output : node->outputs) { int outvar_dev_id = op_dev_id; if (node->Op()->Type() == "fetch_barrier") { - outvar_dev_id = GetVarDeviceID(output->Name(), *sharded_var_device); + outvar_dev_id = GetVarDeviceID(output->Name()); PADDLE_ENFORCE_NE(outvar_dev_id, -1, "output name %s", output->Name()); } p = places_[outvar_dev_id]; @@ -862,29 +854,124 @@ int MultiDevSSAGraphBuilder::CreateRPCOp( return op_dev_id; } -bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { - PADDLE_ENFORCE(all_vars_.count(og) != 0); - if (all_vars_.at(og)->GetType() == proto::VarType::SELECTED_ROWS) { - return true; +int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result, + ir::Node *node) const { + int op_dev_id = -1; + std::vector input_var_names; + std::vector output_var_names; + for (ir::Node *input : node->inputs) { + input_var_names.push_back(input->Name()); } - return false; + for (ir::Node *output : node->outputs) { + output_var_names.push_back(output->Name()); + } + + if (node->Op()->Type() == "split_byref" || + node->Op()->Type() == "split_selected_rows" || + node->Op()->Type() == "split_ids") { + // TODO(paddle-dev): getting the first var is not safe. + op_dev_id = GetVarDeviceID(input_var_names[0]); + if (strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) { + op_dev_id = GetAppropriateDeviceID(input_var_names); + for (auto &varname : input_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else if (node->Op()->Type() == "concat") { + op_dev_id = GetVarDeviceID(input_var_names[0]); + for (auto &varname : output_var_names) { + sharded_var_device_.emplace(varname, op_dev_id); + } + } else { + LOG(ERROR) << "got unexpected dist op: " << node->Op()->Type(); + PADDLE_THROW( + "the distribute training related op should be in [split_byref, " + "concat]."); + } + + PADDLE_ENFORCE(op_dev_id != -1, + "can not find right place for distributed op: %s", + node->Op()->Type()); + + CreateComputationalOp(result, node, op_dev_id); + return op_dev_id; } -bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const { - return boost::get( - node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleAttrName())) == - (static_cast(OpRole::kBackward) | - static_cast(OpRole::kLoss)) && - !loss_var_name_.empty(); // If loss_var is empty. This is test mode +void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result, + const std::string &p_name, + const std::string &g_name) const { + size_t cur_device_id = 0; + switch (strategy_.reduce_) { + case BuildStrategy::ReduceStrategy::kReduce: + cur_device_id = GetAppropriateDeviceID({g_name}); + CreateReduceOp(result, g_name, cur_device_id); + sharded_var_device_.emplace(g_name, cur_device_id); + break; + case BuildStrategy::ReduceStrategy::kAllReduce: + if (IsSparseGradient(g_name)) { + CreateReduceOp(result, g_name, 0); + CreateBroadcastOp(result, g_name, 0); + } else { + CreateAllReduceOp(result, g_name); + } + break; + default: + LOG(FATAL) << "Unknown reduce strategy."; + break; + } +} + +void DistSSAGraphBuilder::InsertPostprocessOps(ir::Graph *result) const { + if (need_broadcast_var_ || + (UseGPU() && + strategy_.reduce_ == BuildStrategy::ReduceStrategy::kReduce)) { + if (strategy_.fuse_broadcast_op_) { + CreateFusedBroadcastOp(result, bcast_var_name_set_); + } else { + for (size_t dev_id = 0; dev_id < bcast_var_name_set_.size(); ++dev_id) { + auto &to_bcast_set = bcast_var_name_set_[dev_id]; + for (auto &bcast_name : to_bcast_set) { + CreateBroadcastOp(result, bcast_name, dev_id); + } + } + } + } +} + +std::unordered_set &MultiDevSSAGraphBuilder() { + static std::unordered_set regs; + return regs; } + +static int MultiDevSSAGraphBuilderRegister(const std::string &builder_mode) { + MultiDevSSAGraphBuilder().insert(builder_mode); + return 0; +} + } // namespace details } // namespace framework } // namespace paddle -REGISTER_PASS(multi_devices_pass, - paddle::framework::details::MultiDevSSAGraphBuilder) - .RequirePassAttr(paddle::framework::details::kLossVarName) - .RequirePassAttr(paddle::framework::details::kPlaces) - .RequirePassAttr(paddle::framework::details::kLocalScopes) - .RequirePassAttr(paddle::framework::details::kStrategy) - .RequirePassAttr(paddle::framework::details::kNumTrainers); +#define REGISTER_MULTI_DEVICES_PASS(pass_name, pass_class) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + _reg_ssa_graph_builder_##pass_name, \ + "REGISTER_MULTI_DEVICES_PASS must be called in global namespace."); \ + int _reg_ssa_graph_builder_entry_##pass_name = \ + paddle::framework::details::MultiDevSSAGraphBuilderRegister(#pass_name); \ + REGISTER_PASS(pass_name, pass_class) \ + .RequirePassAttr(paddle::framework::details::kLossVarName) \ + .RequirePassAttr(paddle::framework::details::kPlaces) \ + .RequirePassAttr(paddle::framework::details::kLocalScopes) \ + .RequirePassAttr(paddle::framework::details::kStrategy) \ + .RequirePassAttr(paddle::framework::details::kNRanks) + +REGISTER_MULTI_DEVICES_PASS(reduce_mode_multi_devices_pass, + paddle::framework::details::ReduceSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS( + allreduce_mode_multi_devices_pass, + paddle::framework::details::AllReduceSSAGraphBuilder); +REGISTER_MULTI_DEVICES_PASS(dist_multi_devices_pass, + paddle::framework::details::DistSSAGraphBuilder); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index 7029e9dc18cbacf0c5f0d7c6430d84fb72d6a0a3..6d4386538ea7d0cc318647c92282af9d598fa699 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -13,6 +13,7 @@ // limitations under the License. #pragma once + #include #include #include @@ -30,78 +31,70 @@ namespace framework { class Scope; namespace details { -class MultiDevSSAGraphBuilder : public ir::Pass { +constexpr char kLossVarName[] = "loss_var_name"; +constexpr char kPlaces[] = "places"; +constexpr char kLocalScopes[] = "local_scopes"; +constexpr char kStrategy[] = "strategy"; +constexpr char kNRanks[] = "nranks"; + +class MultiDevSSAGraphBuilderBase : public ir::Pass { protected: std::unique_ptr ApplyImpl( std::unique_ptr graph) const override; - private: - void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, - size_t device_id) const; - void Init() const; + virtual void Init() const; -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - mutable platform::NCCLContextMap *nccl_ctxs_; -#endif + virtual std::vector SortOperations(const ir::Graph &graph) const; - int GetVarDeviceID( - const std::string &varname, - const std::unordered_map &sharded_var_device) const; + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const = 0; - bool IsScaleLossOp(ir::Node *node) const; + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const = 0; + + virtual void InsertPostprocessOps(ir::Graph *result) const = 0; - int CreateRPCOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const; - int CreateDistTrainOp( - ir::Graph *result, ir::Node *node, - std::unordered_map *sharded_var_device) const; + bool UseGPU() const; + + bool NeedCollectiveOps() const; + + bool IsScaleLossOp(ir::Node *node) const; void CreateComputationalOps(ir::Graph *result, ir::Node *node, size_t num_places) const; void CreateScaleLossGradOp(ir::Graph *result, const std::string &loss_grad_name, - ir::Node *out_var_node, + ir::Node *out_var_node, size_t loss_scale, proto::VarType::Type dtype) const; VarHandle *CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const; + void CreateComputationalOp(ir::Graph *result, ir::Node *node, int dev_id) const; - int GetOpDeviceID( - ir::Node *node, - const std::unordered_map &sharded_var_device) const; - - void InsertAllReduceOp(ir::Graph *result, const std::string &og) const; + bool IsSparseGradient(const std::string &og) const; - void InsertDataBalanceOp(ir::Graph *result, - const std::vector &datas) const; + void CreateAllReduceOp(ir::Graph *result, const std::string &og) const; void CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const; + void InsertScaleLossGradOp(ir::Graph *result, const ir::Node *node) const; + void CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const; - bool IsSparseGradient(const std::string &og) const; - - size_t GetAppropriateDeviceID( - const std::vector &var_names) const; - void SetCommunicationContext(OpHandleBase *op_handle, const platform::Place &p) const; - std::vector SortForReduceMode( - const std::vector &) const; + void CreateOpHandleIOs(ir::Graph *result, ir::Node *node, + size_t device_id) const; - int GetOpDeviceID( - ir::Node *node, - const std::unordered_map &shared_var_device, - std::unordered_map> *delay_ops) - const; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + mutable platform::NCCLContextMap *nccl_ctxs_; +#endif mutable std::string loss_var_name_; mutable std::vector places_; @@ -109,8 +102,83 @@ class MultiDevSSAGraphBuilder : public ir::Pass { mutable BuildStrategy strategy_; mutable std::unordered_map all_vars_; +}; + +class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const { + return false; + } + + virtual void InsertPostprocessOps(ir::Graph *result) const {} +}; + +class BalanceVarSSAGraphBuilder : public MultiDevSSAGraphBuilderBase { + protected: + int GetVarDeviceID(const std::string &varname) const; + + int GetOpDeviceID(ir::Node *node) const; + + size_t GetAppropriateDeviceID( + const std::vector &var_names) const; + + virtual void ResetState() const; + + mutable std::unordered_map sharded_var_device_; mutable std::vector balance_vars_; }; + +class ReduceSSAGraphBuilder : public BalanceVarSSAGraphBuilder { + protected: + virtual void Init() const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const; + + virtual std::vector SortOperations(const ir::Graph &graph) const; + + virtual void ResetState() const; + + int GetOpDeviceID(ir::Node *node, + std::unordered_map> + *delay_ops) const; + + std::vector SortForReduceMode( + const std::vector &topo_ops) const; + + mutable std::vector> bcast_var_name_set_; +}; + +class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder { + protected: + virtual void Init() const; + + virtual bool DealWithSpecialOp(ir::Graph *result, ir::Node *node) const; + + virtual void InsertPostprocessOps(ir::Graph *result) const; + + virtual void InsertCollectiveOp(ir::Graph *result, const std::string &p_name, + const std::string &g_name) const; + + virtual void ResetState() const; + + int CreateRPCOp(ir::Graph *result, ir::Node *node) const; + + int CreateDistTrainOp(ir::Graph *result, ir::Node *node) const; + + mutable std::vector> bcast_var_name_set_; + mutable bool need_broadcast_var_{false}; +}; + +std::unordered_set &MultiDevSSAGraphBuilder(); + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc new file mode 100644 index 0000000000000000000000000000000000000000..128aaa33a2c60e62fdca13768cdc0a815167f3ef --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( + const ExecutionStrategy &strategy, const std::vector &local_scopes, + const std::vector &places, + std::vector> &&graphs) + : strategy_(std::move(strategy)), + local_scopes_(std::move(local_scopes)), + pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), + places_(std::move(places)), + graphs_(std::move(graphs)) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); + + // set the correct size of thread pool to each device. + strategy_.num_threads_ = strategy_.num_threads_ < places_.size() + ? 1UL + : strategy_.num_threads_ / places_.size(); + VLOG(1) << "set num_threads: " << strategy_.num_threads_ + << " to run the operators of the graph on each device."; + for (size_t i = 0; i < places.size(); ++i) { + executors_.emplace_back(new details::ThreadedSSAGraphExecutor( + strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); + } +} + +FeedFetchList ParallelSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + std::vector> run_futures; + + std::vector fetch_data; + FeedFetchList ret; + + fetch_data.reserve(places_.size()); + ret.reserve(fetch_tensors.size()); + exception_holder_.Clear(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto call = [this, i, &fetch_tensors]() -> FeedFetchList { + try { + return executors_[i]->Run(fetch_tensors); + } catch (...) { + exception_holder_.Catch(std::current_exception()); + } + return FeedFetchList(); + }; + + if (pool_) { + run_futures.emplace_back(pool_->enqueue(std::move(call))); + } else { + fetch_data.emplace_back(std::move(call())); + } + } + + if (pool_) { + for (auto &f : run_futures) { + if (exception_holder_.IsCaught()) { + f.wait(); + } else { + fetch_data.emplace_back(std::move(f.get())); + } + } + } + if (exception_holder_.IsCaught()) { + exception_holder_.ReThrow(); + } + + for (size_t fetch_idx = 0; fetch_idx < fetch_tensors.size(); ++fetch_idx) { + std::vector lodtensor_ptrs; + lodtensor_ptrs.reserve(local_scopes_.size()); + for (size_t scope_idx = 0; scope_idx < local_scopes_.size(); ++scope_idx) { + lodtensor_ptrs.push_back(&fetch_data.at(scope_idx).at(fetch_idx)); + } + ret.emplace_back(); + ret.back().MergeLoDTensor(lodtensor_ptrs, platform::CPUPlace()); + } + return ret; +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h new file mode 100644 index 0000000000000000000000000000000000000000..c00c5bc2d1b4b78593f99c819b5a3d642150e773 --- /dev/null +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "ThreadPool.h" +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +class ParallelSSAGraphExecutor : public SSAGraphExecutor { + public: + ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, + const std::vector &local_scopes, + const std::vector &places, + std::vector> &&graphs); + ~ParallelSSAGraphExecutor() final = default; + const ir::Graph &Graph() const override { return *graphs_[0]; } + + FeedFetchList Run(const std::vector &fetch_tensors) override; + + private: + ExecutionStrategy strategy_; + std::vector local_scopes_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; + std::vector places_; + std::vector> graphs_; + + std::vector> executors_; + ExceptionHolder exception_holder_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 1ed4b2c8e860312a88450a0eba9c2de9191f5fe8..91e4f9adb418978c30f512abe6924c0ace182124 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -56,7 +56,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( } } std::vector fetch_data; - std::exception_ptr eptr; + std::exception_ptr eptr = nullptr; try { fetch_data = underlying_executor_->Run(fetch_tensors); } catch (...) { diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index f1642bc0d2b10f97295e80ee201db8f83bfd06ef..86e6b1f7d92bc7bc97180e05f6a7c14ab375f92f 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -40,14 +40,14 @@ void NaiveExecutor::Prepare(Scope *scope, const ProgramDesc &program_desc, void NaiveExecutor::Run() { #ifndef PADDLE_ON_INFERENCE - LOG_FIRST_N(WARNING, 15) << "The NaiveExecutor can not work properly if the " - "cmake flag ON_INFER is not set."; - LOG_FIRST_N(WARNING, 15) << "Unlike the training phase, all the scopes and " - "variables will be reused to save the allocation " - "overhead."; - LOG_FIRST_N(WARNING, 15) << "Please re-compile the inference library by " - "setting the cmake flag ON_INFER=ON if you are " - "running Paddle Inference"; + LOG_FIRST_N(WARNING, 5) << "The NaiveExecutor can not work properly if the " + "cmake flag ON_INFER is not set."; + LOG_FIRST_N(WARNING, 5) << "Unlike the training phase, all the scopes and " + "variables will be reused to save the allocation " + "overhead."; + LOG_FIRST_N(WARNING, 5) << "Please re-compile the inference library by " + "setting the cmake flag ON_INFER=ON if you are " + "running Paddle Inference"; #endif // PADDLE_ON_INFERENCE for (auto &op : ops_) { VLOG(3) << std::this_thread::get_id() << " run " << op->Type() diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index 42190b52289bfc6fc510f13cb5190a0d3e03b836..b083493ba4f4d2ea35e805333e028ed7840f9c8d 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -32,8 +32,11 @@ std::map>>)>> NgraphBridge::NG_NODE_MAP = { {"fill_constant", paddle::operators::ngraphs::BuildFillConstantNode}, + {"mean", paddle::operators::ngraphs::BuildMeanNode}, + {"mean_grad", paddle::operators::ngraphs::BuildMeanGradNode}, {"mul", paddle::operators::ngraphs::BuildMulNode}, {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, + {"scale", paddle::operators::ngraphs::BuildScaleNode}, {"relu", paddle::operators::ngraphs::BuildUnaryNode}, {"tanh", paddle::operators::ngraphs::BuildUnaryNode}, {"top_k", paddle::operators::ngraphs::BuildTopKNode}}; diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 57345f12ccc5d59c84001f1c5c1ebdacadc97ed5..7e174c7def1ffa4089a94d9cc504b18843557c53 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -539,7 +539,7 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { } } - backend_->call(ngraph_function_, t_out, t_in); + backend_->call(backend_->compile(ngraph_function_), t_out, t_in); } // NgraphEngine::RunImpl } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 5709eb1a7d4d26e6cc358651c1521ebf9a279801..4d29564aeed74558b7f0ec580568f70dad0b40cc 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -377,6 +377,30 @@ class ExecutionContext { return op_.Outputs(name); } + template + Tensor AllocateTmpTensor(const framework::DDim& dim, + const DevContext& dev_ctx) const { + auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance() + .Get(dev_ctx) + .Allocate(product(dim) * sizeof(T)); + auto& deleter = tmp_allocation_ptr.get_deleter(); + auto* allocation_ptr = tmp_allocation_ptr.release(); + auto shared_allocation = std::shared_ptr( + allocation_ptr, deleter); + + PADDLE_ENFORCE( + dynamic_cast(allocation_ptr) != nullptr, + "The AllocationPtr must be TemporaryAllocation."); + PADDLE_ENFORCE_EQ(allocation_ptr->size(), + framework::product(dim) * sizeof(T)); + + paddle::framework::Tensor temp_tensor( + framework::ToDataType(std::type_index(typeid(T)))); + temp_tensor.Resize(dim); + temp_tensor.ResetHolder(std::move(shared_allocation)); + return temp_tensor; + } + private: const OperatorBase& op_; const Scope& scope_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e14b74a87302a92de7724f3822859026a44b13d0..450fe1508f2a505a233b3d300cb7c500894231e7 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -21,12 +21,9 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph.h" -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) -#include "paddle/fluid/platform/nccl_helper.h" -#endif - #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" #include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" @@ -38,6 +35,8 @@ limitations under the License. */ DEFINE_string(pe_profile_fname, "", "Profiler filename for PE, which generated by gperftools." "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable."); +DEFINE_bool(enable_parallel_graph, false, + "Force disable parallel graph execution mode if set false."); namespace paddle { namespace framework { @@ -106,6 +105,7 @@ class ParallelExecutorPrivate { bool own_local_scope_; bool use_cuda_; bool use_all_reduce_; + size_t nranks_; // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and // then keeps unchanged @@ -201,6 +201,7 @@ ParallelExecutor::ParallelExecutor( member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; + member_->nranks_ = num_trainers * places.size(); if (!member_->use_all_reduce_) { PADDLE_ENFORCE(places.size() > 1, @@ -224,62 +225,98 @@ ParallelExecutor::ParallelExecutor( } } + // FIXME(Yancey1989): parallel graph mode get better performance + // in GPU allreduce distributed training. Need an elegant way to + // choice the execution strategy. + build_strategy.enable_parallel_graph_ = + EnableParallelGraphExecution(main_program, exec_strategy, build_strategy); + + VLOG(1) << "Enable ParallelGraph Execution: " + << build_strategy.enable_parallel_graph_; + if (member_->use_cuda_) { // Bcast Parameters to all GPUs #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); ncclUniqueId *nccl_id = nullptr; + // gen_nccl_id operator can broadcast the ncclUniqueId for nccl2 collective + // distributed training + auto *nccl_id_var = scope->FindVar(NCCL_ID_VARNAME); if (nccl_id_var != nullptr) { nccl_id = nccl_id_var->GetMutable(); } + if (build_strategy.enable_parallel_graph_ && member_->nranks_ > 1UL) { + if (nccl_id == nullptr) { + local_nccl_id_.reset(new ncclUniqueId()); + platform::dynload::ncclGetUniqueId(local_nccl_id_.get()); + nccl_id = local_nccl_id_.get(); + } + } + member_->nccl_ctxs_.reset(new platform::NCCLContextMap( member_->places_, nccl_id, num_trainers, trainer_id)); #else PADDLE_THROW("Not compiled with CUDA"); #endif } - if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { BCastParamsToDevices(bcast_vars); } -// Startup Program has been run. All local scopes has correct parameters. + // Startup Program has been run. All local scopes has correct parameters. -// Step 2. Convert main_program to SSA form and dependency graph. Also, insert -// ncclOp + // Step 2. Convert main_program to SSA form and dependency graph. Also, insert + // ncclOp + std::vector> graphs; #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + if (build_strategy.enable_parallel_graph_) { + for (size_t i = 0; i < member_->places_.size(); ++i) { + std::unique_ptr graph = build_strategy.Apply( + main_program, {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_, + member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); + } + } else { + std::unique_ptr graph = build_strategy.Apply( + main_program, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get()); + graphs.push_back(std::move(graph)); + } +#else std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, member_->local_scopes_, - member_->use_cuda_, member_->nccl_ctxs_.get()); -#else - std::unique_ptr graph = - build_strategy.Apply(main_program, member_->places_, loss_var_name, - member_->local_scopes_, member_->use_cuda_); + member_->nranks_, member_->use_cuda_); + graphs.push_back(std::move(graph)); #endif auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { - graph = member_->PrepareGCAndRefCnts(std::move(graph), - static_cast(max_memory_size)); + for (size_t i = 0; i < graphs.size(); ++i) { + graphs[i] = member_->PrepareGCAndRefCnts( + std::move(graphs[i]), static_cast(max_memory_size)); + } } // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); + for (auto &graph : graphs) { + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos.emplace_back(); + var_infos.back().name_ = node->Var()->Name(); + var_infos.back().type_ = node->Var()->GetType(); + var_infos.back().persistable_ = node->Var()->Persistable(); + } } } + // If the loss_var_name is given, the number of graph should be only one. if (loss_var_name.size()) { - size_t graph_num = ir::GraphNum(*graph); + size_t graph_num = ir::GraphNum(*graphs[0]); if (graph_num > 1) { LOG(WARNING) << "The number of graph should be only one, " "but the current graph has " - << ir::GraphNum(*graph) + << ir::GraphNum(*graphs[0]) << " sub_graphs. If you want to see the nodes of the " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "to specify the output dir. NOTES: if you not do training, " @@ -287,14 +324,20 @@ ParallelExecutor::ParallelExecutor( } } - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + if (build_strategy.enable_parallel_graph_) { + member_->executor_.reset(new details::ParallelSSAGraphExecutor( exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + std::move(graphs))); } else { - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->places_, - std::move(graph))); + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); + } else { + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graphs[0]))); + } } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -423,6 +466,36 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( } } +bool ParallelExecutor::EnableParallelGraphExecution( + const ProgramDesc &main_program, const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy) const { + if (!FLAGS_enable_parallel_graph) return false; + + bool enable_parallel_graph = true; + // TODO(Yancey1989): support sparse update in ParallelGraph mode. + for (auto &var_desc : main_program.Block(0).AllVars()) { + if (var_desc->GetType() == proto::VarType::SELECTED_ROWS) { + enable_parallel_graph = false; + } + } + + // TODO(Yancey1989): support pserver mode + for (auto &op_desc : main_program.Block(0).AllOps()) { + if (op_desc->Type() == "send" || op_desc->Type() == "recv") { + enable_parallel_graph = false; + break; + } + } + + if (!member_->use_all_reduce_ || !member_->use_cuda_) + enable_parallel_graph = false; + + if (build_strategy.enable_sequential_execution_ || + exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) + enable_parallel_graph = false; + return enable_parallel_graph; +} + ParallelExecutor::~ParallelExecutor() { for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 5f6c2159aa2d90378ac298a8e56b51a188225d45..49d3f0d3f6f2a8965d39b656071d86bde42bfd93 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -28,6 +28,10 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) +#include "paddle/fluid/platform/nccl_helper.h" +#endif + namespace paddle { namespace framework { @@ -68,8 +72,14 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; + bool EnableParallelGraphExecution(const ProgramDesc &main_program, + const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy) const; ParallelExecutorPrivate *member_; +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + std::unique_ptr local_nccl_id_; +#endif }; } // namespace framework diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 871c7bd2a77d1cc5057177619b5cd7b2083ff308..1ffd357e62b4bdc72dbec627c463730aa2c8f720 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -151,27 +151,5 @@ void TensorToVector(const Tensor& src, std::vector* dst) { memory::Copy(dst_place, dst_ptr, boost::get(src.place()), src_ptr, size); } - -template -paddle::framework::Tensor GetTensor( - memory::allocation::AllocationPtr temp_allocation_ptr, - const framework::DDim& dim) { - auto& deleter = temp_allocation_ptr.get_deleter(); - auto* allocation_ptr = temp_allocation_ptr.release(); - auto shared_allocation = - std::shared_ptr(allocation_ptr, deleter); - - PADDLE_ENFORCE( - dynamic_cast(allocation_ptr) != nullptr, - "The AllocationPtr must be TemporaryAllocation."); - PADDLE_ENFORCE_EQ(allocation_ptr->size(), - framework::product(dim) * sizeof(T)); - - paddle::framework::Tensor temp_tensor( - framework::ToDataType(std::type_index(typeid(T)))); - temp_tensor.Resize(dim); - temp_tensor.ResetHolder(std::move(shared_allocation)); - return temp_tensor; -} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/threadpool.cc b/paddle/fluid/framework/threadpool.cc index fcec955360f1c681a62929e904d5736854a8ffad..d34f826c1abb99198fd4dbe9537495edff7b63af 100644 --- a/paddle/fluid/framework/threadpool.cc +++ b/paddle/fluid/framework/threadpool.cc @@ -89,7 +89,6 @@ void ThreadPool::TaskLoop() { task = std::move(tasks_.front()); tasks_.pop(); } - // run the task task(); } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2db5705d0944b2ab10defdda9a7b616daa8fd47e..2d8980b1d15d89cdf9c243a57188a0acb354940d 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -123,8 +123,6 @@ struct Argument { DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); - DECL_ARGUMENT_FIELD(tensorrt_node_teller, TensorRtNodeTeller, - std::function); DECL_ARGUMENT_FIELD(tensorrt_max_batch_size, TensorRtMaxBatchSize, int); DECL_ARGUMENT_FIELD(tensorrt_workspace_size, TensorRtWorkspaceSize, int); DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index b8c9426ed3b62d35f78247269cb32d2f6344b092..e37fea38bcb2b1f514347ecbfe7072abb6f07455 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -49,13 +49,6 @@ void IRPassManager::CreatePasses(Argument *argument, for (const std::string &pass_name : passes) { auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); - // Set some pass attributes. - if (pass_name == "ir_analysis_pass") { - pass->Set("tensorrt_node_teller", - new SubgraphDetector::NodeInsideSubgraphTeller( - argument->tensorrt_node_teller())); - } - if (pass_name == "graph_viz_pass") { std::string dot_file_path = std::to_string(pass_num) + "_ir_" + (pre_pass.empty() ? "origin" : pre_pass) + @@ -70,9 +63,6 @@ void IRPassManager::CreatePasses(Argument *argument, } if (pass_name == "tensorrt_subgraph_pass") { - PADDLE_ENFORCE(argument->tensorrt_node_teller_valid()); - pass->SetNotOwned("tensorrt_node_teller", - argument->tensorrt_node_teller_ptr()); pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", diff --git a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index 822c7799bb3ae6d79da6cf2a7b3c8c9b20353ed7..9ae5b8aa173b85904df360eb196aefe5af08c6aa 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -1,9 +1,13 @@ cc_library(subgraph_detector SRCS subgraph_detector.cc DEPS proto_desc) -cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector) -set(analysis_deps ${analysis_deps} - subgraph_detector tensorrt_subgraph_pass - CACHE INTERNAL "") -set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) -file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") -set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") +if (TENSORRT_FOUND) + cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_detector tensorrt_op_teller) + + set(analysis_deps ${analysis_deps} + subgraph_detector tensorrt_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h) + file(APPEND ${pass_file} "USE_PASS(tensorrt_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} tensorrt_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index ad10010e42be9717e3298fc88c89764e4ae2690b..bc06e78ae6997b0d4d0456c15d6e4158efdad300 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include "paddle/fluid/inference/tensorrt/op_teller.h" namespace paddle { namespace inference { @@ -35,8 +36,10 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( std::unique_ptr graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); - auto teller = - Get("tensorrt_node_teller"); + auto teller = [](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return tensorrt::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); + }; SubGraphFuser fuser(graph.get(), teller, Get("min_subgraph_size") /*min subgraph size*/); @@ -232,7 +235,6 @@ std::vector ExtractParameters( REGISTER_PASS(tensorrt_subgraph_pass, paddle::inference::analysis::TensorRtSubgraphPass) - .RequirePassAttr("tensorrt_node_teller") .RequirePassAttr("max_batch_size") .RequirePassAttr("workspace_size") .RequirePassAttr("min_subgraph_size"); diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc index c3a2b3ca1d3b09e71921fde0b0bad8d195aaa38f..490189e550760b4de62724e685dd07f6e521445e 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.cc @@ -27,9 +27,6 @@ namespace analysis { void IrAnalysisComposePass::RunImpl(Argument *argument) { ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); - if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { - InitTensorRTAttrs(argument); - } ApplyIrPasses(argument); CollectFusionStatis(argument); } @@ -38,26 +35,6 @@ std::string IrAnalysisComposePass::repr() const { return "ir-analysis-compose-pass"; } -void IrAnalysisComposePass::InitTensorRTAttrs(Argument *argument) { - if (argument->use_tensorrt_valid() && argument->use_tensorrt()) { - LOG(INFO) << "Initing TensorRT pass"; - argument->SetTensorRtNodeTeller([](const framework::ir::Node *node) { - std::unordered_set teller_set( - {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", - "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", - "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", - "conv2d_transpose", "leaky_relu"}); - if (!node->IsOp()) return false; - - if (teller_set.count(node->Op()->Type())) { - return true; - } else { - return false; - } - }); - } -} - void IrAnalysisComposePass::ApplyIrPasses(Argument *argument) { std::vector passes({ "ir_graph_build_pass", "ir_analysis_pass", diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h index 53e2ebb0038a5c105f68a0146b3da90a6ae34af8..16c6b7d84df88d0ebbc06b547c75a45dcb0c2440 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_compose_pass.h @@ -33,8 +33,6 @@ class IrAnalysisComposePass : public AnalysisPass { std::string repr() const override; private: - void InitTensorRTAttrs(Argument* argument); - void ApplyIrPasses(Argument* argument); void CollectFusionStatis(Argument* argument); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 6d6e799fdec9c67b4714f203b91b8bccb61510ba..211c691504de2c0bd8ff50f34b92cbc01397d5c9 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -14,86 +14,101 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle_pass_builder.h" // NOLINT +#include "paddle/fluid/platform/gpu_info.h" namespace paddle { PassStrategy *contrib::AnalysisConfig::pass_builder() const { - PADDLE_ENFORCE( - pass_builder_.get(), - "Should call constructor first, that will init the pass_builder_."); + if (!pass_builder_.get()) { + if (use_gpu_) { + LOG(INFO) << "Create GPU IR passes"; + pass_builder_.reset(new GpuPassStrategy); + } else { + LOG(INFO) << "Create CPU IR passes"; + pass_builder_.reset(new CpuPassStrategy); + } + } else if (pass_builder_->use_gpu() ^ use_gpu()) { + LOG(WARNING) << "The use_gpu flag is not compatible between Config and " + "PassBuilder, the flags are " + << use_gpu() << " " << pass_builder_->use_gpu(); + LOG(WARNING) << "Please make them compatible, still use the existing " + "PassBuilder."; + } + return pass_builder_.get(); } -contrib::AnalysisConfig::AnalysisConfig(bool use_gpu) { - this->use_gpu = use_gpu; - if (use_gpu) { - pass_builder_.reset(new GpuPassStrategy); - } else { - pass_builder_.reset(new CpuPassStrategy); - } +contrib::AnalysisConfig::AnalysisConfig(const std::string &model_dir) { + model_dir_ = model_dir; +} +contrib::AnalysisConfig::AnalysisConfig(const std::string &prog_file, + const std::string ¶ms_file) { + prog_file_ = prog_file; + params_file_ = params_file; +} +void contrib::AnalysisConfig::SetModel(const std::string &prog_file_path, + const std::string ¶ms_file_path) { + prog_file_ = prog_file_path; + params_file_ = params_file_path; +} +void contrib::AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, + int device_id) { +#ifdef PADDLE_WITH_CUDA + use_gpu_ = true; + memory_pool_init_size_mb_ = memory_pool_init_size_mb; + device_id_ = device_id; +#else + LOG(ERROR) << "Please compile with gpu to EnableGpu"; + use_gpu_ = false; +#endif } +void contrib::AnalysisConfig::DisableGpu() { use_gpu_ = false; } contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { - // fields from Config - model_dir = other.model_dir; - // fields from NativeConfig - use_gpu = other.use_gpu; - device = other.device; - fraction_of_gpu_memory = other.fraction_of_gpu_memory; - prog_file = other.prog_file; - param_file = other.param_file; - specify_input_name = other.specify_input_name; - cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; - // fields from this. - enable_ir_optim = other.enable_ir_optim; - // For mkldnn - use_mkldnn_ = other.use_mkldnn_; - mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; - - use_feed_fetch_ops = other.use_feed_fetch_ops; - use_tensorrt_ = other.use_tensorrt_; - tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; - tensorrt_workspace_size_ = other.tensorrt_workspace_size_; - tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_; - model_from_memory_ = other.model_from_memory_; - - if (use_gpu) { +#define CP_MEMBER(member__) member__ = other.member__; + + // Model related. + CP_MEMBER(model_dir_); + CP_MEMBER(prog_file_); + CP_MEMBER(params_file_); + CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and + // params_file_ fields. + // Gpu releated. + CP_MEMBER(use_gpu_); + CP_MEMBER(device_id_); + CP_MEMBER(memory_pool_init_size_mb_); + // TensorRT releated. + CP_MEMBER(use_tensorrt_); + CP_MEMBER(tensorrt_workspace_size_); + CP_MEMBER(tensorrt_max_batchsize_); + CP_MEMBER(tensorrt_min_subgraph_size_); + // MKLDNN releated. + CP_MEMBER(use_mkldnn_); + CP_MEMBER(mkldnn_enabled_op_types_); + + // Ir related. + CP_MEMBER(enable_ir_optim_); + CP_MEMBER(use_feed_fetch_ops_); + CP_MEMBER(ir_debug_); + CP_MEMBER(specify_input_name_); + + CP_MEMBER(cpu_math_library_num_threads_); + + CP_MEMBER(serialized_info_cache_); + + if (use_gpu_) { pass_builder_.reset(new GpuPassStrategy( *static_cast(other.pass_builder()))); } else { pass_builder_.reset(new CpuPassStrategy( *static_cast(other.pass_builder()))); } -} -contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { - // fields from Config - model_dir = other.model_dir; - // fields from NativeConfig - use_gpu = other.use_gpu; - device = other.device; - fraction_of_gpu_memory = other.fraction_of_gpu_memory; - prog_file = other.prog_file; - param_file = other.param_file; - specify_input_name = other.specify_input_name; - cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; - // fields from this. - enable_ir_optim = other.enable_ir_optim; - // For mkldnn - use_mkldnn_ = other.use_mkldnn_; - mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; - - use_feed_fetch_ops = other.use_feed_fetch_ops; - use_tensorrt_ = other.use_tensorrt_; - tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; - tensorrt_workspace_size_ = other.tensorrt_workspace_size_; - tensorrt_min_subgraph_size_ = other.tensorrt_min_subgraph_size_; - model_from_memory_ = other.model_from_memory_; - - pass_builder_ = std::move(other.pass_builder_); +#undef CP_MEMBER } void contrib::AnalysisConfig::EnableMKLDNN() { @@ -112,17 +127,90 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, use_tensorrt_ = true; tensorrt_workspace_size_ = workspace_size; tensorrt_max_batchsize_ = max_batch_size; - tensorrt_min_subgraph_size_ = min_subgraph_size; - // Append after the conv+affine_channel fuse pass. - pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); +} + +void contrib::AnalysisConfig::Update() { + auto info = SerializeInfoCache(); + if (info == serialized_info_cache_) return; + + if (use_gpu_) { + pass_builder_.reset(new GpuPassStrategy); + } else { + pass_builder_.reset(new CpuPassStrategy); + } + + if (use_tensorrt_) { + if (!use_gpu_) { + LOG(ERROR) + << "TensorRT engine is not available when EnableGpu() not actived."; + } else { + // Append after the infer_clean pass. + pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); + } + } + + if (use_mkldnn_) { + if (!enable_ir_optim_) { + LOG(ERROR) + << "EnableMKLDNN() only works when IR optimization is enabled."; + } +#ifdef PADDLE_WITH_MKLDNN + pass_builder()->EnableMKLDNN(); + use_mkldnn_ = true; +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; + use_mkldnn_ = false; +#endif + } + + if (ir_debug_) { + pass_builder()->TurnOnDebug(); + } +} + +std::string contrib::AnalysisConfig::SerializeInfoCache() { + std::stringstream ss; + ss << use_gpu_; + ss << memory_pool_init_size_mb_; + + ss << use_tensorrt_; + ss << tensorrt_workspace_size_; + ss << tensorrt_max_batchsize_; + + ss << use_mkldnn_; + ss << enable_ir_optim_; + ss << use_feed_fetch_ops_; + ss << ir_debug_; + + return ss.str(); +} + +void contrib::AnalysisConfig::SetCpuMathLibraryNumThreads( + int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; +} + +float contrib::AnalysisConfig::fraction_of_gpu_memory_for_pool() const { +#ifdef PADDLE_WITH_CUDA + // Get the GPU memory details and calculate the fraction of memory for the + // GPU memory pool. + size_t gpu_used, gpu_available; + platform::GpuMemoryUsage(&gpu_used, &gpu_available); + double total_gpu_memory = (gpu_used + gpu_available) / 1024. / 1024.; + float fraction_of_gpu_memory = + static_cast(memory_pool_init_size_mb()) / total_gpu_memory; + return fraction_of_gpu_memory; +#else + return 0.; +#endif } void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, size_t prog_buffer_size, const char *param_buffer, size_t param_buffer_size) { - prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size); - param_file = std::string(param_buffer, param_buffer + param_buffer_size); + prog_file_ = std::string(prog_buffer, prog_buffer + prog_buffer_size); + params_file_ = std::string(param_buffer, param_buffer + param_buffer_size); model_from_memory_ = true; } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 3aaec10ee2d442f834c490d51d73a58421d2c38f..585634fae9c85f77cc77d774ac166891014a025c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -33,6 +33,7 @@ #include "paddle/fluid/inference/utils/singleton.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); @@ -59,8 +60,8 @@ bool AnalysisPredictor::Init( if (FLAGS_profile) { LOG(WARNING) << "Profiler is actived, might affect the performance"; LOG(INFO) << "You can turn off by set gflags '-profile false'"; - auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll - : platform::ProfilerState::kCPU; + auto tracking_device = config_.use_gpu() ? platform::ProfilerState::kAll + : platform::ProfilerState::kCPU; platform::EnableProfiler(tracking_device); } @@ -112,7 +113,7 @@ bool AnalysisPredictor::PrepareProgram( // Optimize the program, and load parameters and modify them in the // scope_. // This will change the scope_ address. - if (config_.enable_ir_optim) { + if (config_.ir_optim()) { status_ir_optim_enabled_ = true; OptimizeInferenceProgram(); } else { @@ -140,9 +141,9 @@ bool AnalysisPredictor::PrepareProgram( return true; } bool AnalysisPredictor::CreateExecutor() { - if (config_.use_gpu) { + if (config_.use_gpu_) { status_use_gpu_ = true; - place_ = paddle::platform::CUDAPlace(config_.device); + place_ = paddle::platform::CUDAPlace(config_.device_id_); } else { place_ = paddle::platform::CPUPlace(); } @@ -151,7 +152,7 @@ bool AnalysisPredictor::CreateExecutor() { } bool AnalysisPredictor::PrepareExecutor() { executor_->Prepare(sub_scope_, *inference_program_, 0, - config_.use_feed_fetch_ops); + config_.use_feed_fetch_ops_); PADDLE_ENFORCE_NOT_NULL(sub_scope_); @@ -250,7 +251,7 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, } input.set_lod(lod); int idx = -1; - if (config_.specify_input_name) { + if (config_.specify_input_name_) { auto name = inputs[i].name; if (feed_names_.find(name) == feed_names_.end()) { LOG(ERROR) << "feed names from program do not have name: [" << name @@ -314,22 +315,22 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, void AnalysisPredictor::OptimizeInferenceProgram() { status_program_optimized_ = true; - argument_.SetUseGPU(config_.use_gpu); - argument_.SetGPUDeviceId(config_.device); + argument_.SetUseGPU(config_.use_gpu()); + argument_.SetGPUDeviceId(config_.gpu_device_id()); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program - if (!config_.model_dir.empty()) { - argument_.SetModelDir(config_.model_dir); + if (!config_.model_dir().empty()) { + argument_.SetModelDir(config_.model_dir()); } else { PADDLE_ENFORCE( - !config_.param_file.empty(), + !config_.params_file().empty(), "Either model_dir or (param_file, prog_file) should be set."); - PADDLE_ENFORCE(!config_.prog_file.empty()); - argument_.SetModelProgramPath(config_.prog_file); - argument_.SetModelParamsPath(config_.param_file); + PADDLE_ENFORCE(!config_.prog_file().empty()); + argument_.SetModelProgramPath(config_.prog_file()); + argument_.SetModelParamsPath(config_.params_file()); } - if (config_.use_gpu && config_.use_tensorrt_) { + if (config_.use_gpu() && config_.tensorrt_engine_enabled()) { argument_.SetUseTensorRT(true); argument_.SetTensorRtWorkspaceSize(config_.tensorrt_workspace_size_); argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); @@ -341,7 +342,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } auto passes = config_.pass_builder()->AllPasses(); - if (!config_.enable_ir_optim) passes.clear(); + if (!config_.ir_optim()) passes.clear(); argument_.SetIrAnalysisPasses(passes); argument_.SetScopeNotOwned(const_cast(scope_.get())); Analyzer().Run(&argument_); @@ -358,18 +359,26 @@ template <> std::unique_ptr CreatePaddlePredictor< AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { VLOG(3) << "create AnalysisConfig"; - if (config.use_gpu) { + if (config.use_gpu()) { // 1. GPU memeroy - PADDLE_ENFORCE_GT( - config.fraction_of_gpu_memory, 0.f, - "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); - PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); + PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); + PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", + config.gpu_device_id()); std::vector flags; - if (config.fraction_of_gpu_memory >= 0.0f || - config.fraction_of_gpu_memory <= 0.95f) { + + float fraction_of_gpu_memory = config.fraction_of_gpu_memory_for_pool(); + if (fraction_of_gpu_memory > 0.95f) { + LOG(ERROR) + << "Allocate too much memory for the GPU memory pool, assigned " + << config.memory_pool_init_size_mb() << " MB"; + LOG(ERROR) + << "Try to shink the value by setting AnalysisConfig::EnableGpu(...)"; + } + + if (fraction_of_gpu_memory >= 0.0f || fraction_of_gpu_memory <= 0.95f) { flags.push_back("dummpy"); std::string flag = "--fraction_of_gpu_memory_to_use=" + - std::to_string(config.fraction_of_gpu_memory); + std::to_string(fraction_of_gpu_memory); flags.push_back(flag); VLOG(3) << "set flag: " << flag; framework::InitGflags(flags); @@ -443,22 +452,22 @@ bool AnalysisPredictor::ZeroCopyRun() { bool AnalysisPredictor::LoadProgramDesc() { // Initialize the inference program std::string filename; - if (!config_.model_dir.empty()) { - filename = config_.model_dir + "/__model__"; - } else if (!config_.prog_file.empty() && !config_.param_file.empty()) { + if (!config_.model_dir().empty()) { + filename = config_.model_dir() + "/__model__"; + } else if (!config_.prog_file().empty() && !config_.params_file().empty()) { // All parameters are saved in a single file. // The file names should be consistent with that used // in Python API `fluid.io.save_inference_model`. - filename = config_.prog_file; + filename = config_.prog_file(); } else { - if (config_.model_dir.empty() && config_.prog_file.empty()) { + if (config_.model_dir().empty() && config_.prog_file().empty()) { LOG(ERROR) << "Either model_dir or (prog_file, param_file) should be set."; return false; } LOG(ERROR) << string::Sprintf( - "not valid model path '%s' or program path '%s'.", config_.model_dir, - config_.param_file); + "not valid model path '%s' or program path '%s'.", config_.model_dir(), + config_.params_file()); return false; } @@ -478,7 +487,7 @@ bool AnalysisPredictor::LoadProgramDesc() { proto.ParseFromString(pb_content); } else { - proto.ParseFromString(config_.prog_file); + proto.ParseFromString(config_.prog_file()); } inference_program_.reset(new framework::ProgramDesc(proto)); return true; @@ -508,27 +517,27 @@ bool AnalysisPredictor::LoadParameters() { new_var->SetLoDLevel(var->GetLoDLevel()); new_var->SetPersistable(true); - if (!config_.param_file.empty()) { + if (!config_.params_file().empty()) { params.push_back(new_var->Name()); } else { // append_op framework::OpDesc *op = load_block->AppendOp(); op->SetType("load"); op->SetOutput("Out", {new_var->Name()}); - op->SetAttr("file_path", {config_.model_dir + "/" + new_var->Name()}); + op->SetAttr("file_path", {config_.model_dir() + "/" + new_var->Name()}); op->CheckAttrs(); } } } - if (!config_.param_file.empty()) { + if (!config_.params_file().empty()) { // sort paramlist to have consistent ordering std::sort(params.begin(), params.end()); // append just the load_combine op framework::OpDesc *op = load_block->AppendOp(); op->SetType("load_combine"); op->SetOutput("Out", params); - op->SetAttr("file_path", {config_.param_file}); + op->SetAttr("file_path", {config_.params_file()}); op->CheckAttrs(); } diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index a361b34437ade36dfba2c99db800a7d77ada8704..6169e60541e4a14d560e719d56624b3219dbcefd 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -25,9 +25,9 @@ namespace paddle { using contrib::AnalysisConfig; TEST(AnalysisPredictor, analysis_off) { - AnalysisConfig config(false); - config.model_dir = FLAGS_dirname; - config.enable_ir_optim = false; + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(false); auto _predictor = CreatePaddlePredictor(config); auto* predictor = static_cast(_predictor.get()); @@ -55,14 +55,14 @@ TEST(AnalysisPredictor, analysis_off) { } TEST(AnalysisPredictor, analysis_on) { + AnalysisConfig config; + config.SetModel(FLAGS_dirname); + config.SwitchIrOptim(true); #ifdef PADDLE_WITH_CUDA - AnalysisConfig config(true); - config.fraction_of_gpu_memory = 0.15; + config.EnableUseGpu(100, 0); #else - AnalysisConfig config; + config.DisableGpu(); #endif - config.model_dir = FLAGS_dirname; - config.enable_ir_optim = true; auto _predictor = CreatePaddlePredictor(config); auto* predictor = static_cast(_predictor.get()); @@ -89,7 +89,8 @@ TEST(AnalysisPredictor, analysis_on) { } // compare with NativePredictor - auto naive_predictor = CreatePaddlePredictor(config); + auto naive_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); std::vector naive_outputs; ASSERT_TRUE(naive_predictor->Run(inputs, &naive_outputs)); ASSERT_EQ(naive_outputs.size(), 1UL); @@ -98,9 +99,8 @@ TEST(AnalysisPredictor, analysis_on) { TEST(AnalysisPredictor, ZeroCopy) { AnalysisConfig config; - config.model_dir = FLAGS_dirname; - config.use_feed_fetch_ops = false; - + config.SetModel(FLAGS_dirname); + config.SwitchUseFeedFetchOps(false); auto predictor = CreatePaddlePredictor(config); auto w0 = predictor->GetInputTensor("firstw"); @@ -137,9 +137,9 @@ TEST(AnalysisPredictor, ZeroCopy) { TEST(AnalysisPredictor, Clone) { AnalysisConfig config; - config.model_dir = FLAGS_dirname; - config.use_feed_fetch_ops = true; - config.enable_ir_optim = true; + config.SetModel(FLAGS_dirname); + config.SwitchUseFeedFetchOps(true); + config.SwitchIrOptim(true); std::vector> predictors; predictors.emplace_back(CreatePaddlePredictor(config)); diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index 6a8b81cc57281b12cd3a4c89c863b20a824ce34a..e14d93de2c41f740bc175c8e59412d7b828dd381 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -19,8 +19,6 @@ limitations under the License. */ #pragma once -#define WITH_ANAKIN - #include #include "framework/core/net/net.h" diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 102147a493ed1454db1a78124200f163f68e555b..85e250aaaf4a18a261a4bfc5271670f93565a336 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -288,7 +288,7 @@ std::unique_ptr CreatePaddlePredictor< VLOG(3) << "create NativePaddlePredictor"; if (config.use_gpu) { // 1. GPU memeroy - PADDLE_ENFORCE_GT( + PADDLE_ENFORCE_GE( config.fraction_of_gpu_memory, 0.f, "fraction_of_gpu_memory in the config should be set to range (0., 1.]"); PADDLE_ENFORCE_GE(config.device, 0, "Invalid device id %d", config.device); diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 78396397397c3125c3990073d6b2887ebb477ff2..54895679ca37362c7267677af80274b8de95e296 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -295,7 +295,8 @@ TEST(inference_api_native, image_classification_gpu) { #endif TEST(PassBuilder, Delete) { - contrib::AnalysisConfig config(false); + contrib::AnalysisConfig config; + config.DisableGpu(); config.pass_builder()->DeletePass("attention_lstm_fuse_pass"); const auto& passes = config.pass_builder()->AllPasses(); auto it = std::find(passes.begin(), passes.end(), "attention_lstm_fuse_pass"); diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 61ecd7bce683e40bbf89a343bfdbaa2b7051ae73..30215e480f908f353f00cbc9077e6c057222423a 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -36,12 +36,11 @@ namespace demo { */ void Main() { std::unique_ptr predictor; - paddle::contrib::AnalysisConfig config(true); - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.device = 0; + paddle::contrib::AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(FLAGS_modeldir + "/__params__", + FLAGS_modeldir + "/__model__"); config.EnableTensorRtEngine(); - config.fraction_of_gpu_memory = 0.1; // set by yourself predictor = CreatePaddlePredictor(config); VLOG(3) << "begin to process data"; diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index bc8891455dc8e4a30ddfcc5f89792296e59c2548..5320992b7e78f4aa0ea8950af03038c1953dd027 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -40,15 +40,14 @@ using contrib::AnalysisConfig; */ void Main(bool use_gpu) { std::unique_ptr predictor, analysis_predictor; - AnalysisConfig config(use_gpu); - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.device = 0; - if (FLAGS_use_gpu) { - config.fraction_of_gpu_memory = 0.1; // set by yourself + AnalysisConfig config; + if (use_gpu) { + config.EnableUseGpu(100, 0); } + config.SetModel(FLAGS_modeldir + "/__model__", + FLAGS_modeldir + "/__params__"); - predictor = CreatePaddlePredictor(config); + predictor = CreatePaddlePredictor(config.ToNativeConfig()); analysis_predictor = CreatePaddlePredictor(config); // Just a single batch of data. diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e7ccea6587a250d9d931fa0e85146e32af714d26..2d61098f933f9e391bc7b2bee9f8fd8518302168 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -34,26 +34,67 @@ class AnalysisPredictor; namespace contrib { // NOTE WIP, not stable yet. -struct AnalysisConfig : public NativeConfig { - explicit AnalysisConfig(bool use_gpu = false); +struct AnalysisConfig { + AnalysisConfig() = default; explicit AnalysisConfig(const AnalysisConfig& other); - explicit AnalysisConfig(AnalysisConfig&& other); + explicit AnalysisConfig(const std::string& model_dir); + explicit AnalysisConfig(const std::string& prog_file, + const std::string& params_file); + + // Model path related. + void SetModel(const std::string& model_dir) { model_dir_ = model_dir; } + void SetModel(const std::string& prog_file_path, + const std::string& params_file_path); + void SetProgFile(const std::string& x) { prog_file_ = x; } + void SetParamsFile(const std::string& x) { params_file_ = x; } + const std::string& model_dir() const { return model_dir_; } + const std::string& prog_file() const { return prog_file_; } + const std::string& params_file() const { return params_file_; } + + // GPU related. + void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0); + void DisableGpu(); + bool use_gpu() const { return use_gpu_; } + int gpu_device_id() const { return device_id_; } + int memory_pool_init_size_mb() const { return memory_pool_init_size_mb_; } + float fraction_of_gpu_memory_for_pool() const; // Determine whether to perform graph optimization. - bool enable_ir_optim = true; + void SwitchIrOptim(int x = true) { enable_ir_optim_ = x; } + bool ir_optim() const { return enable_ir_optim_; } - // Get a pass builder for customize the passes in IR analysis phase. - PassStrategy* pass_builder() const; + void SwitchUseFeedFetchOps(int x = true) { use_feed_fetch_ops_ = x; } + bool use_feed_fetch_ops_enabled() const { return use_feed_fetch_ops_; } - // NOT stable yet. - bool use_feed_fetch_ops{true}; + void SwitchSpecifyInputNames(bool x = true) { specify_input_name_ = x; } + bool specify_input_name() const { return specify_input_name_; } void EnableTensorRtEngine(int workspace_size = 1 << 20, int max_batch_size = 1, int min_subgraph_size = 3); - bool use_tensorrt() const { return use_tensorrt_; } + bool tensorrt_engine_enabled() const { return use_tensorrt_; } + + void SwitchIrDebug(int x = true) { ir_debug_ = x; } void EnableMKLDNN(); - bool use_mkldnn() const { return use_mkldnn_; } + bool mkldnn_enabled() const { return use_mkldnn_; } + + // Set and get the number of cpu math library threads. + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads); + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; + } + + NativeConfig ToNativeConfig() const { + NativeConfig config; + config.model_dir = model_dir_; + config.prog_file = prog_file_; + config.param_file = params_file_; + config.use_gpu = use_gpu_; + config.device = device_id_; + config.fraction_of_gpu_memory = fraction_of_gpu_memory_for_pool(); + config.specify_input_name = specify_input_name_; + return config; + } void SetMKLDNNOp(std::unordered_set op_list) { mkldnn_enabled_op_types_ = op_list; } @@ -65,10 +106,29 @@ struct AnalysisConfig : public NativeConfig { friend class ::paddle::AnalysisPredictor; + // NOTE just for developer, not an official API, easily to be broken. + // Get a pass builder for customize the passes in IR analysis phase. + PassStrategy* pass_builder() const; + + protected: + // Update the config. + void Update(); + + std::string SerializeInfoCache(); + protected: + // Model pathes. + std::string model_dir_; + std::string prog_file_; + std::string params_file_; + + // GPU releated. + bool use_gpu_{false}; + int device_id_{0}; + uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. + + // TensorRT releated. bool use_tensorrt_{false}; - bool use_mkldnn_{false}; - std::unordered_set mkldnn_enabled_op_types_; // For workspace_size, refer it from here: // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting int tensorrt_workspace_size_; @@ -82,17 +142,24 @@ struct AnalysisConfig : public NativeConfig { // We set this variable to control the minimum number of nodes in the // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; - std::unique_ptr pass_builder_; + + bool use_mkldnn_{false}; + std::unordered_set mkldnn_enabled_op_types_; + bool model_from_memory_{false}; -}; -// Configurations for Anakin engine. -struct AnakinConfig : public PaddlePredictor::Config { - enum TargetType { NVGPU = 0, X86 }; - int device; - std::string model_file; - int max_batch_size{-1}; - TargetType target_type; + bool enable_ir_optim_{true}; + bool use_feed_fetch_ops_{true}; + bool ir_debug_{false}; + + bool specify_input_name_{false}; + + int cpu_math_library_num_threads_{1}; + + // A runtime cache, shouldn't be transferred to others. + std::string serialized_info_cache_; + + mutable std::unique_ptr pass_builder_; }; } // namespace contrib diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 92fb51d647cf4e2c8a4914d8df2e8b7b6318d1d1..1785bd520a17d5f5060d789b2e4e4f1eda26aa6a 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -26,9 +26,8 @@ limitations under the License. */ #include #include -#include "paddle_api.h" // NOLINT -#ifndef WITH_ANAKIN #include "paddle_analysis_config.h" // NOLINT -#else +#include "paddle_api.h" // NOLINT +#ifdef WITH_ANAKIN #include "paddle_anakin_config.h" // NOLINT #endif diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 1062ac5f58b90d8649dae8bacc9ce154b8b9d844..b4cbc40e0fa0313412af0acb5a4cc620d9a8ae50 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -62,7 +62,12 @@ class PassStrategy : public PaddlePassBuilder { // still some CPU kernels running in CPU mode. virtual void EnableMKLDNN() = 0; + bool use_gpu() const { return use_gpu_; } + virtual ~PassStrategy() = default; + + protected: + bool use_gpu_{false}; }; /* @@ -88,6 +93,7 @@ class CpuPassStrategy : public PassStrategy { "conv_eltwiseadd_bn_fuse_pass", // "is_test_pass", // }); + use_gpu_ = false; } virtual ~CpuPassStrategy() = default; @@ -126,10 +132,14 @@ class GpuPassStrategy : public PassStrategy { "conv_elementwise_add2_act_fuse_pass", // "conv_elementwise_add_fuse_pass", // }); + + use_gpu_ = true; } GpuPassStrategy(const GpuPassStrategy &other) - : PassStrategy(other.AllPasses()) {} + : PassStrategy(other.AllPasses()) { + use_gpu_ = true; + } void EnableMKLDNN() override; diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 17f6c6d9f10abf99fd93364d1356e2b3ef1b3934..9afeafd176c70bc03166ec7732ae5e2faf67ea54 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -1,4 +1,5 @@ nv_library(tensorrt_engine SRCS engine.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) +nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto) nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader) nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine) add_subdirectory(plugin) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc new file mode 100644 index 0000000000000000000000000000000000000000..9fecad6eb3889f48f2e0012a718ed0d04f34ae66 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -0,0 +1,49 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/op_teller.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +// Just tell by the op_types. +struct SimpleOpTypeSetTeller : public Teller { + SimpleOpTypeSetTeller() {} + + bool operator()(const std::string& op_type, + const framework::OpDesc& desc) override { + return teller_set.count(op_type); + } + + private: + std::unordered_set teller_set{ + {"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "elementwise_mul", "dropout", "split", "prelu", + "conv2d_transpose", "leaky_relu"}}; +}; + +bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc) { + for (auto& teller : tellers_) { + if ((*teller)(op_type, desc)) return true; + } + return false; +} + +OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); } + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h new file mode 100644 index 0000000000000000000000000000000000000000..b98f052bf2478098d74f19858ec79823d5ab1e2d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -0,0 +1,68 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_desc.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Single Op teller definition. + * One can override this and define a more complex tell logic, considerring more + * issues such as op_desc. + */ +struct Teller { + virtual bool operator()(const std::string& op_type, + const framework::OpDesc& desc) = 0; + + virtual ~Teller() = default; +}; +/* + * A real example: + * + * struct SomeTeller : public Teller { + * bool operator()(const std::string& op_type, + * const framework::OpDesc& desc) override { + * return op_type == "fc" && desc.Inputs().size() == 2; + * } + *}; + */ + +/* + * class OpTeller helps to tell whether a fluid + * operator can be transformed to a TensorRT layer. + */ +class OpTeller { + public: + static OpTeller& Global() { + static std::unique_ptr x(new OpTeller); + return *x; + } + + bool Tell(const std::string& op_type, const framework::OpDesc& desc); + + private: + OpTeller(); + + private: + std::vector> tellers_; +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index a1a79c68855686d31d7174d929d199d266608ba0..131712ca88370aa977184fcb00d09f2283db110c 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -41,7 +41,7 @@ endfunction() if(NOT APPLE AND WITH_MKLML) set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") download_model_and_data(${RNN1_INSTALL_DIR} "rnn1%2Fmodel.tar.gz" "rnn1%2Fdata.txt.tar.gz") - inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc) + inference_analysis_api_test(test_analyzer_rnn1 ${RNN1_INSTALL_DIR} analyzer_rnn1_tester.cc SERIAL) else() # TODO: fix this test on MACOS and OPENBLAS, the reason is that # fusion_seqexpand_concat_fc_op is not supported on MACOS and OPENBLAS @@ -56,14 +56,14 @@ inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2 # normal DAM set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") -inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc) +inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc SERIAL) # small DAM set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") download_model_and_data(${DAM_SMALL_INSTALL_DIR} "dam_small_model.tar.gz" "dam_small_data.txt.tar.gz") inference_analysis_test(test_analyzer_small_dam SRCS analyzer_dam_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1) + ARGS --infer_model=${DAM_SMALL_INSTALL_DIR}/model --infer_data=${DAM_SMALL_INSTALL_DIR}/data.txt --max_turn_num=1 SERIAL) # chinese_ner set(CHINESE_NER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/chinese_ner") @@ -111,11 +111,11 @@ inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 - "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") + "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz" SERIAL) # mobilenet with depthwise_conv op inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv - "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") + "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz" SERIAL) # anakin if (WITH_ANAKIN AND WITH_MKL) # only needed in CI diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index 12d61d06ce188a2478448373427f2defae5a2524..5ad6e4a8570b309e94375234d673e27698999cb7 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -165,12 +165,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(contrib::AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(true); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc index 2213971c1764b1a0bddfce5830bbdf2ffedd61ee..b9666e01adb23e0cbd9257bc55081c3a5001e887 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc @@ -105,11 +105,10 @@ void GetOneBatch(std::vector *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc index 9d3c7519430522878ace697ea5ed38aebb6b0855..1318fbcbc4022457354fb34c727cf56ce26e12ec 100644 --- a/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_mm_dnn_tester.cc @@ -76,11 +76,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(contrib::AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 04f8b3ffe894c7df0fb0c95e94a92b4f216f02de..6fef79dc4608acd6eee679ad4939e7684db98f5b 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -84,13 +84,12 @@ void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) { cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0], buffer_param.size()); } else { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; + cfg->SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/param"); } - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 764ae5ed8506a7ed7dc51a5c36d0dd7e9df925f3..629981d565f1b6eeabc192287cb9f892df21b8e4 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -21,12 +21,10 @@ namespace inference { namespace analysis { void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/params"; - cfg->prog_file = FLAGS_infer_model + "/model"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchIrOptim(); + cfg->SwitchSpecifyInputNames(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 17f4587a5093a2f1cd2d8acc0e17f2129ad36353..3c52afbfb8f60b1e9389d416a5640c9685d8e764 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -204,12 +204,10 @@ void PrepareZeroCopyInputs(ZeroCopyTensor *lod_attention_tensor, } void SetConfig(AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { @@ -225,10 +223,10 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_rnn1, profile) { - contrib::AnalysisConfig cfg(false); + contrib::AnalysisConfig cfg; SetConfig(&cfg); - cfg.fraction_of_gpu_memory = 0.1; - cfg.pass_builder()->TurnOnDebug(); + cfg.DisableGpu(); + cfg.SwitchIrDebug(); std::vector outputs; std::vector> input_slots_all; @@ -293,16 +291,18 @@ TEST(Analyzer_rnn1, multi_thread) { TEST(Analyzer_rnn1, ZeroCopy) { AnalysisConfig config; SetConfig(&config); - config.use_feed_fetch_ops = false; + config.SwitchUseFeedFetchOps(false); PaddlePlace place; auto predictor = CreatePaddlePredictor(config); - config.use_feed_fetch_ops = true; - auto native_predictor = CreatePaddlePredictor(config); + config.SwitchUseFeedFetchOps(true); + auto native_predictor = + CreatePaddlePredictor(config.ToNativeConfig()); - config.use_feed_fetch_ops = true; // the analysis predictor needs feed/fetch. + config.SwitchUseFeedFetchOps( + true); // the analysis predictor needs feed/fetch. auto analysis_predictor = CreatePaddlePredictor(config); #define NEW_TENSOR(name__) \ @@ -362,7 +362,7 @@ TEST(Analyzer_rnn1, ZeroCopy) { TEST(Analyzer_rnn1, ZeroCopyMultiThread) { AnalysisConfig config; SetConfig(&config); - config.use_feed_fetch_ops = false; + config.SwitchUseFeedFetchOps(false); #define NEW_TENSOR(name__) \ auto name__##_tensor = predictor->GetInputTensor(#name__); diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc index f8354e76871e7f489fd21f2f74e7402db01845c3..007f9f0b66a7b276f5f2e8500a3001788ad41e79 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn2_tester.cc @@ -105,12 +105,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", FLAGS_infer_model + "/param"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index e6d6cd2960b394e8cd20b473bed90ce511f806be..47c1d7375843e4bad212c1d7d621c9e6d45e5982 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -89,11 +89,10 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc index 1c251e0c22f1ec88f0e59c71d623e4e0585db795..a1742f606819334e7b15e644f8b9e330795bf16e 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc @@ -122,12 +122,9 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data) { } void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/params"; - cfg->prog_file = FLAGS_infer_model + "/model"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); cfg->pass_builder()->TurnOnDebug(); cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc index 79f3c81ade450fa00419b652042b2cfc79b08e4c..7b448a3200351f902df277f7a653cf7114becba0 100644 --- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc @@ -47,11 +47,10 @@ struct DataReader { }; void SetConfig(AnalysisConfig *cfg) { - cfg->model_dir = FLAGS_infer_model; - cfg->use_gpu = false; - cfg->device = 0; - cfg->specify_input_name = true; - cfg->enable_ir_optim = true; + cfg->SetModel(FLAGS_infer_model); + cfg->DisableGpu(); + cfg->SwitchSpecifyInputNames(); + cfg->SwitchIrOptim(); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index d73bccefd5fc8a8ad8679b7de3feac50f786daed..5a77b53a8513cdbef5620d36ba5e0722ae993916 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -51,12 +51,11 @@ Record ProcessALine(const std::string &line) { } void SetConfig(AnalysisConfig *cfg) { - cfg->param_file = FLAGS_infer_model + "/__params__"; - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->use_gpu = false; - cfg->device = 0; - cfg->enable_ir_optim = true; - cfg->specify_input_name = true; + cfg->SetModel(FLAGS_infer_model + "/__model__", + FLAGS_infer_model + "/__params__"); + cfg->DisableGpu(); + cfg->SwitchIrDebug(); + cfg->SwitchSpecifyInputNames(); // TODO(TJ): fix fusion gru cfg->pass_builder()->DeletePass("fc_gru_fuse_pass"); } diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index 7046bce303e2bd46197ab512ae273500b9af88bf..cf0f1d5c18c79e34c96d4301dbf13c924ae2a3f0 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -64,19 +64,23 @@ std::ostream &operator<<(std::ostream &os, num_spaces++; os << *reinterpret_cast(&config); if (!config.model_from_memory()) { - os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; - os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; + os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file() << "\n"; + os << GenSpaces(num_spaces) << "param_file: " << config.params_file() + << "\n"; } else { os << GenSpaces(num_spaces) << "prog_file and param_file: load from memory \n"; } - os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() << "\n"; + os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() + << "\n"; + os << GenSpaces(num_spaces) + << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n"; os << GenSpaces(num_spaces) - << "use_feed_fetch_ops: " << config.use_feed_fetch_ops << "\n"; - os << GenSpaces(num_spaces) << "use_tensorrt: " << config.use_tensorrt() + << "use_tensorrt: " << config.tensorrt_engine_enabled() << "\n"; + os << GenSpaces(num_spaces) << "use_mkldnn: " << config.mkldnn_enabled() << "\n"; - os << GenSpaces(num_spaces) << "use_mkldnn: " << config.use_mkldnn() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 7eb44d9f4ea6e27a504984eac4f960bddc9032e1..41d033df85811a4730cab8b3275aaffd1ba338e5 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -328,7 +328,10 @@ void CompareNativeAndAnalysis( const std::vector> &inputs) { PrintConfig(config, true); std::vector native_outputs, analysis_outputs; - TestOneThreadPrediction(config, inputs, &native_outputs, false); + const auto *analysis_config = + reinterpret_cast(config); + auto native_config = analysis_config->ToNativeConfig(); + TestOneThreadPrediction(&native_config, inputs, &native_outputs, false); TestOneThreadPrediction(config, inputs, &analysis_outputs, true); CompareResult(analysis_outputs, native_outputs); } diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index d3bd035c1c49c926fc9f5ed83085b2e6d9ca8c93..21df6eab814dad5e2f654bf6d9558a2f9859d5ae 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -46,22 +46,20 @@ void SetConfig(contrib::AnalysisConfig* config, std::string model_dir, bool use_gpu, bool use_tensorrt, int batch_size) { if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) { - config->prog_file = model_dir + "/" + FLAGS_prog_filename; - config->param_file = model_dir + "/" + FLAGS_param_filename; + config->SetModel(model_dir + "/" + FLAGS_prog_filename, + model_dir + "/" + FLAGS_param_filename); } else { - config->model_dir = model_dir; + config->SetModel(model_dir); } if (use_gpu) { - config->use_gpu = true; - config->device = 0; - config->fraction_of_gpu_memory = 0.15; + config->EnableUseGpu(100, 0); if (use_tensorrt) { config->EnableTensorRtEngine(1 << 10, batch_size); config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->TurnOnDebug(); } else { - config->enable_ir_optim = true; + config->SwitchIrOptim(); } } } @@ -77,7 +75,8 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { std::vector outputs; if (use_analysis || use_tensorrt) { - contrib::AnalysisConfig config(true); + contrib::AnalysisConfig config; + config.EnableUseGpu(100, 0); config.pass_builder()->TurnOnDebug(); SetConfig(&config, model_dir, true, use_tensorrt, FLAGS_batch_size); @@ -109,7 +108,8 @@ void compare(std::string model_dir, bool use_tensorrt) { &native_outputs, false); std::vector analysis_outputs; - contrib::AnalysisConfig analysis_config(true); + contrib::AnalysisConfig analysis_config; + analysis_config.EnableUseGpu(50, 0); SetConfig(&analysis_config, model_dir, true, use_tensorrt, FLAGS_batch_size); TestOneThreadPrediction( @@ -154,9 +154,9 @@ TEST(TensorRT_mobilenet, analysis) { TEST(AnalysisPredictor, use_gpu) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; - AnalysisConfig config(true); - config.model_dir = model_dir; - config.fraction_of_gpu_memory = 0.15; + AnalysisConfig config; + config.EnableUseGpu(100, 0); + config.SetModel(model_dir); config.pass_builder()->TurnOnDebug(); std::vector> inputs_all; diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index ee154207754f20791bdba55d0d66aea600e8dee6..e53a6a562ad1ed2ca02683b07cf6d4b56bc2cde7 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -53,7 +53,7 @@ if (WITH_GPU) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() # conv_fusion_op needs cudnn 7 above - if (NOT ${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) op_library(conv_fusion_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") endif() diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 8c116c4abfe42296b616dc536821e9be55a8be84..03d9d466c3238c6c853bca75f5b9791a0841ff78 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/conv_op.h" @@ -68,13 +69,22 @@ inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format, } } -template +template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + bool is_INT8 = + std::is_same::value || std::is_same::value; + if (!is_INT8) { + ComputeFP32(ctx); + } else { + ComputeINT8(ctx); + } + } + void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr("is_test"); auto& dev_ctx = @@ -274,6 +284,271 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory_p)); } + void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { + const bool is_test = ctx.Attr("is_test"); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); + + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, + "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); + PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, + "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"); + if (bias) { + PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && + bias->format() != memory::format::format_undef, + "Wrong layout/format set for Bias tensor"); + PADDLE_ENFORCE(bias->dims().size() == 1, + "Bias must only have 1 dimension, i.e. X"); + } + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + + bool fuse_relu = ctx.Attr("fuse_relu"); + + bool force_fp32_output = ctx.Attr("force_fp32_output"); + + bool is_conv3d = strides.size() == 3U; + // TODO(tpatejko): add support for dilation + PADDLE_ENFORCE( + is_conv3d + ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && + dilations[2] == 1 + : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + "dilation in convolution is not implemented yet"); + + PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently"); + + const T* input_data = input->data(); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector weights_tz = + paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + GetWeightsTz(weights_tz, g, is_conv3d); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + mkldnn::memory::data_type src_dt = + paddle::framework::ToMKLDNNDataType(input->type()); + auto dst_dt = fuse_relu ? paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType) + : paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); + + if (force_fp32_output) { + dst_dt = paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); + } + + // Get unique name for storing MKLDNN primitives + std::string key; + key.reserve(MaxKeyLength); + platform::ConvMKLDNNHandler::AppendKey( + &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, + input->format(), dst_dt, ctx.op().Output("Output")); + const std::string key_conv_pd = key + "@conv_pd"; + + std::shared_ptr conv_p = nullptr; + std::shared_ptr src_memory_p = nullptr; + std::shared_ptr user_src_memory_p = nullptr; + std::shared_ptr dst_memory_p = nullptr; + std::vector pipeline; + std::shared_ptr conv_pd = + nullptr; + std::shared_ptr handler = nullptr; + + auto prim_key = key + "@conv_p"; + auto dst_key = key + "@dst_mem_p"; + auto src_key = key + "@src_mem_p"; + auto user_src_key = key + "@user_src_mem_p"; + auto src_reorder_key = key + "@src_mem_preorder_p"; + conv_p = std::static_pointer_cast( + dev_ctx.GetBlob(prim_key)); + if (conv_p == nullptr || !is_test) { + const K* filter_data = filter->data(); + auto scale_in_data = ctx.Attr("Scale_in"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + auto scale_out_data = + force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + + bool is_multi_channel = scale_weights_data.size() > 1; + + int count = is_multi_channel ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] + : (weights_tz)[0]) + : 1; + std::vector output_shift_scale(count); +#pragma omp parallel for if (count > 1) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + output_shift_scale[i] = + scale_out_data; // weights data will contain 0 + // in some models, then weights + // scale couldn't be calculated + else + output_shift_scale[i] = + scale_out_data / (scale_in_data * scale_weights_data[i]); + } + + auto user_src_md = + platform::MKLDNNMemDesc({src_tz}, src_dt, input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), + ((g) == 1) ? mkldnn::memory::format::oihw + : mkldnn::memory::format::goihw); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + + std::vector bias_tz; + + auto src_md = + platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::s8, chosen_memory_format); + auto dst_md = + platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); + // create a conv primitive descriptor and save it for usage in backward + if (bias) { + bias_tz = paddle::framework::vectorize2int(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, + memory::format::x); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + fuse_relu, output_shift_scale, is_test); + } else { + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, + paddings, mkldnn_engine, fuse_relu, + output_shift_scale, is_test); + } + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, + mkldnn_engine, key)); + + // create mkldnn memory from input tensors (data/weights) + user_src_memory_p = + handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler->AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); + + // create reorder primitive if the input format is not the preferred one + src_memory_p = + handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + + std::shared_ptr weights_memory_p; + int mask_reorder = + is_multi_channel ? ((g != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; + weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test, true, scale_weights_data, + mask_reorder); + + if (!force_fp32_output) { + if (fuse_relu) { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } else { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } + } else { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } + + // create convolution op primitive + auto scale_bias_key = key + "@scale_bias"; + if (bias) { + const float* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + auto user_bias_memory_p = handler->AcquireBiasMemory( + user_bias_md, to_void_cast(bias_data)); + std::shared_ptr bias_memory_p; + int mask_reorder = is_multi_channel ? 1 << 0 : 1; + int count = + is_multi_channel + ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector scale_bias_data(count); +#pragma omp parallel for if (count > 1) + for (int i = 0; i < count; i++) { + scale_bias_data[i] = scale_in_data * scale_weights_data[i]; + } + bias_memory_p = handler->AcquireBiasMemoryFromPrimitive( + user_bias_memory_p, pipeline, is_test, true, scale_bias_data, + mask_reorder); + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); + } + + // push primitive to stream and wait until it's executed + pipeline.push_back(*conv_p); + } else { + auto src_memory_reorder_p = std::static_pointer_cast( + dev_ctx.GetBlob(src_reorder_key)); + src_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(src_key)); + if (src_memory_reorder_p) { + user_src_memory_p = std::static_pointer_cast( + dev_ctx.GetBlob(user_src_key)); + user_src_memory_p->set_data_handle(to_void_cast(input_data)); + } else if (src_memory_p) { + src_memory_p->set_data_handle(to_void_cast(input_data)); + } + + dst_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); + conv_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); + if (conv_pd) { + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, + mkldnn_engine, key)); + } + if (!force_fp32_output) { + if (fuse_relu) { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } else { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } + } else { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } + if (src_memory_reorder_p) { + pipeline.push_back(*src_memory_reorder_p); + } + pipeline.push_back(*conv_p); + } + // push primitive to stream and wait until it's executed + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); + } private: mkldnn::primitive_attr CreatePostOps(bool fuse_relu, @@ -301,6 +576,23 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { return conv_attr; } + mkldnn::primitive_attr CreatePostOps( + bool fuse_relu, const std::vector output_shift_scale) const { + mkldnn::primitive_attr conv_attr; + mkldnn::post_ops post_operations; + int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; + conv_attr.set_output_scales(mask, output_shift_scale); + if (fuse_relu) { + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 1.0f; // beta + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + } + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector& strides, @@ -325,6 +617,33 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { p_conv_pd); } + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, const bool fuse_relu, + const std::vector output_shift_scale, + bool is_test) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto propagation = is_test ? mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto conv_desc = mkldnn::convolution_forward::desc( + propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims, + padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, output_shift_scale); + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); + + return std::unique_ptr( + p_conv_pd); + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& bias, const memory::desc& dst, @@ -349,6 +668,34 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { return std::unique_ptr( p_conv_pd); } + + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& bias, const memory::desc& dst, + const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, const bool fuse_relu, + const std::vector output_shift_scale, + bool is_test) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto propagation = is_test ? mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto conv_desc = mkldnn::convolution_forward::desc( + propagation, mkldnn::convolution_direct, src, weights, bias, dst, + stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = + CreatePostOps(fuse_relu, output_shift_scale); + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); + + return std::unique_ptr( + p_conv_pd); + } }; template @@ -555,7 +902,17 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, U8, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, S8, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, @@ -565,7 +922,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 8e0d2824953a372b96d5819be658636f9a3d78ba..c8b33b8932ddd3bb9706d5b555ca68df4560a31e 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( #endif auto input_data_type = ctx.Input("Input")->type(); - auto filter_data_type = ctx.Input("Filter")->type(); - PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, - "input and filter data type should be consistent"); - + if (input_data_type != framework::proto::VarType::INT8 && + input_data_type != framework::proto::VarType::UINT8) { + auto filter_data_type = ctx.Input("Filter")->type(); + PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, + "input and filter data type should be consistent"); + } if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN, "float16 can only be used when CUDNN is used"); @@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() { "whenever convolution output is as an input to residual " "connection.") .SetDefault(false); + AddAttr("Scale_in", + "Scale_in to be used for int8 input data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Scale_out", + "Scale_out to be used for int8 output data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Scale_in_eltwise", + "Scale_in_eltwise to be used for int8 eltwise input data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr>("Scale_weights", + "Scale_weights to be used for int8 weights data." + "Only used with MKL-DNN INT8.") + .SetDefault({1.0f}); + AddAttr("force_fp32_output", + "(bool, default false) Force INT8 kernel output FP32, only " + "used in MKL-DNN INT8") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") .SetDefault("AnyLayout"); + AddAttr("force_fp32_output", + "(bool, default false) Only used in mkldnn INT8 kernel") + .SetDefault(false); // TODO(dzhwinter): need to registered layout transform function AddAttr("workspace_size_MB", "Only used in cudnn kernel. workspace size for cudnn, in MB, " diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 2519f5e7acdb7828743c6e114adfe5e530058406..eaa288edc554d2b62eb67ca01ed2459a88772430 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/depthwise_conv.h" #include "paddle/fluid/operators/math/im2col.h" @@ -30,6 +29,7 @@ namespace operators { using Tensor = framework::Tensor; constexpr int kConvMKLDNNFP32 = 1; constexpr int kConvMKLDNNINT8 = 2; +constexpr int MaxKeyLength = 256; // Base convolution operator definations for other conv // like operators to reuse the implementation. @@ -158,10 +158,7 @@ class GemmConvKernel : public framework::OpKernel { // to call the matrix multiplication interface. Tensor col_matrix; if (is_expand) { - auto tmp_allocation_ptr = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - framework::product(col_shape) * sizeof(T)); - col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); + col = context.AllocateTmpTensor(col_shape, dev_ctx); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } @@ -293,10 +290,7 @@ class GemmConvGradKernel : public framework::OpKernel { // to call the matrix multiplication interface. Tensor col_matrix; if (is_expand) { - auto tmp_allocation_ptr = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate( - framework::product(col_shape) * sizeof(T)); - col = framework::GetTensor(std::move(tmp_allocation_ptr), col_shape); + col = context.AllocateTmpTensor(col_shape, dev_ctx); col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index acd5993154ed03f206f20082231feb5059ef32e1..6337a4837a64cef2ce0e7bae70d8ba5b8994958e 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -148,7 +148,7 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { // blockx is multiple of 32. int blockx = std::min( static_cast(((feature_width * num_priors + 31) >> 5) << 5), - 512L); + static_cast(512L)); int gridx = (feature_width * num_priors + blockx - 1) / blockx; dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.cc b/paddle/fluid/operators/distributed/parameter_prefetch.cc index a96dec10866c012ed903b956747638848b63e23f..c63d65348880ebb4085d83059d9fead6456216d7 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.cc +++ b/paddle/fluid/operators/distributed/parameter_prefetch.cc @@ -32,7 +32,7 @@ namespace paddle { namespace operators { namespace distributed { -using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor; using SelectedRows = framework::SelectedRows; using DDim = framework::DDim; @@ -117,6 +117,12 @@ static void MergeMultipleVarsIntoOneBySection( auto& id_tensor = scope->FindVar(id_name)->Get(); auto* out_tensor = scope->FindVar(out_name)->GetMutable(); + + PADDLE_ENFORCE_GT( + out_tensor->numel(), 0, + "When calling this method, the LoDTensor's numel must larger than zero. " + "Please check LoDTensor::Resize has been called first."); + auto* out_tensor_data = out_tensor->mutable_data(id_tensor.place()); bool is_on_cpu_place = true; @@ -138,7 +144,7 @@ static void MergeMultipleVarsIntoOneBySection( auto row_numel = dims[1]; - for (size_t i = 0; i < dims[0]; ++i) { + for (int64_t i = 0; i < dims[0]; ++i) { auto id = ids_in_this_section[i]; auto origin_id = id + abs_sections[section_idx]; auto& offsets = id_to_offset[origin_id]; @@ -172,8 +178,9 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context) { - auto& local_scope = context.scope().NewScope(); + const framework::ExecutionContext& context, + const framework::Scope& scope) { + auto& local_scope = scope.NewScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& cpu_ctx = *pool.Get(platform::CPUPlace()); @@ -190,11 +197,11 @@ void prefetch(const std::string& id_name, const std::string& out_name, out_var_names.push_back(out_name + "@" + epmap[i]); } - auto& id_tensor = local_scope.FindVar(id_name)->Get(); + auto& id_tensor = scope.FindVar(id_name)->Get(); std::vector ids_vector; if (platform::is_cpu_place(id_tensor.place())) { auto* id_data = id_tensor.data(); - for (size_t i = 0; i < id_tensor.numel(); ++i) { + for (int64_t i = 0; i < id_tensor.numel(); ++i) { ids_vector.push_back(id_data[i]); } } else { @@ -202,7 +209,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, PADDLE_THROW("paddle is not compiled with CUDA!"); #else auto cpu_place = platform::CPUPlace(); - framework::Tensor cpu_tensor; + framework::LoDTensor cpu_tensor; auto* cpu_tensor_data = cpu_tensor.mutable_data(id_tensor.dims(), cpu_place); auto stream = @@ -246,8 +253,7 @@ void prefetch(const std::string& id_name, const std::string& out_name, MergeMultipleVarsIntoOneBySection(id_name, ids_vector, out_name, out_var_names, height_sections, splited_ids, context, &local_scope, &actual_ctx); - - context.scope().DeleteScope(&local_scope); + scope.DeleteScope(&local_scope); } }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/parameter_prefetch.h b/paddle/fluid/operators/distributed/parameter_prefetch.h index 53b0fbfb51f60fa86351cca34fd1665c7802591b..2f850a0332256d458e79ed9da361c86eb8a2f780 100644 --- a/paddle/fluid/operators/distributed/parameter_prefetch.h +++ b/paddle/fluid/operators/distributed/parameter_prefetch.h @@ -27,7 +27,56 @@ void prefetch(const std::string& id_name, const std::string& out_name, const std::vector& table_names, const std::vector& epmap, const std::vector& height_sections, - const framework::ExecutionContext& context); + const framework::ExecutionContext& context, + const framework::Scope& scope); + +template +void prefetch_with_reconstruct(const std::string& id_name, + const std::string& out_name, + const std::vector& table_names, + const std::vector& epmap, + const std::vector& height_sections, + const framework::ExecutionContext& context, + const framework::Scope& scope, + framework::LoDTensor* original) { + prefetch(id_name, out_name, table_names, epmap, height_sections, context, + scope); + auto& out = scope.FindVar(out_name)->Get(); + auto& ids = scope.FindVar(id_name)->Get(); + auto* original_value = original->data(); + auto* out_value = out.data(); + size_t original_width = original->numel() / original->dims()[0]; + + bool is_on_cpu_place = true; + if (!platform::is_cpu_place(ids.place())) { + is_on_cpu_place = false; + } + if (is_on_cpu_place) { + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = + original_value + original_width * ids.data()[i]; + std::memcpy(original_row, out_rows, original_width * sizeof(T)); + } + } else { +#ifndef PADDLE_WITH_CUDA + PADDLE_THROW("paddle is not compiled with CUDA!"); +#else + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& actual_ctx = *pool.Get(context.GetPlace()); + for (int64_t i = 0; i < ids.numel(); i++) { + const T* out_rows = out_value + original_width * i; + T* original_row = + original_value + original_width * ids.data()[i]; + auto stream = + static_cast(&actual_ctx)->stream(); + memory::Copy(boost::get(ids.place()), original_row, + platform::CPUPlace(), out_rows, original_width * sizeof(T), + stream); + } +#endif + } +} }; // namespace distributed }; // namespace operators diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 2bddba7db2f1c1a4bf7a207d361d900ec625807f..42ab8e99662e1ec67b7a4061b274e84103a7d5b1 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -2,7 +2,9 @@ include(operators) register_operators(EXCLUDES fusion_transpose_flatten_concat_op fusion_conv_inception_op) if (WITH_GPU) op_library(fusion_transpose_flatten_concat_op) - op_library(fusion_conv_inception_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fusion_transpose_flatten_concat);\n") - file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n") + if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) + op_library(fusion_conv_inception_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_inception_fusion);\n") + endif() endif() diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..fe4c73f4723355d4b56d075423de29b45b9cd4e4 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -0,0 +1,194 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input W of FusedEmbeddingSeqPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input Ids of FusedEmbeddingSeqPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output of FusedEmbeddingSeqPoolOp should not be null."); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + const std::string& combiner = ctx->Attrs().Get("combiner"); + + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_GE(ids_dims.size(), 1, + "The dim size of the 'Ids' tensor must greater than 1."); + PADDLE_ENFORCE_EQ(ids_dims[ids_dims.size() - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + // we only support sum now + PADDLE_ENFORCE_EQ(combiner, "sum"); + + int64_t last_dim = table_dims[1]; + for (int i = 1; i != ids_dims.size(); ++i) { + last_dim *= ids_dims[i]; + } + + if (ctx->IsRuntime()) { + framework::Variable* ids_var = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); + const auto& ids_lod = ids_var->Get().lod(); + + // in run time, the LoD of ids must be 1 + PADDLE_ENFORCE(ids_lod.size(), 1u, + "The LoD level of Input(Ids) must be 1"); + PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty"); + + int64_t batch_size = ids_lod[0].size() - 1; + + // in run time, the shape from Ids -> output + // should be [seq_length, 1] -> [batch_size, embedding_size] + ctx->SetOutputDim("Out", framework::make_ddim({batch_size, last_dim})); + } else { + // in compile time, the lod level of ids must be 1 + framework::VarDesc* ids_desc = + boost::get(ctx->GetInputVarPtrs("Ids")[0]); + PADDLE_ENFORCE_EQ(ids_desc->GetLoDLevel(), 1); + + // in compile time, the shape from Ids -> output + // should be [-1, 1] -> [-1, embedding_size] + ctx->SetOutputDim("Out", framework::make_ddim({-1, last_dim})); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "The last dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("combiner", + "(string, default sum) " + "A string specifying the reduction op. Currently sum " + "are supported, sum computes the weighted sum of the " + "embedding results for each row.") + .SetDefault("sum"); + // NOTE(minqiyang): grad_inplace is an temporal attribute, + // please do NOT set this attribute in python layer. + AddAttr("grad_inplace", + "(boolean, default false) " + "If the grad op reuse the input's variable.") + .SetDefault(false); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update.") + .SetDefault(false); + AddComment(R"DOC( +FusedEmbeddingSeqPool Operator. + +Computes embeddings for the given ids and weights. + +This operator is used to perform lookups on the parameter W, +then computes the weighted sum of the lookups results for each row +and concatenated into a dense tensor. + +The input Ids should carry the LoD (Level of Details) information. +And the output will change the LoD information with input Ids. + +)DOC"); + } +}; + +class FusedEmbeddingSeqPoolOpGradDescMaker + : public framework::DefaultGradOpDescMaker { + using ::paddle::framework::DefaultGradOpDescMaker< + true>::DefaultGradOpDescMaker; + + protected: + virtual std::string GradOpType() const { + return "fused_embedding_seq_pool_grad"; + } +}; + +class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + auto table_dims = ctx->GetInputDim("W"); + ctx->SetOutputDim(framework::GradVarName("W"), table_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class FusedEmbeddingSeqPoolOpGradVarTypeInference + : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "fused_embedding_seq_pool_grad op " + << framework::GradVarName("W") << " is set to SelectedRows"; + block->Var(out_var_name) + ->SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + VLOG(3) << "fused_embedding_seq_pool_grad op " + << framework::GradVarName("W") << " is set to LoDTensor"; + block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); + } + block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_embedding_seq_pool, ops::FusedEmbeddingSeqPoolOp, + ops::FusedEmbeddingSeqPoolOpGradDescMaker, + ops::FusedEmbeddingSeqPoolOpMaker); +REGISTER_OPERATOR(fused_embedding_seq_pool_grad, + ops::FusedEmbeddingSeqPoolOpGrad, + ops::FusedEmbeddingSeqPoolOpGradVarTypeInference); + +REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool, + ops::FusedEmbeddingSeqPoolKernel, + ops::FusedEmbeddingSeqPoolKernel); +REGISTER_OP_CPU_KERNEL(fused_embedding_seq_pool_grad, + ops::FusedEmbeddingSeqPoolGradKernel, + ops::FusedEmbeddingSeqPoolGradKernel); diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..758432fd9e4197302e0bd8f76a1ca7c524026a70 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +template +struct EmbeddingVSumFunctor { + void operator()(const framework::ExecutionContext &context, + const LoDTensor *table_t, const LoDTensor *ids_t, + LoDTensor *output_t) { + auto *table = table_t->data(); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + int64_t last_dim = output_t->dims()[1]; + const int64_t *ids = ids_t->data(); + auto ids_lod = ids_t->lod()[0]; + int64_t ids_count = ids_t->numel() / ids_lod.back(); + + auto *output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { + size_t begin = ids_lod[i] * ids_count; + for (int64_t j = 0; j != ids_count; ++j) { + PADDLE_ENFORCE_LT(ids[begin], row_number); + PADDLE_ENFORCE_GE(ids[begin], 0, "ids %d", i); + blas.VCOPY(row_width, table + ids[begin + j] * row_width, + output + i * last_dim + j * row_width); + } + + for (int64_t r = (ids_lod[i] + 1) * ids_count; + r < ids_lod[i + 1] * ids_count; ++r) { + PADDLE_ENFORCE_LT(ids[r], row_number); + PADDLE_ENFORCE_GE(ids[r], 0, "ids %d", i); + blas.AXPY(row_width, 1., table + ids[r] * row_width, + output + i * last_dim + (r % ids_count) * row_width); + } + } + } +}; + +template +class FusedEmbeddingSeqPoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + const LoDTensor *ids_t = context.Input("Ids"); // int tensor + LoDTensor *output_t = context.Output("Out"); // float tensor + const LoDTensor *table_var = context.Input("W"); + const std::string &combiner_type = context.Attr("combiner"); + + if (combiner_type == "sum") { + EmbeddingVSumFunctor functor; + functor(context, table_var, ids_t, output_t); + } + } +}; + +template +class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *table_var = context.InputVar("W"); + DDim table_dim; + if (table_var->IsType()) { + table_dim = context.Input("W")->dims(); + } else if (table_var->IsType()) { + auto *table_t = context.Input("W"); + table_dim = table_t->value().dims(); + } else { + PADDLE_THROW( + "The parameter W of a LookupTable " + "must be either LoDTensor or SelectedRows"); + } + + bool is_sparse = context.Attr("is_sparse"); + // Since paddings are not trainable and fixed in forward, the gradient of + // paddings makes no sense and we don't deal with it in backward. + if (is_sparse) { + auto *ids = context.Input("Ids"); + auto *d_output = context.Input(framework::GradVarName("Out")); + auto *d_table = context.Output(framework::GradVarName("W")); + + auto *ids_data = ids->data(); + int64_t ids_num = ids->numel(); + auto lod = ids->lod()[0]; + int64_t row_width = d_output->dims()[1]; + + framework::Vector *new_rows = d_table->mutable_rows(); + new_rows->resize(ids_num); + std::memcpy(&(*new_rows)[0], ids_data, ids_num * sizeof(int64_t)); + + auto *d_table_value = d_table->mutable_value(); + d_table_value->Resize({ids_num, table_dim[1]}); + T *d_table_data = d_table_value->mutable_data(context.GetPlace()); + const T *d_output_data = d_output->data(); + + auto blas = math::GetBlas(context); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + int64_t h = static_cast(lod[i + 1] - lod[i]); + int64_t in_offset = lod[i] * row_width; + const T *out_pos = d_output_data + i * row_width; + T *in_pos = d_table_data + in_offset; + for (int r = 0; r != h; ++r) { + blas.VCOPY(row_width, out_pos, in_pos + r * row_width); + } + } + } else { + LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now"; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index 3349b0b31ebf6e266820b077011f4f4d11974e09..6e13887866485bd114ebf12f4bdfa8d60fca6d01 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -21,7 +21,7 @@ DECLARE_uint64(conv_workspace_size_limit); namespace paddle { namespace operators { -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 using Tensor = framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; @@ -264,7 +264,7 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -#if CUDNN_VERSION >= 7001 +#if CUDNN_VERSION >= 7100 namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL(conv2d_inception_fusion, ops::CUDNNConvInceptionFusionOpKernel, diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index a807117115763486a58052a6240cdedba6af9ac8..6ca6f0bc04aa696852ed7338dcb4b88a49b2fc81 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -67,6 +67,11 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null."); PADDLE_ENFORCE(ctx->HasOutput("PreOut"), "Output(PreOut) should not be null."); + auto with_prefetch = ctx->Attrs().Get("remote_prefetch"); + if (with_prefetch) { + PADDLE_ENFORCE(ctx->HasOutput("W_Out"), + "Output(W_Out) should not be null."); + } const int64_t batch_size = ctx->GetInputDim("X")[0]; std::vector output_shape({batch_size, 1}); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); @@ -95,7 +100,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Label", "(LoDTensor, required), The labels of training data. It's a" "tensor with shape [N, 1]."); - AddInput("PTable", + AddInput("PathTable", "(LoDTensor, optional), The Path Table from root to current word" "it should have shape like [N, L], L is the length of the Path") .AsDispensable(); @@ -119,8 +124,30 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { "[batch_size, code_length], where code_length represents the " "maximum path length from root to leaf nodes.") .AsIntermediate(); + AddOutput( + "W_Out", + "(LoDTensor, optinal) using input 'W' as Output to make it mutable" + "When we are using prefetch") + .AsIntermediate(); AddAttr("num_classes", "(int, optional), The number of classes") .SetDefault(2); + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); AddComment(R"DOC( The hierarchical sigmoid operator organize the classes into a binary tree. At each node, a sigmoid function is used to calculate the probability of @@ -189,23 +216,17 @@ class HierarchicalSigmoidGradOpGradVarTypeInference << " is set to SelectedRows"; block->Var(w_grad_var_name) ->SetType(framework::proto::VarType::SELECTED_ROWS); - if (hasBias) { - VLOG(30) << "hierarchical_sigmoid_grad op " - << framework::GradVarName("Bias") << " is set to SelectedRows"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::SELECTED_ROWS); - } } else { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; block->Var(w_grad_var_name) ->SetType(framework::proto::VarType::LOD_TENSOR); - if (hasBias) { - VLOG(30) << "hierarchical_sigmoid_grad op " - << framework::GradVarName("Bias") << " is set to LoDTensor"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::LOD_TENSOR); - } + } + if (hasBias) { + VLOG(30) << "hierarchical_sigmoid_grad op " + << framework::GradVarName("Bias") << " is set to LoDTensor"; + block->Var(bias_grad_var_name) + ->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType()); } diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index d212e6f8437e69e71c010b6af27a33ff5e39e1e1..1a7ca963010112bbcab69f1ceeb9cb8d19ca9b9e 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once #include +#include #include +#include #include #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" @@ -24,6 +26,10 @@ limitations under the License. */ #include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/fluid/platform/transform.h" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -34,8 +40,9 @@ using platform::Transform; static std::vector PathToRows(const framework::LoDTensor& path) { std::set rows; + const int64_t* paths = path.data(); for (int64_t i = 0; i < path.numel(); ++i) { - int64_t row = path.data()[i]; + int64_t row = paths[i]; if (row < 0) { continue; } @@ -49,13 +56,54 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& in = detail::Ref(ctx.Input("X")); auto& w = detail::Ref(ctx.Input("W")); - auto* path = ctx.Input("PTable"); + auto* path = ctx.Input("PathTable"); auto* code = ctx.Input("PathCode"); auto& label = detail::Ref(ctx.Input("Label")); auto* bias = ctx.Input("Bias"); auto* out = ctx.Output("Out"); auto* pre_out = ctx.Output("PreOut"); size_t num_classes = static_cast(ctx.Attr("num_classes")); + // for remote prefetch + + auto epmap = ctx.Attr>("epmap"); + if (!epmap.empty()) { + // if epmap is not empty, then the parameter will be fetched from remote + // parameter + // server + auto height_sections = ctx.Attr>("height_sections"); + auto table_names = ctx.Attr>("table_names"); + std::vector real_rows = PathToRows(*path); + framework::Scope& local_scope = ctx.scope().NewScope(); + auto* ids = local_scope.Var("Ids@Prefetch"); + auto* x_tensor = ids->GetMutable(); + + x_tensor->mutable_data( + framework::make_ddim({static_cast(real_rows.size()), 1}), + ctx.GetPlace()); + // copy. + + std::memcpy(x_tensor->data(), real_rows.data(), + real_rows.size() * sizeof(int64_t)); + + framework::DDim w_dims = ctx.Input("W")->dims(); + w_dims[0] = x_tensor->dims()[0]; + auto* w_tensor = + local_scope.Var("W@Prefetch")->GetMutable(); + w_tensor->Resize(w_dims); + +#ifdef PADDLE_WITH_DISTRIBUTE + // w_Out is set to used by prefetch, never change it in other cases + auto* w_out = ctx.Output("W_Out"); + operators::distributed::prefetch_with_reconstruct( + "Ids@Prefetch", "W@Prefetch", table_names, epmap, height_sections, + ctx, local_scope, w_out); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + } + bool is_custom = false; if (path) { is_custom = true; @@ -116,9 +164,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto& in = detail::Ref(ctx.Input("X")); auto& w = detail::Ref(ctx.Input("W")); - auto* path = ctx.Input("PTable"); + auto* path = ctx.Input("PathTable"); auto* code = ctx.Input("PathCode"); - auto* bias = ctx.Input("Bias"); auto* in_grad = ctx.Output(framework::GradVarName("X")); bool is_sparse = ctx.Attr("is_sparse"); @@ -173,15 +220,14 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { } // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to // be consistent with the clipping in forward. - + auto* bias_grad = + ctx.Output(framework::GradVarName("Bias")); + if (bias_grad) { + bias_grad->mutable_data(ctx.GetPlace()); + zero(dev_ctx, bias_grad, static_cast(0.0)); + bit_code->AddGrad(pre_out_grad, bias_grad); + } if (!is_sparse) { - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->mutable_data(ctx.GetPlace()); - zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } auto* w_grad = ctx.Output(framework::GradVarName("W")); w_grad->mutable_data(ctx.GetPlace()); @@ -200,21 +246,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); zero(dev_ctx, w_grad_value, static_cast(0.0)); - auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - if (bias_grad) { - bias_grad->set_rows(real_rows); - // build ids -> rows index map - bias_grad->SyncIndex(); - bias_grad->set_height(bias->dims()[0]); - auto* bias_grad_value = bias_grad->mutable_value(); - std::vector dims = {static_cast(real_rows.size()), - bias->dims()[1]}; - bias_grad_value->mutable_data(framework::make_ddim(dims), - ctx.GetPlace()); - zero(dev_ctx, bias_grad_value, static_cast(0.0)); - bit_code->AddGrad(pre_out_grad, bias_grad); - } bit_code->MulGradWeight(pre_out_grad, w_grad, in); } bit_code->MulGradError(pre_out_grad, w, in_grad); diff --git a/paddle/fluid/operators/huber_loss_op.h b/paddle/fluid/operators/huber_loss_op.h index 9efda3dfc9871f197e0a66329772df2caedc4da4..fa21bd01cb052e5393385f0ef6c0203fc7c9e1a3 100644 --- a/paddle/fluid/operators/huber_loss_op.h +++ b/paddle/fluid/operators/huber_loss_op.h @@ -105,14 +105,16 @@ class HuberLossGradKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); auto x_grad = EigenVector::Flatten(*out0); x_grad.device(place) = - out_grad * residual.unaryExpr(HuberLossBackward(delta, -1.0)); + residual.unaryExpr(HuberLossBackward(delta, -1.0)); + x_grad.device(place) = out_grad * x_grad; } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenVector::Flatten(*out1); y_grad.device(place) = - out_grad * residual.unaryExpr(HuberLossBackward(delta, 1.0)); + residual.unaryExpr(HuberLossBackward(delta, 1.0)); + y_grad.device(place) = out_grad * y_grad; } } }; diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index 998b7f09c3146dcdd57fda13d7834473693eaf9c..1da14631e35608d479e1b861228d52d6d57def79 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -230,10 +230,12 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { if (ctx->HasOutput(framework::GradVarName("Emission"))) { ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims); + ctx->ShareLoD("Emission", framework::GradVarName("Emission")); } if (ctx->HasOutput(framework::GradVarName("Transition"))) { ctx->SetOutputDim(framework::GradVarName("Transition"), transition_exps_dims); + ctx->ShareLoD("Transition", framework::GradVarName("Transition")); } } diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 6a0d6bad512fe7cc15e60ed25028bc3cbbbca2ab..fd15539f7b6727496988c9b13d0d2551659a420a 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -92,7 +92,8 @@ class LookupTableCUDAKernel : public framework::OpKernel { // server #ifdef PADDLE_WITH_DISTRIBUTE operators::distributed::prefetch(id_name, out_name, table_names, epmap, - height_sections, context); + height_sections, context, + context.scope()); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " diff --git a/paddle/fluid/operators/lookup_table_op.h b/paddle/fluid/operators/lookup_table_op.h index 3a73a7637c6d7d3eff7443802a4a52be9149e0ef..a7d0fd4856edc74237151c64f286d468ad86e7ca 100644 --- a/paddle/fluid/operators/lookup_table_op.h +++ b/paddle/fluid/operators/lookup_table_op.h @@ -59,7 +59,8 @@ class LookupTableKernel : public framework::OpKernel { // server #ifdef PADDLE_WITH_DISTRIBUTE operators::distributed::prefetch(id_name, out_name, table_names, epmap, - height_sections, context); + height_sections, context, + context.scope()); #else PADDLE_THROW( "paddle is not compiled with distribute support, can not do " diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 58f7be12ce6b5d447e93cf86c4954a86fccf48ef..d35073029a3440d8a17e383ce97fcfc582663888 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -62,19 +62,27 @@ struct CUBlas { cudaDataType_t Atype, int lda, const void *B, cudaDataType_t Btype, int ldb, const float *beta, void *C, cudaDataType_t Ctype, int ldc) { -// Because the gcc 4.8 doesn't expand template parameter pack that -// appears in a lambda-expression, I can not use template parameter pack -// here. + // Because the gcc 4.8 doesn't expand template parameter pack that + // appears in a lambda-expression, I can not use template parameter pack + // here. + auto cublas_call = [&]() { #if CUDA_VERSION >= 8000 - VLOG(5) << "use_tensor_op_math: " - << (dev_ctx->tensor_core_available() ? "True" : "False"); - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + VLOG(5) << "use_tensor_op_math: " + << (platform::TensorCoreAvailable() ? "True" : "False"); PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( - handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, - beta, C, Ctype, ldc)); - }); + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc)); #else - PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); + PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. + dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -162,24 +170,32 @@ struct CUBlas { cudaDataType_t Btype, int ldb, const void *beta, void *C, cudaDataType_t Ctype, int ldc, cudaDataType_t computeType) { + auto cublas_call = [&]() { #if CUDA_VERSION >= 8000 - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; #if CUDA_VERSION >= 9000 - bool use_tensor_op_math = dev_ctx->tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); #endif // CUDA_VERSION >= 9000 - dev_ctx->TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - handle, transa, transb, m, n, k, alpha, A, Atype, lda, B, Btype, ldb, - beta, C, Ctype, ldc, computeType, algo)); - }); + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); #else - PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); + PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. + dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -207,10 +223,9 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, CUDA_R_32F, N); } else { #endif // CUDA_VERSION >= 8000 - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, - lda, &beta, C, N); - }); + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, N); #if CUDA_VERSION >= 8000 } @@ -251,12 +266,9 @@ inline void Blas::GEMM( CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm - - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, - &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C, - N); - }); + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &h_alpha, h_B, ldb, h_A, lda, + &h_beta, h_C, N); #endif // CUDA_VERSION >= 8000 } @@ -280,10 +292,8 @@ void Blas::GEMM(bool transA, bool transB, int M, } else { #endif // CUDA_VERSION >= 8000 - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, - lda, &beta, C, ldc); - }); + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, ldc); #if CUDA_VERSION >= 8000 } @@ -301,19 +311,16 @@ inline void Blas::GEMM( cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM(handle, cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, ldc); - }); + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, + ldc); } template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::AXPY(handle, n, &alpha, x, 1, y, 1); - }); + CUBlas::AXPY(context_.cublas_handle(), n, &alpha, x, 1, y, 1); } template <> @@ -323,9 +330,8 @@ void Blas::GEMV(bool trans_a, int M, int N, T beta, T *C) const { cublasOperation_t cuTransA = !trans_a ? CUBLAS_OP_T : CUBLAS_OP_N; - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMV(handle, cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1); - }); + CUBlas::GEMV(context_.cublas_handle(), cuTransA, N, M, &alpha, A, N, B, 1, + &beta, C, 1); } template <> @@ -347,28 +353,28 @@ void Blas::BatchedGEMM( #if CUDA_VERSION >= 9010 if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; - bool use_tensor_op_math = context_.tensor_core_available(); - if (use_tensor_op_math) { - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } - VLOG(5) << "use_tensor_op_math: " - << (use_tensor_op_math ? "True" : "False"); - - context_.TensorCoreCublasCallIfAvailable([&](cublasHandle_t handle) { + auto cublas_call = [&]() { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( - handle, cuTransB, cuTransA, N, M, K, &alpha, B, CUDA_R_32F, ldb, - strideB, A, CUDA_R_32F, lda, strideA, &beta, C, CUDA_R_32F, ldc, - strideC, batchCount, CUDA_R_32F, algo)); - }); + context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, + CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); + }; + auto &dev_ctx = const_cast(context_); + dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); } else { #endif // CUDA_VERSION >= 9010 - context_.CublasCall([&](cublasHandle_t handle) { - CUBlas::GEMM_STRIDED_BATCH(handle, cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, strideB, A, lda, strideA, &beta, C, - ldc, strideC, batchCount); - }); + CUBlas::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, strideB, A, lda, + strideA, &beta, C, ldc, strideC, batchCount); #if CUDA_VERSION >= 9010 } diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index d55e832cc2d9a4a5e2cb7fe5cf451a1205601951..d6f51c6e5c693becb14ff0bac0088bb9dc2b2f55 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -84,41 +84,6 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, code_table_.apply_visitor(func); } -template -struct MatrixBitCodeFunctorSelectedRowsAddGrad - : public boost::static_visitor { - const framework::Tensor &tmat_; - framework::SelectedRows *vec_; - - MatrixBitCodeFunctorSelectedRowsAddGrad(const framework::Tensor &tmat, - framework::SelectedRows *vec) - : tmat_(tmat), vec_(vec) {} - - template - void operator()(const CodeTable &code_table) { - size_t batch_size = tmat_.dims()[0]; - size_t width = tmat_.dims()[1]; - auto *vec_data = vec_->mutable_value()->template data(); - auto *tmat_data = tmat_.data(); - for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table.get_code(i); - int code_length = code.get_length(); - for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); - int64_t row_index = vec_->GetIndexFromId(static_cast(index)); - vec_data[row_index] += tmat_data[i * width + j]; - } - } - } -}; - -template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor &tmat, - framework::SelectedRows *vec) { - MatrixBitCodeFunctorSelectedRowsAddGrad func(tmat, vec); - code_table_.apply_visitor(func); -} - template struct MatrixBitCodeFunctorSum : public boost::static_visitor { const framework::Tensor &tmat_; diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 01e4889d34ad6e409f1b8a9c4bf783800187e863..c399cb5d44aaa50fab00fd170c021c8c70eee990 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -124,11 +124,12 @@ class SimpleCode { template class CustomCode { public: - CustomCode(const framework::Tensor& ptable, const framework::Tensor& pcode, - const int64_t* ids, int index) { - seq_len_ = ptable.dims()[1]; - ptable_data_ = ptable.data() + seq_len_ * index; - pcode_data_ = pcode.data() + seq_len_ * index; + CustomCode(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids, + int index) { + seq_len_ = path_table.dims()[1]; + path_table_data_ = path_table.data() + seq_len_ * index; + path_code_data_ = path_code.data() + seq_len_ * index; } /** * Here the id of root should be 1 rather than 0, thus the encoding of class c @@ -139,25 +140,25 @@ class CustomCode { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - size_t calc_index(int bit) const { return ptable_data_[bit]; } - bool calc_bit(int bit) const { return pcode_data_[bit]; } + size_t calc_index(int bit) const { return path_table_data_[bit]; } + bool calc_bit(int bit) const { return path_code_data_[bit]; } // NOTE: this function is not thread-safe. int get_length() const { if (length_ < 0) { auto len = seq_len_; - length_ = - static_cast(std::find_if(ptable_data_, ptable_data_ + len, - [](const T& val) { return val < 0; }) - - ptable_data_); + length_ = static_cast( + std::find_if(path_table_data_, path_table_data_ + len, + [](const T& val) { return val < 0; }) - + path_table_data_); } return length_; } private: int64_t seq_len_; - const T* ptable_data_; - const T* pcode_data_; + const T* path_table_data_; + const T* path_code_data_; mutable int length_{-1}; }; @@ -181,9 +182,9 @@ class SimpleCodeTable { template class CustomCodeTable { public: - CustomCodeTable(const framework::Tensor& ptable, - const framework::Tensor& pcode, const int64_t* ids) - : ptable_(ptable), pcode_(pcode), ids_(ids) {} + CustomCodeTable(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids) + : ptable_(path_table), pcode_(path_code), ids_(ids) {} CustomCode get_code(int64_t code) const { return CustomCode(ptable_, pcode_, ids_, code); @@ -210,11 +211,11 @@ class MatrixBitCodeFunctor { ids_(ids), code_table_(SimpleCodeTable(num_classes, ids)) {} - MatrixBitCodeFunctor(const framework::Tensor& ptable, - const framework::Tensor& pcode, const int64_t* ids) - : num_classes_(static_cast(ptable.dims()[1])), + MatrixBitCodeFunctor(const framework::Tensor& path_table, + const framework::Tensor& path_code, const int64_t* ids) + : num_classes_(static_cast(path_table.dims()[1])), ids_(ids), - code_table_(CustomCodeTable(ptable, pcode, ids)) {} + code_table_(CustomCodeTable(path_table, path_code, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ @@ -225,11 +226,6 @@ class MatrixBitCodeFunctor { */ void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); - /* For selected rows For j < code_length - vec(0, index(i, j)) += tmat(i, j) - */ - void AddGrad(const framework::Tensor& tmat, framework::SelectedRows* vec); - /* For j < code_length sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) */ diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 784e07b5bd7f3836f3515c789f998ba1bf30f6e8..256da34912560ddf1f7e430e8543efe00e5885bc 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -153,6 +153,24 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("is_sparse", "(boolean, default false) Sparse update.") .SetDefault(false); + // for parameter prefetch + AddAttr("remote_prefetch", "").SetDefault(false); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({}); + AddAttr>( + "table_names", + "(string vector, the splited table names that will be fetched from " + "parameter server)" + "in the order of input variables for mapping") + .SetDefault({}); + AddAttr>("custom_neg_classes", "This attribute only be used in unitest. Classes " "in this list wiil be used as negative classes " @@ -222,24 +240,20 @@ class NCEOpGradVarTypeInference : public framework::VarTypeInference { void operator()(const framework::OpDesc &op_desc, framework::BlockDesc *block) const override { auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front(); - auto bias_grad = op_desc.Output(framework::GradVarName("Bias")).front(); auto attr = op_desc.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { - VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad + VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to SelectedRows"; block->Var(weight_grad) ->SetType(framework::proto::VarType::SELECTED_ROWS); - block->Var(bias_grad)->SetType(framework::proto::VarType::SELECTED_ROWS); } else { - VLOG(3) << "nce_op_grad op " << weight_grad << " and " << bias_grad + VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to LoDTensor"; block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR); - block->Var(bias_grad)->SetType(framework::proto::VarType::LOD_TENSOR); } block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType()); - block->Var(bias_grad)->SetDataType(block->Var("Input")->GetDataType()); } }; diff --git a/paddle/fluid/operators/nce_op.h b/paddle/fluid/operators/nce_op.h index f2ca6ec247fd1ea09b707c2eaaad0548c8aa5757..2c97eef096eb3d23273e362e658cb1b5fc808609 100644 --- a/paddle/fluid/operators/nce_op.h +++ b/paddle/fluid/operators/nce_op.h @@ -15,8 +15,10 @@ limitations under the License. */ #pragma once #include +#include #include #include +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -24,6 +26,10 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sampler.h" #include "unsupported/Eigen/CXX11/Tensor" +#ifdef PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/parameter_prefetch.h" +#endif + namespace paddle { namespace operators { @@ -43,7 +49,6 @@ void PrepareSamples(const framework::ExecutionContext &context, auto label = context.Input("Label"); const int64_t *label_data = label->data(); auto label_dims = label->dims(); - // int num_total_classes = context.Attr("num_total_classes"); // for unitest std::vector custom_neg_classes = context.Attr>("custom_neg_classes"); @@ -144,15 +149,82 @@ class NCEKernel : public framework::OpKernel { } // forward mul auto input_mat = EigenMatrix::From(*(context.Input("Input"))); - auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - Eigen::Tensor result = - (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * - weight_mat.chip(sample_labels_data[i], 0)) - .sum(); - sample_out_data[i] += result(0); - sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + + // for remote prefetch + auto epmap = context.Attr>("epmap"); + + if (!epmap.empty()) { + // if epmap is not empty, then the parameter will be fetched from remote + // parameter + // server + + std::vector labels; + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + labels.push_back(sample_labels_data[i]); + } + std::set st(labels.begin(), labels.end()); + labels.assign(st.begin(), st.end()); + + framework::Scope &local_scope = context.scope().NewScope(); + + auto height_sections = context.Attr>("height_sections"); + auto table_names = context.Attr>("table_names"); + + auto *ids = local_scope.Var("Ids@Prefetch"); + auto *x_tensor = ids->GetMutable(); + x_tensor->mutable_data( + framework::make_ddim({static_cast(labels.size()), 1}), + context.GetPlace()); + // copy. + std::memcpy(x_tensor->data(), labels.data(), + labels.size() * sizeof(int64_t)); + + std::vector w_dims = paddle::framework::vectorize2int( + context.Input("Weight")->dims()); + w_dims[0] = static_cast(labels.size()); + + auto *w_tensor = local_scope.Var("Weight@Prefetch") + ->GetMutable(); + w_tensor->Resize(framework::make_ddim(w_dims)); + +#ifdef PADDLE_WITH_DISTRIBUTE + operators::distributed::prefetch("Ids@Prefetch", "Weight@Prefetch", + table_names, epmap, height_sections, + context, local_scope); +#else + PADDLE_THROW( + "paddle is not compiled with distribute support, can not do " + "parameter prefetch!"); +#endif + + auto weight_mat = EigenMatrix::From( + (local_scope.Var("Weight@Prefetch")->Get())); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + std::vector::iterator it = + std::find(labels.begin(), labels.end(), sample_labels_data[i]); + int idx = std::distance(labels.begin(), it); + + Eigen::Tensor result = + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(idx, 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } + context.scope().DeleteScope(&local_scope); + } else { + auto weight_mat = + EigenMatrix::From(*(context.Input("Weight"))); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + Eigen::Tensor result = + (input_mat.chip(static_cast(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(sample_labels_data[i], 0)) + .sum(); + sample_out_data[i] += result(0); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); + } } + // forward cost for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { out_data[i] = 0; @@ -240,18 +312,19 @@ class NCEGradKernel : public framework::OpKernel { sample_grad_data[i] *= d_out_data[sample_idx]; } + // get d_bias + auto d_bias = context.Output(framework::GradVarName("Bias")); + if (d_bias != nullptr) { + T *d_bias_data = d_bias->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); + for (int64_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + } + } + bool is_sparse = context.Attr("is_sparse"); if (!is_sparse) { - // get d_bias - auto d_bias = context.Output(framework::GradVarName("Bias")); - if (d_bias != nullptr) { - T *d_bias_data = d_bias->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; - } - } // get d_w auto d_w = context.Output(framework::GradVarName("Weight")); if (d_w != nullptr) { @@ -273,34 +346,6 @@ class NCEGradKernel : public framework::OpKernel { std::set st(labels.begin(), labels.end()); labels.assign(st.begin(), st.end()); - auto *bias_var = context.InputVar("Bias"); - DDim bias_dim; - if (bias_var->IsType()) { - bias_dim = context.Input("Bias")->dims(); - } else if (bias_var->IsType()) { - auto *table_t = context.Input("Bias"); - bias_dim = table_t->value().dims(); - } else { - PADDLE_THROW( - "The parameter Bias of a NCE_OP " - "must be either LoDTensor or SelectedRows"); - } - - auto d_bias = - context.Output(framework::GradVarName("Bias")); - d_bias->set_rows(labels); - d_bias->set_height(bias_dim[0]); - - d_bias->mutable_value()->Resize( - {static_cast(labels.size()), bias_dim[1]}); - T *d_bias_data = - d_bias->mutable_value()->mutable_data(context.GetPlace()); - std::fill(d_bias_data, d_bias_data + labels.size(), 0.0); - for (int64_t i = 0; i < sample_labels->numel(); ++i) { - d_bias_data[d_bias->Index(sample_labels_data[i])] += - sample_grad_data[i]; - } - auto *table_var = context.InputVar("Weight"); DDim table_dim; if (table_var->IsType()) { diff --git a/paddle/fluid/operators/ngraph/ngraph_ops.h b/paddle/fluid/operators/ngraph/ngraph_ops.h index 8e7457dd56c2413f84008ce467537e07b3e80cc7..2a479081f1e40a4bdc3d80067e4a7d8ebc2bf550 100644 --- a/paddle/fluid/operators/ngraph/ngraph_ops.h +++ b/paddle/fluid/operators/ngraph/ngraph_ops.h @@ -23,5 +23,7 @@ limitations under the License. */ #include "ops/binary_unnary_op.h" #include "ops/fill_constant_op.h" +#include "ops/mean_op.h" #include "ops/mul_op.h" +#include "ops/scale_op.h" #include "ops/top_k_op.h" diff --git a/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h new file mode 100644 index 0000000000000000000000000000000000000000..15fbd58b02d2b13a8f5401f7cbe291da35748e83 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h @@ -0,0 +1,61 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_NGRAPH +#pragma once + +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +template +std::shared_ptr ElementwiseScalar( + float scale, std::shared_ptr node) { + auto node_shape = node->get_shape(); + auto scale_const = ngraph::op::Constant::create(node->get_element_type(), + node_shape, {scale}); + return std::make_shared(scale_const, node); +} + +template +std::shared_ptr ElementwiseScalar( + std::shared_ptr scale_1d, + std::shared_ptr node) { + auto scale_shape = scale_1d->get_shape(); + PADDLE_ENFORCE_EQ(scale_shape.size(), 1, "Supporting 1d scale node"); + PADDLE_ENFORCE_EQ(scale_shape.at(0), 1, "scale 1d in in shape {1}"); + + auto node_shape = node->get_shape(); + ngraph::AxisSet axis_set; + for (size_t i = 0; i < node_shape.size(); ++i) { + axis_set.insert(i); + } + node_shape.push_back(1); + + auto scale_bcast = + std::make_shared(scale_1d, node_shape, axis_set); + + auto scale_reshape = + paddle::platform::NgReshaper(scale_bcast, node->get_shape()); + + return std::make_shared(scale_reshape, node); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/ngraph/ops/mean_op.h b/paddle/fluid/operators/ngraph/ops/mean_op.h new file mode 100644 index 0000000000000000000000000000000000000000..7fcf8f09cd346db8cf6706014e0d4573ced7a86c --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/mean_op.h @@ -0,0 +1,68 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_NGRAPH +#pragma once + +#include +#include + +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildMeanNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto input = paddle::platform::GetInputNode(op, "X", ngb_node_map); + ngraph::AxisSet axes; + for (size_t i = 0; i < input->get_shape().size(); ++i) { + axes.insert(i); + } + + auto mean = ngraph::builder::mean(input, axes); + auto mean_1d = std::make_shared( + mean, ngraph::AxisVector{}, ngraph::Shape{1}); + paddle::platform::SetOutputNode(op, "Out", mean_1d, ngb_node_map); +} + +void BuildMeanGradNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto og = paddle::platform::GetInputNode(op, "Out@GRAD", ngb_node_map); + auto x_shape = x->get_shape(); + float x_size = std::accumulate(std::begin(x_shape), std::end(x_shape), 1, + std::multiplies()); + auto node_const = ngraph::op::Constant::create(og->get_element_type(), + ngraph::Shape{1}, {x_size}); + auto node_div = std::make_shared(og, node_const); + + auto result = ElementwiseScalar( + og / node_const, + ngraph::op::Constant::create(og->get_element_type(), x_shape, {0})); + paddle::platform::SetOutputNode(op, "X@GRAD", result, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/ngraph/ops/scale_op.h b/paddle/fluid/operators/ngraph/ops/scale_op.h new file mode 100644 index 0000000000000000000000000000000000000000..24ab0702aa50861b34fe1af7ccaf37d4e1dffc41 --- /dev/null +++ b/paddle/fluid/operators/ngraph/ops/scale_op.h @@ -0,0 +1,41 @@ +/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_NGRAPH +#pragma once + +#include +#include "ngraph/ngraph.hpp" +#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" +#include "paddle/fluid/platform/ngraph_helper.h" + +namespace paddle { +namespace operators { +namespace ngraphs { + +void BuildScaleNode( + const std::shared_ptr& op, + std::shared_ptr< + std::unordered_map>> + ngb_node_map) { + auto op_attrs = paddle::framework::AttrReader(op->Attrs()); + float scale = op_attrs.Get("scale"); + auto x = paddle::platform::GetInputNode(op, "X", ngb_node_map); + auto out = ElementwiseScalar(scale, x); + paddle::platform::SetOutputNode(op, "Out", out, ngb_node_map); +} +} // namespace ngraphs +} // namespace operators +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 5c559484ec95e794ebbbe0e713cb9e26b5c01b98..61b9384f8422cb531a94096875434ffe36ecdbce 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -424,16 +424,23 @@ class AdamOpKernel : public framework::OpKernel { } } + framework::SelectedRows cpu_grad_merge; const framework::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = &grad; } else { // merge duplicated rows if any. // The rows of grad_merge have been sorted inside MergeAdd functor + framework::SelectedRows* grad_merge_var; scatter::MergeAdd merge_func; - auto* grad_merge_var = const_cast(ctx.scope()) - .Var() - ->GetMutable(); + if (platform::is_cpu_place(ctx.GetPlace())) { + grad_merge_var = &cpu_grad_merge; + } else { + // FIXME(qiao): GPU also need to fix this + grad_merge_var = const_cast(ctx.scope()) + .Var() + ->GetMutable(); + } merge_func(ctx.template device_context(), grad, grad_merge_var, true); grad_merge_ptr = grad_merge_var; diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 7fc07efe7304701794595c9fa63f4a306d61e230..56879ffda5d3e04a88d12d6c4701c24a0d0ee4f7 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -49,7 +49,7 @@ void MonitorThread(std::vector* thread_status, class CTRReader : public framework::FileReader { public: explicit CTRReader(const std::shared_ptr& queue, - int batch_size, int thread_num, + int batch_size, size_t thread_num, const std::vector& slots, const std::vector& file_list) : batch_size_(batch_size), slots_(slots), file_list_(file_list) { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 05a0f14440732e5aef2ff665fbd3a5c1c7094581..1f51b5bab3068cc89bffa85de28a9438359659f3 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -100,7 +100,7 @@ ENDIF() nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) if(WITH_GPU) - nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) + nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator) else() - cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor) + cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator) endif() diff --git a/paddle/fluid/platform/cuda_helper.h b/paddle/fluid/platform/cuda_helper.h deleted file mode 100644 index 122de72e15d587cf33b5d9856ac8b1243f666881..0000000000000000000000000000000000000000 --- a/paddle/fluid/platform/cuda_helper.h +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include // NOLINT - -#include "paddle/fluid/platform/dynload/cublas.h" -#include "paddle/fluid/platform/macros.h" - -#if CUDA_VERSION < 9000 -enum cublasMath_t { CUBLAS_DEFAULT_MATH = 0 }; -#endif - -namespace paddle { -namespace platform { - -class CublasHandleHolder { - public: - CublasHandleHolder(cudaStream_t stream, cublasMath_t math_type) { - PADDLE_ENFORCE(dynload::cublasCreate(&handle_)); - PADDLE_ENFORCE(dynload::cublasSetStream(handle_, stream)); -#if CUDA_VERSION >= 9000 - if (math_type == CUBLAS_TENSOR_OP_MATH) { - PADDLE_ENFORCE( - dynload::cublasSetMathMode(handle_, CUBLAS_TENSOR_OP_MATH)); - } -#endif - } - - ~CublasHandleHolder() { PADDLE_ENFORCE(dynload::cublasDestroy(handle_)); } - - template - inline void Call(Callback &&callback) const { - std::lock_guard guard(mtx_); - callback(handle_); - } - - private: - DISABLE_COPY_AND_ASSIGN(CublasHandleHolder); - - cublasHandle_t handle_; - mutable std::mutex mtx_; -}; - -} // namespace platform -} // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index be7f4949d65cef36d61b726c1c656f177e298fcc..022afb686b29c2c493cfd05600ee372470cbc710 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -245,15 +245,8 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_->Reinitialize(&stream_, place); eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); - cublas_handle_.reset(new CublasHandleHolder(stream_, CUBLAS_DEFAULT_MATH)); - - if (TensorCoreAvailable()) { -#if CUDA_VERSION >= 9000 - cublas_tensor_core_handle_.reset( - new CublasHandleHolder(stream_, CUBLAS_TENSOR_OP_MATH)); -#endif - } - + PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); + PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); if (dynload::HasCUDNN()) { cudnn_holder_.reset(new CudnnHolder(&stream_, place)); } @@ -313,8 +306,7 @@ CUDADeviceContext::~CUDADeviceContext() { SetDeviceId(place_.device); Wait(); WaitStreamCallback(); - cublas_handle_.reset(); - cublas_tensor_core_handle_.reset(); + PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); @@ -343,8 +335,8 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { return eigen_device_.get(); } -bool CUDADeviceContext::tensor_core_available() const { - return cublas_tensor_core_handle_ != nullptr; +cublasHandle_t CUDADeviceContext::cublas_handle() const { + return cublas_handle_; } cudnnHandle_t CUDADeviceContext::cudnn_handle() const { diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index c81d17380cf894631d06588c007c2e11ce5c7836..7e875801893f3b73f8efaf33af690f8c855beee4 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -20,7 +20,6 @@ limitations under the License. */ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/temporary_allocator.h" #ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/platform/cuda_helper.h" #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/gpu_info.h" @@ -210,6 +209,39 @@ class CudnnWorkspaceHandle { std::unique_ptr> guard_; }; +#if CUDA_VERSION >= 9000 +class ScopedCublasMathMode { + public: + ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode) + : handle_(handle) { + need_reset = false; + PADDLE_ENFORCE( + platform::dynload::cublasGetMathMode(handle_, &old_math_mode_), + "Failed to get old cublas math mode"); + if (old_math_mode_ != new_math_mode) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, new_math_mode), + "Failed to set old cublas math mode"); + need_reset = true; + } + } + + ~ScopedCublasMathMode() { + if (need_reset) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, old_math_mode_), + "Failed to set old cublas math mode"); + } + } + + private: + cublasHandle_t handle_; + cublasMath_t old_math_mode_; + bool need_reset; +}; + +#endif + class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -230,25 +262,8 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return eigen device in the device context. */ Eigen::GpuDevice* eigen_device() const; - /*! \brief Call cublas function safely. */ - template - inline void CublasCall(Callback&& callback) const { - cublas_handle_->Call(std::forward(callback)); - } - - /*! \brief Check whether tensor core is supported */ - bool tensor_core_available() const; - - /*! \brief Call cublas function with Tensor Core safely. If - Tensor Core is not available, use DEFAULT_MATH instead. */ - template - inline void TensorCoreCublasCallIfAvailable(Callback&& callback) const { - if (cublas_tensor_core_handle_) { - cublas_tensor_core_handle_->Call(std::forward(callback)); - } else { - cublas_handle_->Call(std::forward(callback)); - } - } + /*! \brief Return cublas handle in the device context. */ + cublasHandle_t cublas_handle() const; /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; @@ -267,6 +282,7 @@ class CUDADeviceContext : public DeviceContext { template void RecordEvent(cudaEvent_t ev, Callback callback) { + std::lock_guard guard(mtx_); callback(); PADDLE_ENFORCE(cudaEventRecord(ev, stream_)); } @@ -278,6 +294,18 @@ class CUDADeviceContext : public DeviceContext { void WaitStreamCallback() const { callback_manager_->Wait(); } +#if CUDA_VERSION >= 9000 + /*! \brief CublasCall may need to change cublas's config, + * but the cublas may be hold by multi-thread, so we should + * add lock here. */ + template + void CublasCall(Callback callback, cublasMath_t new_math) { + std::lock_guard guard(cublas_mtx_); + ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math); + callback(); + } +#endif + private: CUDAPlace place_; @@ -285,9 +313,7 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_stream_; std::unique_ptr cudnn_holder_; cudaStream_t stream_; - - std::unique_ptr cublas_handle_; - std::unique_ptr cublas_tensor_core_handle_; + cublasHandle_t cublas_handle_; int compute_capability_; int runtime_version_; @@ -295,10 +321,12 @@ class CUDADeviceContext : public DeviceContext { int multi_process_; int max_threads_per_mp_; + mutable std::mutex mtx_; + // StreamCallbackManager is thread-safe std::unique_ptr callback_manager_; - DISABLE_COPY_AND_ASSIGN(CUDADeviceContext); + mutable std::mutex cublas_mtx_; }; template <> diff --git a/paddle/fluid/platform/device_context_test.cu b/paddle/fluid/platform/device_context_test.cu index 5b3aa98efb46b51d6c3edb6d2cbd4200bd0a35c6..171d2979a0218ad5e22112190a59866b3e0b617f 100644 --- a/paddle/fluid/platform/device_context_test.cu +++ b/paddle/fluid/platform/device_context_test.cu @@ -43,6 +43,9 @@ TEST(Device, CUDADeviceContext) { ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); ASSERT_NE(nullptr, cudnn_handle); + cublasHandle_t cublas_handle = device_context->cublas_handle(); + ASSERT_NE(nullptr, cublas_handle); + ASSERT_NE(nullptr, device_context->stream()); delete device_context; } } diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 584df85e80203c383a89954aac73dd1dcd723f7c..b3d20736a8e70d2f57ee5d6dc97cb490b5cfee44 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -145,7 +145,8 @@ class MKLDNNHandler { const std::shared_ptr user_memory_p, const std::string& suffix, std::vector& pipeline, // NOLINT - bool is_persistent = false) { + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, int mask = 0) { // create reorder primitive if the input format is not the preferred one auto local_key = key_ + suffix; auto key_reorder_p = key_ + suffix + "reorder_p"; @@ -159,8 +160,20 @@ class MKLDNNHandler { std::shared_ptr reorder_p; if (mpd != user_mpd) { target_memory_p = std::make_shared(mpd); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); + std::shared_ptr reorder_p; + if (is_INT8) { + mkldnn::primitive_attr + attri; // attribute for int8 weights and bias data reorder. + attri.set_output_scales(mask, scale_data); + + auto reorder_pd = std::shared_ptr( + new mkldnn::reorder::primitive_desc(user_mpd, mpd, attri)); + reorder_p = std::shared_ptr(new mkldnn::reorder( + *reorder_pd, *user_memory_p, *target_memory_p)); + } else { + reorder_p = std::make_shared(*user_memory_p, + *target_memory_p); + } dev_ctx_.SetBlob(key_reorder_p, reorder_p); pipeline.push_back(*reorder_p); } @@ -182,22 +195,58 @@ class MKLDNNHandler { return dims2str(operand_dims) + suffix; } - template + template static void SetDstMemory( const framework::ExecutionContext& ctx, framework::Tensor* output, std::vector dst_tz, const mkldnn::engine& engine, std::shared_ptr& dst_pd, // NOLINT std::shared_ptr& dst_memory) { // NOLINT - M* output_data = output->mutable_data(ctx.GetPlace()); + T* output_data = output->mutable_data(ctx.GetPlace()); auto dst_md = platform::MKLDNNMemDesc( {dst_tz}, paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType), + framework::DataTypeTrait::DataType), mkldnn::memory::format::nhwc); dst_pd.reset(new mkldnn::memory::primitive_desc(dst_md, engine)); - dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); + dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); + } + + static void AppendKey( + std::string* key, const mkldnn::memory::dims& input_dims, + const mkldnn::memory::dims& weights_dims, const std::vector& strides, + const std::vector& paddings, const std::vector& dilations, + const int& groups, const mkldnn::memory::data_type& srcdt, + const mkldnn::memory::format& format, + const mkldnn::memory::data_type& dstdt, const std::string& suffix) { + AppendKeyDims(key, input_dims); + AppendKeyDims(key, weights_dims); + AppendKeyVec(key, strides); + AppendKeyVec(key, paddings); + AppendKeyVec(key, dilations); + AppendKey(key, std::to_string(groups)); + AppendKey(key, std::to_string(srcdt)); + AppendKey(key, std::to_string(format)); + AppendKey(key, std::to_string(dstdt)); + AppendKey(key, suffix); } protected: + static void AppendKeyDims(std::string* key, + const mkldnn::memory::dims& dims) { + for (unsigned int i = 0; i < dims.size(); i++) { + AppendKey(key, std::to_string(dims[i])); + } + } + + static void AppendKeyVec(std::string* key, const std::vector& dims) { + for (unsigned int i = 0; i < dims.size(); i++) { + AppendKey(key, std::to_string(dims[i])); + } + } + + static void AppendKey(std::string* key, const std::string& s) { + key->append(s); + } + static std::string dims2str(const mkldnn::memory::dims& operand_dims) { std::string dstr = ""; for (size_t i = 0; i < operand_dims.size(); ++i) { @@ -215,7 +264,8 @@ class MKLDNNHandler { class TransposeMKLDNNHandler : public MKLDNNHandler { public: - TransposeMKLDNNHandler(std::vector& dims, std::vector& axis, + TransposeMKLDNNHandler(std::vector& dims, // NOLINT + std::vector& axis, // NOLINT const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), @@ -303,8 +353,9 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { } protected: - mkldnn_memory_desc_t Axis2MemoryDesc(std::vector& nchw_tz, - std::vector& axis) { + mkldnn_memory_desc_t Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT + std::vector& axis // NOLINT + ) { mkldnn_memory_desc_t mem_fmt; mem_fmt.primitive_kind = mkldnn_memory; @@ -462,21 +513,26 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, std::vector& pipeline, // NOLINT - bool is_persistent = false) { + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, int mask = 0) { auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); - return this->AcquireMemory(weights_pd, user_weights_pd, - user_weights_memory_p, "@weights_mem_p", - pipeline, is_persistent); + return this->AcquireMemory( + weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", + pipeline, is_persistent, is_INT8, scale_data, mask); } std::shared_ptr AcquireBiasMemoryFromPrimitive( const std::shared_ptr user_bias_memory_p, - std::vector& pipeline) { // NOLINT + std::vector& pipeline, // NOLINT + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, + int mask = 0) { // NOLINT auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); auto bias_pd = conv_pd_->bias_primitive_desc(); return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, - "@bias_mem_p", pipeline); + "@bias_mem_p", pipeline, is_persistent, is_INT8, + scale_data, mask); } std::shared_ptr AcquireConvolution( @@ -594,5 +650,29 @@ using ConvTransposeMKLDNNHandler = ConvMKLDNNTemplateHandler; + +template +static std::shared_ptr SetDstMemory( + const framework::ExecutionContext& ctx, framework::Tensor* output, + const std::shared_ptr& handler) { + T* output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler->GetDstMemorySize()); + std::shared_ptr dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + return dst_memory_p; +} + +template +static std::shared_ptr SetDstMemoryHandler( + const framework::ExecutionContext& ctx, framework::Tensor* output, + const std::shared_ptr& handler) { + T* output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler->GetDstMemorySize()); + std::shared_ptr dst_memory_p; + dst_memory_p->set_data_handle(to_void_cast(output_data)); + return dst_memory_p; +} } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 6ce4bf8f13922e2756c3ee8f189bd36123d6964c..8df8e32098697540f02d488c873f5ae7fb29828e 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -106,7 +106,7 @@ struct NCCLContextMap { } std::unique_ptr comms(new ncclComm_t[order_.size()]); // if num_trainers == 1, should create a new nccl id for local comms. - if (num_trainers == 1) { + if (num_trainers == 1 && nccl_id == nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 998242fb4a09138db24aa75759f4990ffdc4d4e2..85977366e61c676fc5d2d3c5d22dd2f606543684 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -12,9 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/platform/profiler.h" -#include "paddle/fluid/platform/port.h" - #include #include #include @@ -25,9 +22,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA + #include "glog/logging.h" #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/platform/device_tracer.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/printf.h" DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); @@ -173,8 +173,9 @@ void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) : is_enabled_(false), start_ns_(PosixInNsec()) { - std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; + std::lock_guard l(profiler_mu); + is_enabled_ = true; dev_ctx_ = dev_ctx; name_ = name; @@ -184,8 +185,8 @@ RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) } RecordEvent::~RecordEvent() { - std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled || !is_enabled_) return; + std::lock_guard l(profiler_mu); DeviceTracer* tracer = GetDeviceTracer(); if (tracer) { tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc index e4e5be5b89f4cbecd6b5e9deec9cc5bffa6a4917..35d1d929819c41b213bc51ec24ac725021a76c88 100644 --- a/paddle/fluid/platform/temporary_allocator_test.cc +++ b/paddle/fluid/platform/temporary_allocator_test.cc @@ -14,12 +14,27 @@ #include "paddle/fluid/platform/temporary_allocator.h" #include +#include +#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor_util.h" + DECLARE_double(limit_of_temporary_allocation); namespace paddle { namespace platform { +class DummyOp : public framework::OperatorBase { + public: + DummyOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + protected: + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override {} +}; + TEST(temporary_allocator, temporary_allocator) { platform::CPUPlace cpu_place; TemporaryAllocator alloc(cpu_place); @@ -68,96 +83,92 @@ TEST(temporary_allocator, add_callback) { } TEST(temporary_allocator, create_tensor_with_allocationptr) { - platform::CPUPlace cpu_place; - TemporaryAllocator cpu_alloc(cpu_place); + framework::VariableNameMap dummy_vars; + framework::AttributeMap dummy_attrs; + DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs); + framework::Scope scope; + framework::VariableValueMap vars; + framework::RuntimeContext run_ctx(vars, vars); + size_t memory_size = 300; { - size_t memory_size = 200; - auto allocation = cpu_alloc.Allocate(memory_size); - void* address = allocation->ptr(); + platform::CPUPlace cpu_place; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(cpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + int numel = memory_size / sizeof(float); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } #ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); - { - size_t memory_size = 300; - auto allocation = gpu_alloc.Allocate(memory_size); - void* address = allocation->ptr(); + platform::CUDAPlace gpu_place(0); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(gpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); int numel = memory_size / sizeof(float); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); } - - // The allocation is not holded now, it should be placed to - // TemporaryAllocationQueue. - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); #endif } TEST(temporary_allocator, create_tensor_with_allocationptr2) { - platform::CPUPlace cpu_place; - TemporaryAllocator cpu_alloc(cpu_place); + framework::VariableNameMap dummy_vars; + framework::AttributeMap dummy_attrs; + DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs); + framework::Scope scope; + framework::VariableValueMap vars; + framework::RuntimeContext run_ctx(vars, vars); + size_t memory_size = 400; { - size_t memory_size = 400; + platform::CPUPlace cpu_place; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(cpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); int numel = memory_size / sizeof(float); framework::Tensor out_side_tensor; - void* address; { - auto allocation = cpu_alloc.Allocate(memory_size); - address = allocation->ptr(); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); out_side_tensor.ShareDataWith(tensor); } - PADDLE_ENFORCE_EQ(address, out_side_tensor.data()); PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); } #ifdef PADDLE_WITH_CUDA - platform::CUDAPlace gpu_place(0); - TemporaryAllocator gpu_alloc(gpu_place); { - void* address; + platform::CUDAPlace gpu_place(0); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = + static_cast(pool.Get(gpu_place)); + framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx); + size_t memory_size = 500; int numel = memory_size / sizeof(float); framework::Tensor out_side_tensor; { - auto allocation = gpu_alloc.Allocate(memory_size); - address = allocation->ptr(); - framework::Tensor tensor = framework::GetTensor( - std::move(allocation), framework::make_ddim({numel})); - PADDLE_ENFORCE_EQ(address, tensor.data()); + framework::Tensor tensor = + ctx.AllocateTmpTensor( + framework::make_ddim({numel}), *dev_ctx); PADDLE_ENFORCE_EQ(tensor.numel(), numel); out_side_tensor.ShareDataWith(tensor); } - PADDLE_ENFORCE_EQ(address, out_side_tensor.data()); PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel); - // The allocation is holded by out_side_tensor. - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); } - - // The allocation is not holded now, it should be placed to - // TemporaryAllocationQueue. - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1); - gpu_alloc.Release([]() {}); - PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0); #endif } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 3b81d59ad965b7532ca729682e7aeb8eb96194a8..dce755c91a58d3291d740bd05c1cf835cbfbf1f0 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -946,13 +946,6 @@ All parameter, weight, gradient are variables in Paddle. R"DOC(The type is STR, debug_graphviz_path indicate the path that writing the SSA Graph to file in the form of graphviz, you. It is useful for debugging. Default "")DOC") - .def_property( - "enable_data_balance", - [](const BuildStrategy &self) { return self.enable_data_balance_; }, - [](BuildStrategy &self, bool b) { - PADDLE_ENFORCE(!self.IsFinalized(), "BuildStrategy is finlaized."); - self.enable_data_balance_ = b; - }) // FIXME(chengudo): enable_data_balance seems not important .def_property( "enable_sequential_execution", [](const BuildStrategy &self) { @@ -1007,6 +1000,10 @@ All parameter, weight, gradient are variables in Paddle. "memory_optimize", [](const BuildStrategy &self) { return self.memory_optimize_; }, [](BuildStrategy &self, bool b) { self.memory_optimize_ = b; }) + .def_property( + "is_distribution", + [](const BuildStrategy &self) { return self.is_distribution_; }, + [](BuildStrategy &self, bool b) { self.is_distribution_ = b; }) .def_property( "memory_early_delete", [](const BuildStrategy &self) { return self.memory_early_delete_; }, diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index d7ab36223c72cdf479c56c95865e25e3e90a5dec..50b7a631297b150ac9d25c036d21b0bdf2854b79 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -199,6 +199,7 @@ function cmake_gen() { -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON} -DPY_VERSION=${PY_VERSION:-2.7} -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} + -DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -232,7 +233,8 @@ EOF -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\ -DANAKIN_BUILD_CROSS_PLANTFORM=${ANAKIN_BUILD_CROSS_PLANTFORM:ON}\ -DPY_VERSION=${PY_VERSION:-2.7} \ - -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} \ + -DWITH_JEMALLOC=${WITH_JEMALLOC:-OFF} } @@ -447,7 +449,7 @@ EOF elif [ "$1" == "cp37-cp37m" ]; then pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi - + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi @@ -918,11 +920,11 @@ function main() { cmake_gen ${PYTHON_ABI:-""} build assert_api_not_changed ${PYTHON_ABI:-""} - assert_api_spec_approvals run_test gen_capi_package gen_fluid_lib test_fluid_lib + assert_api_spec_approvals ;; assert_api) assert_api_not_changed ${PYTHON_ABI:-""} diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 7a72670935da23565a41d8b2159ef926416db3ca..f9f3807b1567eaf0be20b522154552a8b157583f 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -102,13 +102,6 @@ def __bootstrap__(): import sys import os import platform - - if os.name == 'nt': - third_lib_path = os.path.abspath(os.path.dirname( - __file__)) + os.sep + '..' + os.sep + 'libs' - os.environ['path'] += ';' + third_lib_path - sys.path.append(third_lib_path) - from . import core in_test = 'unittest' in sys.modules @@ -135,7 +128,8 @@ def __bootstrap__(): 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir' + 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', + 'enable_parallel_graph' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') @@ -158,14 +152,10 @@ def __bootstrap__(): if core.is_compiled_with_cuda(): read_env_flags += [ - 'fraction_of_gpu_memory_to_use', - 'cudnn_deterministic', - 'enable_cublas_tensor_op_math', - 'conv_workspace_size_limit', - 'cudnn_exhaustive_search', - 'memory_optimize_debug', - 'selected_gpus', - 'cudnn_exhaustive_search_times', + 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', + 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', + 'cudnn_exhaustive_search', 'memory_optimize_debug', 'selected_gpus', + 'cudnn_exhaustive_search_times', 'sync_nccl_allreduce' ] core.init_gflags([sys.argv[0]] + diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 71b96e01732d0c5749e863d762aaf0f947546f7c..70767c962f551bdf3afea2237000a4cf93feb120 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -26,6 +26,13 @@ import numpy as np from .. import compat as cpt from .proto import framework_pb2 try: + if os.name == 'nt': + import sys + third_lib_path = os.path.abspath(os.path.dirname( + __file__)) + os.sep + '..' + os.sep + 'libs' + os.environ['path'] += ';' + third_lib_path + sys.path.append(third_lib_path) + from . import core except ImportError as e: if os.name == 'nt': diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9572fcb385823eab16d5c44fd56c680e577c8f04..615a35ba916f813399dc21a87646884b3d01081e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -26,7 +26,7 @@ from ..initializer import Normal, Constant from ..framework import Variable, OpProtoHolder from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ -from .tensor import concat +from .tensor import concat, assign from . import utils from .. import unique_name from functools import reduce @@ -340,9 +340,7 @@ def embedding(input, """ helper = LayerHelper('embedding', **locals()) - remote_prefetch = False - if os.environ.get('PADDLE_ENABLE_REMOTE_PREFETCH'): - remote_prefetch = True + remote_prefetch = is_sparse and (not is_distributed) if remote_prefetch: assert is_sparse is True and is_distributed is False w = helper.create_parameter( @@ -5032,12 +5030,18 @@ def nce(input, else: num_neg_samples = int(num_neg_samples) + remote_prefetch = is_sparse + print( + "With sparse mode, if your models has only small parameter prefetch may cause speed down" + ) + attrs = { 'num_total_classes': int(num_total_classes), 'num_neg_samples': num_neg_samples, 'seed': seed, 'sampler': sampler, - 'is_sparse': is_sparse + 'is_sparse': is_sparse, + 'remote_prefetch': remote_prefetch } helper.append_op( @@ -5147,7 +5151,10 @@ def hsigmoid(input, pass weights = None - + remote_prefetch = is_sparse + print( + "With sparse mode, if your models has only small parameter prefetch may cause speed down" + ) if not is_custom: weights = helper.create_parameter( attr=helper.param_attr, @@ -5163,7 +5170,7 @@ def hsigmoid(input, inputs = { "X": input, "W": weights, - "PTable": path_table, + "PathTable": path_table, "PathCode": path_code, "Label": label } @@ -5186,9 +5193,13 @@ def hsigmoid(input, type="hierarchical_sigmoid", inputs=inputs, outputs={"Out": out, - "PreOut": pre_out}, - attrs={"num_classes": num_classes, - "is_sparse": is_sparse}) + "PreOut": pre_out, + "W_Out": weights}, + attrs={ + "num_classes": num_classes, + "is_sparse": is_sparse, + "remote_prefetch": remote_prefetch + }) return out @@ -7684,7 +7695,7 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): Examples: - .. code-block:: python + .. code-block:: python x = fluid.layers.data(name="x", shape=[2,3,16,16], dtype="float32") y = fluid.layers.brelu(x, t_min=1.0, t_max=20.0) diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index c97a93ec36d4f4a7ff6a9f097551e2d21022d5b1..3b066eda110275dc02e451dfaf0cfe28a3fb7a53 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -29,6 +29,15 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy +def _is_pserver_mode(main_program): + main = main_program if main_program \ + else framework.default_main_program() + for op in main.global_block().ops: + if op.type in ["send", "recv"]: + return True + return False + + class ParallelExecutor(object): """ ParallelExecutor is designed for data parallelism, which focuses on distributing @@ -128,6 +137,11 @@ class ParallelExecutor(object): build_strategy = BuildStrategy() build_strategy.num_trainers = num_trainers build_strategy.trainer_id = trainer_id + # FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, + # num_trainers is 1, so the current fields of build_strategy doesn't tell if + # it's distributed model. + build_strategy.is_distribution = _is_pserver_mode( + main_program) or num_trainers > 1 # step4: get main_program, scope, local_scopes main = main_program if main_program \ diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 6d6fe245d8a0d9b3a29f11171e7d945e09a4133c..ec8b19c7ba07a9e57a32277ff3fc34b0ea25a819 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -21,18 +21,19 @@ if(NOT WITH_DISTRIBUTE) LIST(REMOVE_ITEM TEST_OPS test_dist_simnet_bow) LIST(REMOVE_ITEM TEST_OPS test_dist_mnist_batch_merge) LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) + LIST(REMOVE_ITEM TEST_OPS test_nce_remote_table_op) + LIST(REMOVE_ITEM TEST_OPS test_hsigmoid_remote_table_op) endif(NOT WITH_DISTRIBUTE) if (NOT ${WITH_GPU}) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) -elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7) +elseif(${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) endif() list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 -list(REMOVE_ITEM TEST_OPS test_nce) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778 list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py new file mode 100644 index 0000000000000000000000000000000000000000..5535427ea8a93fdc5818cdc058aedb6fe72165ee --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_mean_ngraph_op.py @@ -0,0 +1,31 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import unittest +from paddle.fluid.tests.unittests.test_mean_op import TestMeanOp, TestFP16MeanOp + + +class TestNGRAPHMeanOp(TestMeanOp): + def setUp(self): + super(TestNGRAPHMeanOp, self).setUp() + + +class TestNGRAPHFP16MeanOp(TestFP16MeanOp): + def setUp(self): + super(TestNGRAPHFP16MeanOp, self).setUp() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py new file mode 100644 index 0000000000000000000000000000000000000000..b42a1f73fa72b0dab936a3bb61a8893978b229ec --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/test_scale_ngraph_op.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function +import unittest +from paddle.fluid.tests.unittests.test_scale_op import TestScaleOp, TestScaleOpSelectedRows, TestScaleFp16Op, TestScaleFp16OpSelectedRows + + +class TestNGRAPHScaleOp(TestScaleOp): + def init_dtype_type(self): + pass + + +class TestNGRAPHScaleOpSelectedRows(TestScaleOpSelectedRows): + def init_dtype_type(self): + pass + + +class TestNGRAPHScaleFp16Op(TestScaleFp16Op): + def init_dtype_type(self): + pass + + +class TestNGRAPHScaleFp16OpSelectedRows(TestScaleFp16OpSelectedRows): + def init_dtype_type(self): + pass + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index e2a9fc183ea9206efd892b23844081cb9d2fb3d3..2b0ab0cc3bc23bab140d2b7e8cb765e537ff3f5c 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -78,7 +78,6 @@ class TestParallelExecutorBase(unittest.TestCase): exec_strategy.allow_op_delay = allow_op_delay if use_fast_executor: exec_strategy.use_experimental_executor = True - build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \ if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py index a27212f38f4e96090f6bc30d507581ce5c0a26ff..ab34a51dd94fce97ae9220fb87b7d6e007ffa994 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py @@ -51,8 +51,9 @@ class TestConv2dFusionOp(OpTest): input = np.random.random(self.input_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype) - self.output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype(self.dtype) + self.output, _, _, _, _ = conv2d_forward_naive( + input, filter, self.groups, conv2d_param) + self.output = self.output.astype(self.dtype) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input), diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..def188bfa632b5b1bb6b2621091d0526ffa345dc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py @@ -0,0 +1,270 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +from test_conv2d_op import conv2d_forward_naive, TestConv2dOp + + +def conv2d_forward_refer(input, filter, group, conv_param): + out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group, + conv_param) + out_tmp = np.zeros((in_n, out_h, out_w, out_c)) + for n in range(in_n): + for i in range(out_h): + for j in range(out_w): + for m in range(out_c): + out_tmp[n, i, j, m] = out[n, m, i, j] + return out_tmp.reshape(in_n, out_c, out_h, out_w) + + +class TestConv2dInt8Op(TestConv2dOp): + def setUp(self): + self.op_type = "conv2d" + self.use_cudnn = False + self.exhaustive_search = False + self.use_cuda = False + self.use_mkldnn = False + self.data_format = "AnyLayout" + self.weighttype = np.float32 + self.use_mkldnn = True + self.init_group() + self.init_dilation() + self.init_test_case() + self.init_fuse_relu() + self.init_data_type() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + + filter = np.random.random(self.filter_size).astype(self.weighttype) + if self.srctype == np.uint8: + input = np.random.randint(0, 10, + self.input_size).astype(self.srctype) + else: + input = np.random.randint(-5, 5, + self.input_size).astype(self.srctype) + input_shift = (np.ones(self.input_size) * 128).astype(np.uint8) + + if self.srctype == np.int8: + filter_int = np.round(filter * self.scale_weights[0] * + 0.5).astype(np.int32) + scale_output_shift = self.scale_out / (self.scale_in * + self.scale_weights[0] * 0.5) + output1 = conv2d_forward_refer( + np.round((input.astype(np.int32) + input_shift) * + self.scale_in).astype(np.int32), filter_int, + self.groups, + conv2d_param).astype(np.float32) * scale_output_shift + output2 = conv2d_forward_refer( + np.round((input_shift) * self.scale_in).astype(np.int32), + filter_int, self.groups, + conv2d_param).astype(np.float32) * scale_output_shift + if self.fuse_relu: + output = np.maximum(np.round(output1 - output2), + 0).astype(self.dsttype) + else: + output = np.round(output1 - output2).astype(self.dsttype) + else: + filter_int = np.round(filter * + self.scale_weights[0]).astype(np.int32) + scale_output_shift = self.scale_out / (self.scale_in * + self.scale_weights[0]) + output1 = conv2d_forward_refer( + input.astype(np.int32), filter_int, self.groups, + conv2d_param).astype(np.float32) + if self.fuse_relu: + output = np.maximum( + np.round(output1 * (self.scale_out / ( + self.scale_in * self.scale_weights[0]))), + 0).astype(self.dsttype) + else: + output = np.round(output1 * (self.scale_out / ( + self.scale_in * + self.scale_weights[0]))).astype(self.dsttype) + + self.inputs = { + 'Input': + OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + 'exhaustive_search': self.exhaustive_search, + 'Scale_in': self.scale_in, + 'Scale_out': self.scale_out, + 'Scale_weights': self.scale_weights, + 'fuse_relu': self.fuse_relu + } + self.outputs = {'Output': output} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace(), atol=0) + + def test_check_grad(self): + pass + + def test_check_grad_no_filter(self): + pass + + def test_check_grad_no_input(self): + pass + + def init_test_case(self): + TestConv2dOp.init_test_case(self) + f_c = self.input_size[1] // self.groups + self.filter_size = [1, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + def init_data_type(self): + self.srctype = np.uint8 + self.dsttype = np.int8 + + def init_fuse_relu(self): + self.fuse_relu = True + + +#--------------------test conv2d u8 in and u8 out-------------------- + + +class TestConv2d(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + +class TestWithPad(TestConv2d): + def init_test_case(self): + TestConv2d.init_test_case(self) + self.pad = [1, 1] + + +class TestWithGroup(TestConv2d): + def init_group(self): + self.groups = 3 + + +class TestWithStride(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.8 + self.scale_weights = [10.0] + + +class TestWith1x1(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [1, 3, 5, 5] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [12.0] + + +class TestWithInput1x1Filter1x1(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 1, 1] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + def init_group(self): + self.groups = 3 + + +def init_data_type_with_fusion(self, input_dt, fuse_relu): + self.srctype = input_dt + self.dsttype = np.uint8 if fuse_relu else np.int8 + + def init_fuse_relu(self): + self.fuse_relu = fuse_relu + + +def create_test_int8_class(parent): + + #--------------------test conv2d s8 in and u8 out-------------------- + + class TestS8U8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.int8, True) + + #--------------------test conv2d s8 in and s8 out-------------------- + + class TestS8S8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.int8, False) + + #--------------------test conv2d u8 in and s8 out-------------------- + + class TestU8S8Case(parent): + def init_data_type(self): + init_data_type_with_fusion(self, np.uint8, False) + + cls_name_s8u8 = "{0}_relu_{1}".format(parent.__name__, "1") + cls_name_s8s8 = "{0}_relu_{1}".format(parent.__name__, "0") + cls_name_u8s8 = "{0}_relu_{1}".format(parent.__name__, "0") + TestS8U8Case.__name__ = cls_name_s8u8 + TestS8S8Case.__name__ = cls_name_s8s8 + TestU8S8Case.__name__ = cls_name_u8s8 + globals()[cls_name_s8u8] = TestS8U8Case + globals()[cls_name_s8s8] = TestS8S8Case + globals()[cls_name_u8s8] = TestU8S8Case + + +create_test_int8_class(TestConv2dInt8Op) +create_test_int8_class(TestWithPad) +create_test_int8_class(TestWithStride) +create_test_int8_class(TestWithGroup) +create_test_int8_class(TestWith1x1) +create_test_int8_class(TestWithInput1x1Filter1x1) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index bcb79f232bd28bcb534ff2a2a0b799297ff96b71..25a9e8d46edb663600a1c1007cdda673e348a881 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -60,7 +60,7 @@ def conv2d_forward_naive(input, filter, group, conv_param): np.sum(input_pad_masked * f_sub[k, :, :, :], axis=(1, 2, 3)) - return out + return out, in_n, out_h, out_w, out_c class TestConv2dOp(OpTest): @@ -85,8 +85,9 @@ class TestConv2dOp(OpTest): input = np.random.random(self.input_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype) - output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype(self.dtype) + output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups, + conv2d_param) + output = output.astype(self.dtype) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input), diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 07cc44aaa266af39fbf3d726ee51a9afc5cb3756..0caab08f0dc19efceadd723b474c10e1a2deb449 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -442,10 +442,10 @@ class TestDistBase(unittest.TestCase): tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 0, w0_ep, self._lr / 2) + 0, w0_ep, self._lr) tr1_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, - 1, w1_ep, self._lr / 2) + 1, w1_ep, self._lr) if self._mem_opt: tr0_cmd += " --mem_opt" diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index d9ad4e2e2c7b8d0a99d917495fbc8efc6cbd188d..3d1ce6b27c935ddca0f2f5fb377e69b571e3714c 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -14,14 +14,15 @@ from __future__ import print_function +import traceback import math +import collections +import six import unittest +import numpy as np + import paddle.fluid as fluid -from paddle.fluid.transpiler.distribute_transpiler import delete_ops -import traceback -import collections -import six class TranspilerTest(unittest.TestCase): @@ -520,7 +521,7 @@ class TestLocalLookupTable(TestDistLookupTableBase): 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_selected_rows', 'send', 'send_barrier', 'recv', - 'recv', 'recv', 'recv', 'fetch_barrier', 'concat', 'concat' + 'recv', 'fetch_barrier' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) @@ -560,7 +561,7 @@ class TestDistLookupTable(TestDistLookupTableBase): 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', 'sum', 'split_ids', 'send', 'send_barrier', - 'recv', 'recv', 'recv', 'fetch_barrier', 'concat' + 'recv', 'recv', 'fetch_barrier' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) startup_ops = [ @@ -607,8 +608,7 @@ class TestAsyncLocalLookupTable(TestDistLookupTableBase): 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', 'lookup_table_grad', - 'sum', 'split_selected_rows', 'send', 'recv', 'recv', 'recv', - 'recv', 'concat', 'concat' + 'sum', 'split_selected_rows', 'send', 'recv', 'recv' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) @@ -648,8 +648,7 @@ class TestAsyncDistLookupTable(TestDistLookupTableBase): 'mul_grad', 'send', 'concat_grad', 'sequence_pool_grad', 'lookup_table_grad', 'split_selected_rows', 'send', 'sequence_pool_grad', 'lookup_table_grad', 'sequence_pool_grad', - 'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv', - 'recv', 'concat' + 'lookup_table_grad', 'sum', 'split_ids', 'send', 'recv', 'recv' ] self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) startup_ops = [ @@ -824,5 +823,142 @@ class TestRemoteLookupTable(TestDistLookupTableBase): self.assertEqual([op.type for op in trainer.blocks[0].ops], ops) +# test for remote prefetch +class TestRemoteNce(TestDistLookupTableBase): + def network_with_table(self, is_sparse, is_distributed): + + num_total_classes = 20 + sampler = "uniform" + nid_freq_arr = np.random.dirichlet(np.ones(20) * 1000).astype('float32') + + input = fluid.layers.data(name="input", shape=[10], dtype="float32") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + w_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 10], + dtype='float32', + name='nce_w', + initializer=fluid.initializer.ConstantInitializer()) + b_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 1], + dtype='float32', + name='nce_b', + initializer=fluid.initializer.ConstantInitializer()) + + cost = fluid.layers.nce(input=input, + label=label, + num_total_classes=num_total_classes, + sampler=sampler, + custom_dist=nid_freq_arr.tolist(), + sample_weight=None, + param_attr='nce_w', + bias_attr='nce_b', + seed=1, + num_neg_samples=5, + is_sparse=is_sparse) + avg_cost = fluid.layers.mean(cost) + # optimizer + optimizer = fluid.optimizer.Adam(learning_rate=0.003) + optimizer.minimize(avg_cost) + + def net_conf(self): + import os + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + + out_vars = ["nce_w"] + in_vars = ["nce_b"] + + recv_var_names = [] + + for op in trainer.blocks[0].ops: + if op.type == "recv": + for var in op.output("Out"): + recv_var_names.append(var) + + for out_var in out_vars: + self.assertFalse(out_var in recv_var_names) + for in_var in in_vars: + self.assertTrue(in_var in recv_var_names) + + +# test for remote prefetch +class TestRemoteHsigmoid(TestDistLookupTableBase): + def network_with_table(self, is_sparse, is_distributed): + + num_total_classes = 3 + + input = fluid.layers.data(name="input", shape=[1], dtype="float32") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + path_table = fluid.layers.data( + name='path_table', shape=[3], dtype='int64') + path_code = fluid.layers.data( + name='path_code', shape=[3], dtype='int64') + w_param = fluid.default_main_program().global_block().create_parameter( + shape=[num_total_classes, 10], + dtype='float32', + name='hs_w', + initializer=fluid.initializer.ConstantInitializer()) + b_param = fluid.default_main_program().global_block().create_parameter( + shape=[3, 1], + dtype='float32', + name='hs_b', + initializer=fluid.initializer.ConstantInitializer()) + + emb = fluid.layers.embedding( + input=input, + is_sparse=is_sparse, + size=[3, 3], + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(num_total_classes)))) + + cost = fluid.layers.hsigmoid( + input=emb, + label=label, + num_classes=num_total_classes, + path_table=path_table, + path_code=path_code, + is_custom=True, + is_sparse=is_sparse) + avg_cost = fluid.layers.mean(cost) + # optimizer + optimizer = fluid.optimizer.SGD(learning_rate=0.003) + optimizer.minimize(avg_cost) + + def net_conf(self): + import os + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + self.network_with_table(is_sparse=True, is_distributed=False) + + def transpiler_test_impl(self): + trainer, _ = self.get_trainer() + params_to_check = list() + for op in trainer.blocks[0].ops: + if op.type == "hierarchical_sigmoid": + params_to_check = [op.input("W")[0], op.input("Bias")[0]] + for name in ["epmap", "table_names", "epmap"]: + assert op.has_attr(name) + if name == "epmap": + assert op.attr(name)[0] == u'127.0.0.1:6174' + elif name == "table_names": + assert op.attr(name)[0] == u'hierarchical_sigmoid_0.w_0' + else: + assert op.attr(name) == 3 + elif op.type == "lookup_table": + params_to_check.append(op.input("W")[0]) + else: + pass + op_count = 0 + for op in trainer.blocks[0].ops: + if op.type == "recv": + assert len(op.output("Out")) == 1 + assert op.output("Out")[0] == u'hierarchical_sigmoid_0.b_0' + op_count += 1 + assert op_count == 1 + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py index 89476ee641f1dd295a3caca89ac41038cad317f2..81b0b667814e851e8bd47ae1a3b0bf00a9a73ecd 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py @@ -29,6 +29,12 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): print('Skip use_cuda=True because Paddle is not compiled with cuda') return + if use_parallel_executor and os.name == 'nt': + print( + 'Skip use_parallel_executor=True because Paddle comes without parallel support on windows' + ) + return + word_dict = paddle.dataset.imdb.word_dict() train_reader = paddle.batch( paddle.dataset.imdb.train(word_dict), batch_size=batch_size) diff --git a/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py new file mode 100644 index 0000000000000000000000000000000000000000..584e309befcee18ad913d935c803fdd387a92745 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_emb_seq_pool_op.py @@ -0,0 +1,51 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid.op import Operator +import paddle.compat as cpt + + +class TestFusedEmbeddingSeqPoolOp(OpTest): + def setUp(self): + self.op_type = "fused_embedding_seq_pool" + self.emb_size = 2 + table = np.random.random((17, self.emb_size)).astype("float32") + ids = np.array([[[4], [3]], [[4], [3]], [[2], [1]], + [[16], [1]]]).astype("int64") + merged_ids = np.array([4, 2, 16]).astype("int64") + ids_expand = np.expand_dims(ids, axis=1) + self.lod = [[3, 1]] + self.attrs = {'is_sparse': True} + self.inputs = {'W': table, 'Ids': (ids_expand, self.lod)} + self.outputs = { + 'Out': np.reshape( + np.array([ + table[[4, 3]] + table[[4, 3]] + table[[2, 1]], + table[[16, 1]] + ]), [len(self.lod[0]), 2 * self.emb_size]) + } + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 2a6c93f75fad53440a2db64e4f34c9a5c22c654e..8ed5074dc2626ff58fc65d8af1340e260c029572 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -185,7 +185,7 @@ class TestHSigmoidOpSparse(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, 'Bias': bias @@ -287,7 +287,7 @@ class TestHSigmoidOpWithCostumTree(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, 'Bias': bias @@ -324,7 +324,7 @@ class TestHSigmoidOpWithCostumTreeWithoutBias(OpTest): self.inputs = { 'X': x, 'W': w, - 'PTable': path_table, + 'PathTable': path_table, 'PathCode': path_code, 'Label': label, } diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py new file mode 100644 index 0000000000000000000000000000000000000000..da343dd503a62e83f431dd0ffb02a7e70be7d0d5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_remote_table_op.py @@ -0,0 +1,269 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((5, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + 1 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_hsigmoid_op_one_pserver(self, place, port): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 2, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port)] + table_names = ['table'] + height_sections = [2] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i != 3: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), 0).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def _run_hsigmoid_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('X').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") * 2 + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('W').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") * 2 + param.set(param_array, place) + + path_table = scope.var('PathTable').get_tensor() + path_table_array = np.array( + [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, -1)]).astype( + "int64" + ) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + path_table.set(path_table_array, place) + + path_code = scope.var('PathCode').get_tensor() + path_code_array = np.array( + [(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (1, 0, 0, -1, -1), + (0, 1, -1, -1, -1)]).astype("int64") #np.array to store + path_code.set(path_code_array, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([0, 1, 4, 5]) + label.set(label_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + out = scope.var('Out').get_tensor() + + pre_out = scope.var('PreOut').get_tensor + + w_out = scope.var('W_Out').get_tensor() + w_out.set(param_array, place) + + emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] + table_names = ['table', 'table'] + height_sections = [2, 3] + + # create and run sgd operator + hsigmoid_op = Operator( + "hierarchical_sigmoid", + X='X', + W='W', + PathTable='PathTable', + PathCode='PathCode', + Label='Label', + Bias='Bias', + Out='Out', + PreOut='PreOut', + W_Out='W_Out', + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + hsigmoid_op.run(scope, place) + + # get and compare result + result_array = np.array(w_out) + self.assertEqual(list(result_array.shape), [5, 8]) + correct = None + for i in range(5): + if i < 2: + correct = np.full((1, 8), i + 1).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + else: + correct = np.full((1, 8), i + 9).astype("float32") + self.assertTrue((result_array[i] == correct).all()) + + def test_hsigmoid_op_remote(self): + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + # run pserver on CPU in sync mode + p0 = self._start_pserver(0, False, True, run_pserver) + self._wait_ps_ready(p0.pid) + port0 = self._get_pserver_port(p0.pid) + + p1 = self._start_pserver(1, False, True, run_pserver) + self._wait_ps_ready(p1.pid) + port1 = self._get_pserver_port(p1.pid) + + places = [core.CPUPlace()] + + for place in places: + self._run_hsigmoid_op_one_pserver(place, port0) + self._run_hsigmoid_op_two_pserver(place, port0, port1) + + # raise SIGTERM to pserver + os.kill(p0.pid, signal.SIGINT) + p0.join() + os.kill(p1.pid, signal.SIGINT) + p1.join() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py new file mode 100644 index 0000000000000000000000000000000000000000..cc6f40de86e302605a416c48790c74cbb431b2e3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_nce_remote_table_op.py @@ -0,0 +1,236 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import os +import signal +import time +import unittest +from multiprocessing import Process + +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.op import Operator +from paddle.fluid.framework import Program, program_guard + + +def nce(input, weight, bias, sample_weight, labels, num_classes, + num_sample_class): + samples = [] + sample_labels = [] + batch_size = input.shape[0] + num_true_class = labels.shape[1] + for i in range(batch_size): + w = 1 if sample_weight is None else sample_weight[i] + for label in labels[i]: + samples.append((i, label, True, w)) + sample_labels.append(label) + for num in range(num_sample_class): + samples.append((i, num, False, w)) + sample_labels.append(num) + # forward bias + sample_out = np.zeros(len(samples)).astype(np.float32) + if bias is not None: + for i in range(len(samples)): + sample_out[i] = bias[samples[i][1]] + # forward weight + for i in range(len(samples)): + sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]]) + + # forward activation + sample_out = 1.0 / (1.0 + np.exp(-sample_out)) + # forward cost + out = np.zeros(batch_size).astype(np.float32) + b = 1.0 / num_classes * num_sample_class + + for i in range(len(samples)): + o = sample_out[i] + cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b)) + out[samples[i][0]] += cost * samples[i][3] + return (out[:, np.newaxis], np.array(sample_out).reshape( + batch_size, num_sample_class + num_true_class), + np.array(sample_labels).reshape(batch_size, + num_sample_class + num_true_class)) + + +def run_pserver(pserver_id, use_cuda, sync_mode): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + # create table parameter in scope + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + # create and initialize Param Variable + param = scope.var('table').get_tensor() + + param_array = np.ones((5, 8)).astype("float32") + for i in range(len(param_array)): + param_array[i] *= param_array[i] * i + pserver_id * 10 + 1 + param.set(param_array, place) + + optimize_block = program._create_block(program.global_block().idx) + program.global_block().append_op( + type="listen_and_serv", + inputs={'X': []}, + outputs={}, + attrs={ + "optimize_blocks": [optimize_block], + "endpoint": '127.0.0.1:0', + "Fanin": 1, + "sync_mode": True, + "grad_to_block_id": [] + }) + + exe = fluid.Executor(place) + exe.run(program) + + +class TestListenAndServOp(unittest.TestCase): + def setUp(self): + self.ps_timeout = 5 + + def _start_pserver(self, pserver_id, use_cuda, sync_mode, pserver_func): + p = Process(target=pserver_func, args=(pserver_id, use_cuda, sync_mode)) + p.daemon = True + p.start() + return p + + def _wait_ps_ready(self, pid): + start_left_time = self.ps_timeout + sleep_time = 0.5 + while True: + assert start_left_time >= 0, "wait ps ready failed" + time.sleep(sleep_time) + try: + # the listen_and_serv_op would touch a file which contains the listen port + # on the /tmp directory until it was ready to process all the RPC call. + os.stat("/tmp/paddle.%d.port" % pid) + return + except os.error: + start_left_time -= sleep_time + + def _get_pserver_port(self, pid): + with open("/tmp/paddle.%d.port" % pid, 'r') as f: + port = int(f.read().strip()) + return port + + def _run_nce_op_two_pserver(self, place, port0, port1): + scope = fluid.core.Scope() + program = Program() + with fluid.scope_guard(scope): + with program_guard(program, startup_program=Program()): + x = scope.var('Input').get_tensor() + x_array = np.random.random((4, 8)).astype("float32") + x.set(x_array, place) + # create and initialize Param Variable + param = scope.var('Weight').get_tensor() + param_array = np.zeros((5, 8)).astype("float32") + param.set(param_array, place) + + bias = scope.var('Bias').get_tensor() + bias_array = np.random.random((5, 1)).astype("float32") + bias.set(bias_array, place) + + sample_w = scope.var('SampleWeight').get_tensor() + sample_weight = np.random.random((4, 1)).astype("float32") + sample_w.set(sample_weight, place) + + label = scope.var('Label').get_tensor() + label_array = np.array([[0], [1], [4], [3]]) + label.set(label_array, place) + + cost = scope.var('Cost').get_tensor() + cost_w = np.zeros((4, 1)).astype("float32") + cost.set(cost_w, place) + + sample_l = scope.var('SampleLogits').get_tensor() + sample_l_w = np.zeros((4, 3)).astype("float32") + sample_l.set(sample_l_w, place) + + sample_la = scope.var('SampleLabels').get_tensor() + sample_la_w = np.zeros((4, 3)).astype("int") + sample_la.set(sample_la_w, place) + + emaps = ['127.0.0.1:' + str(port0), '127.0.0.1:' + str(port1)] + table_names = ['table', 'table'] + height_sections = [2, 3] + + # create and run nce operator + nce_op = Operator( + "nce", + Input='Input', + Weight='Weight', + Label='Label', + Bias='Bias', + Cost='Cost', + SampleLogits='SampleLogits', + SampleLabels='SampleLabels', + SampleWeight='SampleWeight', + num_total_classes=5, + num_neg_samples=2, + custom_neg_classes=list(range(2)), + sampler=0, + seed=0, + is_sparse=True, + remote_prefetch=True, + epmap=emaps, + table_names=table_names, + height_sections=height_sections) + + nce_op.run(scope, place) + + # get and compare result + o_cost = np.array(scope.var('Cost').get_tensor()) + o_logits = np.array(scope.var('SampleLogits').get_tensor()) + o_labels = np.array(scope.var('SampleLabels').get_tensor()) + + param_array = np.ones((5, 8)).astype("float32") + for i in range(2): + param_array[i] *= param_array[i] * i + 0 * 10 + 1 + for i in range(2, 5): + param_array[i] *= param_array[i] * i + 1 * 10 + 1 + out = nce(x_array, param_array, bias_array, sample_weight, + label_array, 5, 2) + + self.assertAlmostEqual(o_cost.all(), out[0].all(), delta=1e-6) + self.assertAlmostEqual(o_logits.all(), out[1].all(), delta=1e-6) + self.assertAlmostEqual(o_labels.all(), out[2].all(), delta=1e-6) + + def test_nce_op_remote(self): + os.environ['PADDLE_ENABLE_REMOTE_PREFETCH'] = "1" + # run pserver on CPU in sync mode + p0 = self._start_pserver(0, False, True, run_pserver) + self._wait_ps_ready(p0.pid) + port0 = self._get_pserver_port(p0.pid) + + p1 = self._start_pserver(1, False, True, run_pserver) + self._wait_ps_ready(p1.pid) + port1 = self._get_pserver_port(p1.pid) + + places = [core.CPUPlace()] + + for place in places: + self._run_nce_op_two_pserver(place, port0, port1) + + # raise SIGTERM to pserver + os.kill(p0.pid, signal.SIGINT) + p0.join() + os.kill(p1.pid, signal.SIGINT) + p1.join() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py index 84b0aad8acb096a32f625e32fb640599f2882d97..1c6cfce0c2b772fa78fa08fa1bfb383c1e4f7939 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py @@ -175,41 +175,61 @@ class TestCRFModel(unittest.TestCase): print(pe.run(feed=feeder.feed(cur_batch), fetch_list=[avg_cost.name])[0]) - def test_update_sparse_parameter_all_reduce(self): + def _new_build_strategy(self, use_reduce=False): build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + + if use_reduce: + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce + else: + build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce + + return build_strategy + + def test_update_sparse_parameter_all_reduce(self): if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=True) + is_sparse=True, + build_strategy=self._new_build_strategy(), + use_cuda=True) + self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=False) + is_sparse=True, + build_strategy=self._new_build_strategy(), + use_cuda=False) def test_update_dense_parameter_all_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=True) + is_sparse=False, + build_strategy=self._new_build_strategy(), + use_cuda=True) + self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=False) + is_sparse=False, + build_strategy=self._new_build_strategy(), + use_cuda=False) def test_update_sparse_parameter_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=True) + is_sparse=True, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=True) self.check_network_convergence( - is_sparse=True, build_strategy=build_strategy, use_cuda=False) + is_sparse=True, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=False) def test_update_dense_parameter_reduce(self): - build_strategy = fluid.BuildStrategy() - build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if core.is_compiled_with_cuda(): self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=True) + is_sparse=False, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=True) self.check_network_convergence( - is_sparse=False, build_strategy=build_strategy, use_cuda=False) + is_sparse=False, + build_strategy=self._new_build_strategy(use_reduce=True), + use_cuda=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index 3eecc4670152e72443f731c71d7db67ca8e02e72..9768f7db26c76b1f6fcffa24fd2ea3c0abd17aeb 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -86,6 +86,7 @@ class TestMNIST(TestParallelExecutorBase): "label": label}, use_cuda=use_cuda, use_reduce=False) + reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, feed_dict={"image": img, diff --git a/python/paddle/fluid/tests/unittests/test_reader_reset.py b/python/paddle/fluid/tests/unittests/test_reader_reset.py index e97a05b6f929821f82d96b462598a5ff03cf0a48..7eeffa1039a1e14a8883c4a78305d253a4518b26 100644 --- a/python/paddle/fluid/tests/unittests/test_reader_reset.py +++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py @@ -75,8 +75,6 @@ class TestReaderReset(unittest.TestCase): exe.run(startup_prog) build_strategy = fluid.BuildStrategy() - if with_double_buffer: - build_strategy.enable_data_balance = True exec_strategy = fluid.ExecutionStrategy() parallel_exe = fluid.ParallelExecutor( use_cuda=self.use_cuda, diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index c128843885fbce29893a4b24c65482abaf870e82..07343b4051e0f44996d1d4617e2cbd1a0d22ce3e 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -251,11 +251,10 @@ class DistributeTranspiler(object): def _get_all_remote_sparse_update_op(self, main_program): sparse_update_ops = [] - sparse_update_op_types = ["lookup_table"] + sparse_update_op_types = ["lookup_table", "nce", "hierarchical_sigmoid"] for op in main_program.global_block().ops: if op.type in sparse_update_op_types and op.attr( - 'remote_prefetch') is True and not op.attr( - 'is_distributed'): + 'remote_prefetch') is True: sparse_update_ops.append(op) return sparse_update_ops