From 42e96a029fcf07d691fc61502432fdd4bce091ae Mon Sep 17 00:00:00 2001
From: minqiyang
Date: Wed, 13 Mar 2019 22:44:35 +0800
Subject: [PATCH] Accelerate CPU part

---
 CMakeLists.txt                              | 10 ++++++
 paddle/fluid/framework/grad_op_desc_maker.h |  5 ++-
 paddle/fluid/imperative/CMakeLists.txt      |  1 +
 paddle/fluid/imperative/layer.cc            |  6 ++--
 paddle/fluid/imperative/layer.h             | 28 ++++++++++------
 paddle/fluid/imperative/tracer.cc           | 37 ++-------------------
 paddle/fluid/imperative/tracer.h            |  4 +++
 paddle/fluid/inference/CMakeLists.txt       |  3 +-
 paddle/fluid/pybind/CMakeLists.txt          |  2 +-
 paddle/fluid/pybind/pybind.cc               |  7 ++++
 python/paddle/fluid/__init__.py             |  3 +-
 python/paddle/fluid/framework.py            |  4 +++
 python/paddle/fluid/imperative/__init__.py  |  4 +++
 python/paddle/fluid/imperative/nn.py        |  3 ++
 14 files changed, 64 insertions(+), 53 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8e7ffe72b5f..ce06c73bdcf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,16 @@ if(WIN32)
     set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
     set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
     set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
+# else()
+    # set(CMAKE_C_ARCHIVE_CREATE " --target elf64-x86-64 cr ")
+    # set(CMAKE_C_ARCHIVE_APPEND " --target elf64-x86-64 r ")
+    # # set(CMAKE_C_ARCHIVE_FINISH " --enable-64-bit-archive ")
+    # set(CMAKE_CXX_ARCHIVE_CREATE ${CMAKE_C_ARCHIVE_CREATE})
+    # set(CMAKE_CXX_ARCHIVE_APPEND ${CMAKE_C_ARCHIVE_APPEND})
+    # # set(CMAKE_CXX_ARCHIVE_FINISH ${CMAKE_C_ARCHIVE_FINISH})
+    # set(CMAKE_Fortran_ARCHIVE_CREATE ${CMAKE_C_ARCHIVE_CREATE})
+    # set(CMAKE_Fortran_ARCHIVE_APPEND ${CMAKE_C_ARCHIVE_APPEND})
+    # # set(CMAKE_Fortran_ARCHIVE_FINISH ${CMAKE_C_ARCHIVE_FINISH})
 endif(WIN32)
 
 find_package(CUDA QUIET)
diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h
index 9bccb1a32bf..46ebf4051fb 100644
--- a/paddle/fluid/framework/grad_op_desc_maker.h
+++ b/paddle/fluid/framework/grad_op_desc_maker.h
@@ -55,7 +55,10 @@ class GradOpDescMakerBase {
                    std::back_inserter(ret_val),
                    [this](const std::string& fwd_var_name) -> std::string {
                      auto g_name = GradVarName(fwd_var_name);
-                     if (no_grad_set_.count(g_name)) {
+                     if (no_grad_set_.empty()) {
+                       (*this->grad_to_var_)[g_name] = fwd_var_name;
+                       return g_name;
+                     } else if (no_grad_set_.count(g_name)) {
                        return kEmptyVarName;
                      } else {
                        (*this->grad_to_var_)[g_name] = fwd_var_name;
diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt
index ec8dedd6052..0d116a64954 100644
--- a/paddle/fluid/imperative/CMakeLists.txt
+++ b/paddle/fluid/imperative/CMakeLists.txt
@@ -2,4 +2,5 @@ if(WITH_PYTHON)
 cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
 cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
 cc_library(engine SRCS engine.cc)
+cc_library(imperative_profiler SRCS profiler.cc)
 endif()
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 5530823b90f..d7f7967f725 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -239,7 +239,7 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
     VLOG(3) << "apply grad op " << grad_op_desc->Type();
 
     // Allocate tmp grad output variable
-    for (auto it : grad_output_variable_map) {
+    for (const auto& it : grad_output_variable_map) {
       auto& outputs = tmp_grad_outputs[k][it.first];
       outputs.reserve(it.second.size());
       for (size_t i = 0; i < it.second.size(); ++i) {
@@ -273,9 +273,9 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
 
   // Add tmp grad outputs to original grad vars
   for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
-    for (auto it : grad_output_vars_[k]) {
+    for (const auto& it : grad_output_vars_[k]) {
       auto& outputs = tmp_grad_outputs[k][it.first];
-      auto& origin_outputs = it.second;
+      const auto& origin_outputs = it.second;
       PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
 
       for (size_t i = 0; i < outputs.size(); ++i) {
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 618a5b7a032..27cb1c84f56 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -294,17 +294,23 @@ class PYBIND11_HIDDEN OpBase {
 
   void InvokeBackwardHooks();
 
-  void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) {
-    if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
-      VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
-              << inp_name;
-      pre_ops_[inp_name].push_back(inp_var->PreOp());
-      pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx());
-    } else {
-      VLOG(3) << "no pre op in slot " << inp_name
-              << " input var stop_gradient: " << inp_var->IsStopGradient();
-      pre_ops_[inp_name].push_back(nullptr);
-      // pre_ops_out_idx_[inp_name].push_back(-1);
+  void TrackPreOp(const std::string& inp_name,
+                  const std::vector<VarBase*>& inputs) {
+    auto& pre_ops_list = pre_ops_[inp_name];
+    pre_ops_list.reserve(inputs.size());
+    auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name];
+    for (VarBase* inp_var : inputs) {
+      if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
+        VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
+                << inp_name;
+        pre_ops_list.emplace_back(inp_var->PreOp());
+        pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx());
+      } else {
+        VLOG(3) << "no pre op in slot " << inp_name
+                << " input var stop_gradient: " << inp_var->IsStopGradient();
+        pre_ops_list.emplace_back(nullptr);
+        // pre_ops_out_idx_list.push_back(-1);
+      }
     }
   }
 
diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc
index 7ee92b4d8c4..7773a3f8fc3 100644
--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -23,23 +23,9 @@
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 
-#ifdef WITH_GPERFTOOLS
-#include "gperftools/profiler.h"
-#endif
-
-DEFINE_string(
-    tracer_profile_fname, "",
-    "Profiler filename for imperative tracer, which generated by gperftools."
-    "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
-
 namespace paddle {
 namespace imperative {
 
-static std::once_flag gTracerProfileOnce;
-#ifdef WITH_GPERFTOOLS
-static bool gTracerProfilerStarted = false;
-#endif
-
 void CreateGradOp(const framework::OpDesc& op_desc,
                   const std::unordered_set<std::string>& no_grad_set,
                   const std::vector<framework::BlockDesc*>& grad_sub_block,
@@ -146,17 +132,6 @@ framework::VariableNameMap CreateOutputVarNameMap(
 }
 
 Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
-  if (!FLAGS_tracer_profile_fname.empty()) {
-    std::call_once(gTracerProfileOnce, [] {
-#ifdef WITH_GPERFTOOLS
-      ProfilerStart(FLAGS_tracer_profile_fname.c_str());
-      gTracerProfilerStarted = true;
-#else
-      LOG(WARNING) << "Paddle is not compiled with gperftools. "
" - "FLAGS_tracer_profile_fname will be ignored"; -#endif - }); - } } std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, @@ -164,12 +139,6 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, framework::AttributeMap attrs_map, const platform::Place expected_place, const bool stop_gradient) { -#ifdef WITH_GPERFTOOLS - if (gTracerProfilerStarted) { - ProfilerFlush(); - } -#endif - framework::VariableValueMap invars_map; framework::VariableValueMap outvars_map; @@ -184,7 +153,6 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, inp->Name()); invars.emplace_back(inp->var_); - op->TrackPreOp(inp, it.first); if (!stop_gradient) { current_vars_map[inp->Name()] = inp; } @@ -192,6 +160,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, << " inited: " << inp->var_->IsInitialized() << " stop_grad: " << inp->IsStopGradient(); } + op->TrackPreOp(it.first, it.second); } op->output_vars_ = outputs; @@ -319,9 +288,7 @@ std::vector Tracer::PyTrace(OpBase* op, std::vector ret_vars = PyLayer::Apply(op->forward_id_, inputs); - for (VarBase* inp : inputs) { - op->TrackPreOp(inp, PyLayer::kFwdInp); - } + op->TrackPreOp(PyLayer::kFwdInp, inputs); std::vector& outputs = op->output_vars_[PyLayer::kFwdOut]; outputs.reserve(ret_vars.size()); diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 7b65d55e9ef..62d8eecfccf 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -56,6 +56,10 @@ class Tracer { std::vector PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient = false); + static void StartProfile(); + + static void StopProfile(); + private: platform::Place GetPlace(const VarBasePtrMap& inputs); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 762640d6d1c..0d682fc0a5d 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -29,6 +29,7 @@ endif(WIN32) if(WIN32) sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) else(WIN32) + #set(CMAKE_C_ARCHIVE_CREATE " --target elf64-x86-64 cr paddle_fluid_origin ${fluid_modules} paddle_fluid_api") cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) endif(WIN32) @@ -91,5 +92,5 @@ if(WITH_TESTING) add_subdirectory(tests/book) if(WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) - endif() + endif() endif() diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 4ac5b83c56b..f1385f57184 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool - tracer analysis_predictor) + tracer analysis_predictor imperative_profiler) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 395093a1f5a..7a6a5b8645d 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -36,6 +36,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/profiler.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/operators/activation_op.h" @@ -148,6 +149,12 @@ PYBIND11_MODULE(core, m) { m.def("print_mem_usage", []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); }); + m.def("start_imperative_profiler", + []() { imperative::StartProfile(); }); + + m.def("stop_imperative_profiler", + []() { imperative::StopProfile(); }); + py::class_(m, "VarBase", R"DOC()DOC") .def( py::init