未验证 提交 8e4ad008 编写于 作者: Q Qiyang Min 提交者: GitHub

Merge pull request #16198 from velconia/imperative_train_speed

Improve imperative mode training speed
...@@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " ...@@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
"${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
message(STATUS "AR tools: ${CMAKE_AR}")
if(WIN32) if(WIN32)
set(CMAKE_SUPPRESS_REGENERATION ON) set(CMAKE_SUPPRESS_REGENERATION ON)
set(CMAKE_STATIC_LIBRARY_PREFIX lib) set(CMAKE_STATIC_LIBRARY_PREFIX lib)
......
...@@ -14,7 +14,9 @@ limitations under the License. */ ...@@ -14,7 +14,9 @@ limitations under the License. */
#pragma once #pragma once
#include <algorithm> #include <algorithm>
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/op_desc.h"
...@@ -55,11 +57,11 @@ class GradOpDescMakerBase { ...@@ -55,11 +57,11 @@ class GradOpDescMakerBase {
std::back_inserter(ret_val), std::back_inserter(ret_val),
[this](const std::string& fwd_var_name) -> std::string { [this](const std::string& fwd_var_name) -> std::string {
auto g_name = GradVarName(fwd_var_name); auto g_name = GradVarName(fwd_var_name);
if (no_grad_set_.count(g_name)) { if (no_grad_set_.empty() || !no_grad_set_.count(g_name)) {
return kEmptyVarName;
} else {
(*this->grad_to_var_)[g_name] = fwd_var_name; (*this->grad_to_var_)[g_name] = fwd_var_name;
return g_name; return g_name;
} else {
return kEmptyVarName;
} }
}); });
if (!drop_empty_grad) { if (!drop_empty_grad) {
......
...@@ -2,4 +2,5 @@ if(WITH_PYTHON) ...@@ -2,4 +2,5 @@ if(WITH_PYTHON)
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind) cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
cc_library(engine SRCS engine.cc) cc_library(engine SRCS engine.cc)
cc_library(imperative_profiler SRCS profiler.cc)
endif() endif()
...@@ -239,7 +239,7 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { ...@@ -239,7 +239,7 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
VLOG(3) << "apply grad op " << grad_op_desc->Type(); VLOG(3) << "apply grad op " << grad_op_desc->Type();
// Allocate tmp grad output variable // Allocate tmp grad output variable
for (auto it : grad_output_variable_map) { for (const auto& it : grad_output_variable_map) {
auto& outputs = tmp_grad_outputs[k][it.first]; auto& outputs = tmp_grad_outputs[k][it.first];
outputs.reserve(it.second.size()); outputs.reserve(it.second.size());
for (size_t i = 0; i < it.second.size(); ++i) { for (size_t i = 0; i < it.second.size(); ++i) {
...@@ -273,9 +273,9 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { ...@@ -273,9 +273,9 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
// Add tmp grad outputs to original grad vars // Add tmp grad outputs to original grad vars
for (size_t k = 0; k < grad_output_vars_.size(); ++k) { for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
for (auto it : grad_output_vars_[k]) { for (const auto& it : grad_output_vars_[k]) {
auto& outputs = tmp_grad_outputs[k][it.first]; auto& outputs = tmp_grad_outputs[k][it.first];
auto& origin_outputs = it.second; const auto& origin_outputs = it.second;
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
for (size_t i = 0; i < outputs.size(); ++i) { for (size_t i = 0; i < outputs.size(); ++i) {
......
...@@ -294,17 +294,23 @@ class PYBIND11_HIDDEN OpBase { ...@@ -294,17 +294,23 @@ class PYBIND11_HIDDEN OpBase {
void InvokeBackwardHooks(); void InvokeBackwardHooks();
void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) { void TrackPreOp(const std::string& inp_name,
if (inp_var->PreOp() && !inp_var->IsStopGradient()) { const std::vector<VarBase*>& inputs) {
VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " auto& pre_ops_list = pre_ops_[inp_name];
<< inp_name; pre_ops_list.reserve(inputs.size());
pre_ops_[inp_name].push_back(inp_var->PreOp()); auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name];
pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx()); for (VarBase* inp_var : inputs) {
} else { if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
VLOG(3) << "no pre op in slot " << inp_name VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
<< " input var stop_gradient: " << inp_var->IsStopGradient(); << inp_name;
pre_ops_[inp_name].push_back(nullptr); pre_ops_list.emplace_back(inp_var->PreOp());
// pre_ops_out_idx_[inp_name].push_back(-1); pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx());
} else {
VLOG(3) << "no pre op in slot " << inp_name
<< " input var stop_gradient: " << inp_var->IsStopGradient();
pre_ops_list.emplace_back(nullptr);
// pre_ops_out_idx_list.push_back(-1);
}
} }
} }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/imperative/profiler.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
#include <gflags/gflags.h>
#include <glog/logging.h>
#include <mutex> // NOLINT
#include <thread> // NOLINT
DEFINE_string(
tracer_profile_fname, "xxgperf",
"Profiler filename for imperative tracer, which generated by gperftools."
"Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
namespace paddle {
namespace imperative {
static std::once_flag gTracerProfileOnce;
#ifdef WITH_GPERFTOOLS
static bool gTracerProfilerStarted = false;
#endif
void StartProfile() {
if (!FLAGS_tracer_profile_fname.empty()) {
std::call_once(gTracerProfileOnce, [] {
#ifdef WITH_GPERFTOOLS
ProfilerStart(FLAGS_tracer_profile_fname.c_str());
gTracerProfilerStarted = true;
#else
LOG(WARNING) << "Paddle is not compiled with gperftools. "
"FLAGS_tracer_profile_fname will be ignored";
#endif
});
}
}
void StopProfile() {
#ifdef WITH_GPERFTOOLS
ProfilerFlush();
#else
LOG(WARNING) << "Paddle is not compiled with gperftools. "
"FLAGS_tracer_profile_fname will be ignored";
#endif
}
} // namespace imperative
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
namespace paddle {
namespace imperative {
extern void StartProfile();
extern void StopProfile();
} // namespace imperative
} // namespace paddle
...@@ -23,23 +23,9 @@ ...@@ -23,23 +23,9 @@
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
DEFINE_string(
tracer_profile_fname, "",
"Profiler filename for imperative tracer, which generated by gperftools."
"Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
static std::once_flag gTracerProfileOnce;
#ifdef WITH_GPERFTOOLS
static bool gTracerProfilerStarted = false;
#endif
void CreateGradOp(const framework::OpDesc& op_desc, void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set, const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block, const std::vector<framework::BlockDesc*>& grad_sub_block,
...@@ -145,31 +131,13 @@ framework::VariableNameMap CreateOutputVarNameMap( ...@@ -145,31 +131,13 @@ framework::VariableNameMap CreateOutputVarNameMap(
return result; return result;
} }
Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {}
if (!FLAGS_tracer_profile_fname.empty()) {
std::call_once(gTracerProfileOnce, [] {
#ifdef WITH_GPERFTOOLS
ProfilerStart(FLAGS_tracer_profile_fname.c_str());
gTracerProfilerStarted = true;
#else
LOG(WARNING) << "Paddle is not compiled with gperftools. "
"FLAGS_tracer_profile_fname will be ignored";
#endif
});
}
}
std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
const VarBasePtrMap& outputs, const VarBasePtrMap& outputs,
framework::AttributeMap attrs_map, framework::AttributeMap attrs_map,
const platform::Place expected_place, const platform::Place expected_place,
const bool stop_gradient) { const bool stop_gradient) {
#ifdef WITH_GPERFTOOLS
if (gTracerProfilerStarted) {
ProfilerFlush();
}
#endif
framework::VariableValueMap invars_map; framework::VariableValueMap invars_map;
framework::VariableValueMap outvars_map; framework::VariableValueMap outvars_map;
...@@ -184,7 +152,6 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -184,7 +152,6 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
inp->Name()); inp->Name());
invars.emplace_back(inp->var_); invars.emplace_back(inp->var_);
op->TrackPreOp(inp, it.first);
if (!stop_gradient) { if (!stop_gradient) {
current_vars_map[inp->Name()] = inp; current_vars_map[inp->Name()] = inp;
} }
...@@ -192,6 +159,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -192,6 +159,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
<< " inited: " << inp->var_->IsInitialized() << " inited: " << inp->var_->IsInitialized()
<< " stop_grad: " << inp->IsStopGradient(); << " stop_grad: " << inp->IsStopGradient();
} }
op->TrackPreOp(it.first, it.second);
} }
op->output_vars_ = outputs; op->output_vars_ = outputs;
...@@ -319,9 +287,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op, ...@@ -319,9 +287,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
std::vector<framework::Variable*> ret_vars = std::vector<framework::Variable*> ret_vars =
PyLayer::Apply(op->forward_id_, inputs); PyLayer::Apply(op->forward_id_, inputs);
for (VarBase* inp : inputs) { op->TrackPreOp(PyLayer::kFwdInp, inputs);
op->TrackPreOp(inp, PyLayer::kFwdInp);
}
std::vector<VarBase*>& outputs = op->output_vars_[PyLayer::kFwdOut]; std::vector<VarBase*>& outputs = op->output_vars_[PyLayer::kFwdOut];
outputs.reserve(ret_vars.size()); outputs.reserve(ret_vars.size());
......
...@@ -91,5 +91,5 @@ if(WITH_TESTING) ...@@ -91,5 +91,5 @@ if(WITH_TESTING)
add_subdirectory(tests/book) add_subdirectory(tests/book)
if(WITH_INFERENCE_API_TEST) if(WITH_INFERENCE_API_TEST)
add_subdirectory(tests/api) add_subdirectory(tests/api)
endif() endif()
endif() endif()
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune
feed_fetch_method pass_builder parallel_executor profiler layer scope_pool feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
tracer analysis_predictor) tracer analysis_predictor imperative_profiler)
if(WITH_PYTHON) if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op) list(APPEND PYBIND_DEPS py_func_op)
......
...@@ -42,6 +42,7 @@ void BindTracer(pybind11::module* m) { ...@@ -42,6 +42,7 @@ void BindTracer(pybind11::module* m) {
framework::AttributeMap attrs_map, framework::AttributeMap attrs_map,
const platform::CPUPlace expected_place, const platform::CPUPlace expected_place,
const bool stop_gradient = false) { const bool stop_gradient = false) {
pybind11::gil_scoped_release release;
return self.Trace(op, inputs, outputs, attrs_map, expected_place, return self.Trace(op, inputs, outputs, attrs_map, expected_place,
stop_gradient); stop_gradient);
}) })
...@@ -52,6 +53,7 @@ void BindTracer(pybind11::module* m) { ...@@ -52,6 +53,7 @@ void BindTracer(pybind11::module* m) {
framework::AttributeMap attrs_map, framework::AttributeMap attrs_map,
const platform::CUDAPlace expected_place, const platform::CUDAPlace expected_place,
const bool stop_gradient = false) { const bool stop_gradient = false) {
pybind11::gil_scoped_release release;
return self.Trace(op, inputs, outputs, attrs_map, expected_place, return self.Trace(op, inputs, outputs, attrs_map, expected_place,
stop_gradient); stop_gradient);
}) })
......
...@@ -36,6 +36,7 @@ limitations under the License. */ ...@@ -36,6 +36,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/version.h" #include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/profiler.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
...@@ -156,6 +157,11 @@ PYBIND11_MODULE(core, m) { ...@@ -156,6 +157,11 @@ PYBIND11_MODULE(core, m) {
m.def("print_mem_usage", m.def("print_mem_usage",
[]() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); }); []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); });
m.def("start_imperative_gperf_profiler",
[]() { imperative::StartProfile(); });
m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); });
py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC") py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC")
.def( .def(
py::init<const std::string &, paddle::framework::proto::VarType::Type, py::init<const std::string &, paddle::framework::proto::VarType::Type,
......
...@@ -132,7 +132,8 @@ def __bootstrap__(): ...@@ -132,7 +132,8 @@ def __bootstrap__():
'allocator_strategy', 'reader_queue_speed_test_mode', 'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'inner_op_parallelism', 'enable_parallel_graph', 'inner_op_parallelism', 'enable_parallel_graph',
'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize' 'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize',
'tracer_profile_fname'
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory') read_env_flags.append('use_pinned_memory')
......
...@@ -26,8 +26,12 @@ from .nn import * ...@@ -26,8 +26,12 @@ from .nn import *
from . import tracer from . import tracer
from .tracer import * from .tracer import *
from . import profiler
from .profiler import *
__all__ = [] __all__ = []
__all__ += layers.__all__ __all__ += layers.__all__
__all__ += base.__all__ __all__ += base.__all__
__all__ += nn.__all__ __all__ += nn.__all__
__all__ += tracer.__all__ __all__ += tracer.__all__
__all__ += profiler.__all__
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from .. import core
__all__ = [
'start_gperf_profiler',
'stop_gperf_profiler',
]
def start_gperf_profiler():
core.start_imperative_gperf_profiler()
def stop_gperf_profiler():
core.stop_imperative_gperf_profiler()
...@@ -153,3 +153,9 @@ done ...@@ -153,3 +153,9 @@ done
# Restore LD_LIBRARY_PATH # Restore LD_LIBRARY_PATH
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}" LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}"
# According to ar issues: https://lists.gnu.org/archive/html/bug-binutils/2016-05/msg00211.html
# we should install new version ar with 64-bit supported here
wget https://ftp.gnu.org/gnu/binutils/binutils-2.27.tar.gz
tar xzf binutils-2.27.tar.gz && cd binutils-2.27
./configure --prefix=/opt/rh/devtoolset-2/root/usr/ --enable-64-bit-archive && make -j `nproc` && make install
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册