提交 42e96a02 编写于 作者: M minqiyang

Accelerate CPU part

上级 4e8c03bd
...@@ -37,6 +37,16 @@ if(WIN32) ...@@ -37,6 +37,16 @@ if(WIN32)
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
# else()
# set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> --target elf64-x86-64 cr <TARGET> <LINK_FLAGS> <OBJECTS>")
# set(CMAKE_C_ARCHIVE_APPEND "<CMAKE_AR> --target elf64-x86-64 r <TARGET> <LINK_FLAGS> <OBJECTS>")
# # set(CMAKE_C_ARCHIVE_FINISH "<CMAKE_RANLIB> --enable-64-bit-archive <TARGET>")
# set(CMAKE_CXX_ARCHIVE_CREATE ${CMAKE_C_ARCHIVE_CREATE})
# set(CMAKE_CXX_ARCHIVE_APPEND ${CMAKE_C_ARCHIVE_APPEND})
# # set(CMAKE_CXX_ARCHIVE_FINISH ${CMAKE_C_ARCHIVE_FINISH})
# set(CMAKE_Fortran_ARCHIVE_CREATE ${CMAKE_C_ARCHIVE_CREATE})
# set(CMAKE_Fortran_ARCHIVE_APPEND ${CMAKE_C_ARCHIVE_APPEND})
# # set(CMAKE_Fortran_ARCHIVE_FINISH ${CMAKE_C_ARCHIVE_FINISH})
endif(WIN32) endif(WIN32)
find_package(CUDA QUIET) find_package(CUDA QUIET)
......
...@@ -55,7 +55,10 @@ class GradOpDescMakerBase { ...@@ -55,7 +55,10 @@ class GradOpDescMakerBase {
std::back_inserter(ret_val), std::back_inserter(ret_val),
[this](const std::string& fwd_var_name) -> std::string { [this](const std::string& fwd_var_name) -> std::string {
auto g_name = GradVarName(fwd_var_name); auto g_name = GradVarName(fwd_var_name);
if (no_grad_set_.count(g_name)) { if (no_grad_set_.empty()) {
(*this->grad_to_var_)[g_name] = fwd_var_name;
return g_name;
} else if (no_grad_set_.count(g_name)) {
return kEmptyVarName; return kEmptyVarName;
} else { } else {
(*this->grad_to_var_)[g_name] = fwd_var_name; (*this->grad_to_var_)[g_name] = fwd_var_name;
......
...@@ -2,4 +2,5 @@ if(WITH_PYTHON) ...@@ -2,4 +2,5 @@ if(WITH_PYTHON)
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind) cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
cc_library(engine SRCS engine.cc) cc_library(engine SRCS engine.cc)
cc_library(imperative_profiler SRCS profiler.cc)
endif() endif()
...@@ -239,7 +239,7 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { ...@@ -239,7 +239,7 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
VLOG(3) << "apply grad op " << grad_op_desc->Type(); VLOG(3) << "apply grad op " << grad_op_desc->Type();
// Allocate tmp grad output variable // Allocate tmp grad output variable
for (auto it : grad_output_variable_map) { for (const auto& it : grad_output_variable_map) {
auto& outputs = tmp_grad_outputs[k][it.first]; auto& outputs = tmp_grad_outputs[k][it.first];
outputs.reserve(it.second.size()); outputs.reserve(it.second.size());
for (size_t i = 0; i < it.second.size(); ++i) { for (size_t i = 0; i < it.second.size(); ++i) {
...@@ -273,9 +273,9 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() { ...@@ -273,9 +273,9 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
// Add tmp grad outputs to original grad vars // Add tmp grad outputs to original grad vars
for (size_t k = 0; k < grad_output_vars_.size(); ++k) { for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
for (auto it : grad_output_vars_[k]) { for (const auto& it : grad_output_vars_[k]) {
auto& outputs = tmp_grad_outputs[k][it.first]; auto& outputs = tmp_grad_outputs[k][it.first];
auto& origin_outputs = it.second; const auto& origin_outputs = it.second;
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
for (size_t i = 0; i < outputs.size(); ++i) { for (size_t i = 0; i < outputs.size(); ++i) {
......
...@@ -294,17 +294,23 @@ class PYBIND11_HIDDEN OpBase { ...@@ -294,17 +294,23 @@ class PYBIND11_HIDDEN OpBase {
void InvokeBackwardHooks(); void InvokeBackwardHooks();
void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) { void TrackPreOp(const std::string& inp_name,
const std::vector<VarBase*>& inputs) {
auto& pre_ops_list = pre_ops_[inp_name];
pre_ops_list.reserve(inputs.size());
auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name];
for (VarBase* inp_var : inputs) {
if (inp_var->PreOp() && !inp_var->IsStopGradient()) { if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
<< inp_name; << inp_name;
pre_ops_[inp_name].push_back(inp_var->PreOp()); pre_ops_list.emplace_back(inp_var->PreOp());
pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx()); pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx());
} else { } else {
VLOG(3) << "no pre op in slot " << inp_name VLOG(3) << "no pre op in slot " << inp_name
<< " input var stop_gradient: " << inp_var->IsStopGradient(); << " input var stop_gradient: " << inp_var->IsStopGradient();
pre_ops_[inp_name].push_back(nullptr); pre_ops_list.emplace_back(nullptr);
// pre_ops_out_idx_[inp_name].push_back(-1); // pre_ops_out_idx_list.push_back(-1);
}
} }
} }
......
...@@ -23,23 +23,9 @@ ...@@ -23,23 +23,9 @@
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
DEFINE_string(
tracer_profile_fname, "",
"Profiler filename for imperative tracer, which generated by gperftools."
"Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
namespace paddle { namespace paddle {
namespace imperative { namespace imperative {
static std::once_flag gTracerProfileOnce;
#ifdef WITH_GPERFTOOLS
static bool gTracerProfilerStarted = false;
#endif
void CreateGradOp(const framework::OpDesc& op_desc, void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set, const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block, const std::vector<framework::BlockDesc*>& grad_sub_block,
...@@ -146,17 +132,6 @@ framework::VariableNameMap CreateOutputVarNameMap( ...@@ -146,17 +132,6 @@ framework::VariableNameMap CreateOutputVarNameMap(
} }
Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
if (!FLAGS_tracer_profile_fname.empty()) {
std::call_once(gTracerProfileOnce, [] {
#ifdef WITH_GPERFTOOLS
ProfilerStart(FLAGS_tracer_profile_fname.c_str());
gTracerProfilerStarted = true;
#else
LOG(WARNING) << "Paddle is not compiled with gperftools. "
"FLAGS_tracer_profile_fname will be ignored";
#endif
});
}
} }
std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
...@@ -164,12 +139,6 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -164,12 +139,6 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
framework::AttributeMap attrs_map, framework::AttributeMap attrs_map,
const platform::Place expected_place, const platform::Place expected_place,
const bool stop_gradient) { const bool stop_gradient) {
#ifdef WITH_GPERFTOOLS
if (gTracerProfilerStarted) {
ProfilerFlush();
}
#endif
framework::VariableValueMap invars_map; framework::VariableValueMap invars_map;
framework::VariableValueMap outvars_map; framework::VariableValueMap outvars_map;
...@@ -184,7 +153,6 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -184,7 +153,6 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
inp->Name()); inp->Name());
invars.emplace_back(inp->var_); invars.emplace_back(inp->var_);
op->TrackPreOp(inp, it.first);
if (!stop_gradient) { if (!stop_gradient) {
current_vars_map[inp->Name()] = inp; current_vars_map[inp->Name()] = inp;
} }
...@@ -192,6 +160,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, ...@@ -192,6 +160,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
<< " inited: " << inp->var_->IsInitialized() << " inited: " << inp->var_->IsInitialized()
<< " stop_grad: " << inp->IsStopGradient(); << " stop_grad: " << inp->IsStopGradient();
} }
op->TrackPreOp(it.first, it.second);
} }
op->output_vars_ = outputs; op->output_vars_ = outputs;
...@@ -319,9 +288,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op, ...@@ -319,9 +288,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
std::vector<framework::Variable*> ret_vars = std::vector<framework::Variable*> ret_vars =
PyLayer::Apply(op->forward_id_, inputs); PyLayer::Apply(op->forward_id_, inputs);
for (VarBase* inp : inputs) { op->TrackPreOp(PyLayer::kFwdInp, inputs);
op->TrackPreOp(inp, PyLayer::kFwdInp);
}
std::vector<VarBase*>& outputs = op->output_vars_[PyLayer::kFwdOut]; std::vector<VarBase*>& outputs = op->output_vars_[PyLayer::kFwdOut];
outputs.reserve(ret_vars.size()); outputs.reserve(ret_vars.size());
......
...@@ -56,6 +56,10 @@ class Tracer { ...@@ -56,6 +56,10 @@ class Tracer {
std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs, std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
bool stop_gradient = false); bool stop_gradient = false);
static void StartProfile();
static void StopProfile();
private: private:
platform::Place GetPlace(const VarBasePtrMap& inputs); platform::Place GetPlace(const VarBasePtrMap& inputs);
......
...@@ -29,6 +29,7 @@ endif(WIN32) ...@@ -29,6 +29,7 @@ endif(WIN32)
if(WIN32) if(WIN32)
sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
else(WIN32) else(WIN32)
#set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> --target elf64-x86-64 cr paddle_fluid_origin ${fluid_modules} paddle_fluid_api")
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
endif(WIN32) endif(WIN32)
......
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune
feed_fetch_method pass_builder parallel_executor profiler layer scope_pool feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
tracer analysis_predictor) tracer analysis_predictor imperative_profiler)
if(WITH_PYTHON) if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op) list(APPEND PYBIND_DEPS py_func_op)
......
...@@ -36,6 +36,7 @@ limitations under the License. */ ...@@ -36,6 +36,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/version.h" #include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/profiler.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
...@@ -148,6 +149,12 @@ PYBIND11_MODULE(core, m) { ...@@ -148,6 +149,12 @@ PYBIND11_MODULE(core, m) {
m.def("print_mem_usage", m.def("print_mem_usage",
[]() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); }); []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); });
m.def("start_imperative_profiler",
[]() { imperative::StartProfile(); });
m.def("stop_imperative_profiler",
[]() { imperative::StopProfile(); });
py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC") py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC")
.def( .def(
py::init<const std::string &, paddle::framework::proto::VarType::Type, py::init<const std::string &, paddle::framework::proto::VarType::Type,
......
...@@ -132,7 +132,8 @@ def __bootstrap__(): ...@@ -132,7 +132,8 @@ def __bootstrap__():
'allocator_strategy', 'reader_queue_speed_test_mode', 'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'inner_op_parallelism', 'enable_parallel_graph', 'inner_op_parallelism', 'enable_parallel_graph',
'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize' 'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize',
'tracer_profile_fname'
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory') read_env_flags.append('use_pinned_memory')
......
...@@ -290,6 +290,7 @@ class Variable(object): ...@@ -290,6 +290,7 @@ class Variable(object):
dtype='float32') dtype='float32')
""" """
# @profile
def __init__(self, def __init__(self,
block, block,
type=core.VarDesc.VarType.LOD_TENSOR, type=core.VarDesc.VarType.LOD_TENSOR,
...@@ -645,6 +646,7 @@ class Operator(object): ...@@ -645,6 +646,7 @@ class Operator(object):
'checkpoint_notify', 'gen_nccl_id' 'checkpoint_notify', 'gen_nccl_id'
} }
# @profile
def __init__(self, def __init__(self,
block, block,
desc, desc,
...@@ -1239,6 +1241,7 @@ class Block(object): ...@@ -1239,6 +1241,7 @@ class Block(object):
return (item[1] for item in six.iteritems(self.vars) return (item[1] for item in six.iteritems(self.vars)
if isinstance(item[1], Parameter)) if isinstance(item[1], Parameter))
# @profile
def create_var(self, *args, **kwargs): def create_var(self, *args, **kwargs):
var = Variable(block=self, *args, **kwargs) var = Variable(block=self, *args, **kwargs)
if 'initializer' in kwargs: if 'initializer' in kwargs:
...@@ -1347,6 +1350,7 @@ class Block(object): ...@@ -1347,6 +1350,7 @@ class Block(object):
initializer(param, self) initializer(param, self)
return param return param
# @profile
def append_op(self, *args, **kwargs): def append_op(self, *args, **kwargs):
""" """
Appends a new Operator according to the giving arguments. Appends a new Operator according to the giving arguments.
......
...@@ -26,8 +26,12 @@ from .nn import * ...@@ -26,8 +26,12 @@ from .nn import *
from . import tracer from . import tracer
from .tracer import * from .tracer import *
from . import profiler
from .profiler import *
__all__ = [] __all__ = []
__all__ += layers.__all__ __all__ += layers.__all__
__all__ += base.__all__ __all__ += base.__all__
__all__ += nn.__all__ __all__ += nn.__all__
__all__ += tracer.__all__ __all__ += tracer.__all__
__all__ += profiler.__all__
...@@ -97,6 +97,7 @@ class Conv2D(layers.Layer): ...@@ -97,6 +97,7 @@ class Conv2D(layers.Layer):
dtype=self._dtype, dtype=self._dtype,
is_bias=True) is_bias=True)
# @profile
def forward(self, input): def forward(self, input):
pre_bias = self._helper.create_variable_for_type_inference( pre_bias = self._helper.create_variable_for_type_inference(
dtype=self._dtype) dtype=self._dtype)
...@@ -169,6 +170,7 @@ class Pool2D(layers.Layer): ...@@ -169,6 +170,7 @@ class Pool2D(layers.Layer):
self._exclusive = exclusive self._exclusive = exclusive
self._l_type = 'pool2d' self._l_type = 'pool2d'
# @profile
def forward(self, input): def forward(self, input):
pool_out = self._helper.create_variable_for_type_inference(self._dtype) pool_out = self._helper.create_variable_for_type_inference(self._dtype)
...@@ -229,6 +231,7 @@ class FC(layers.Layer): ...@@ -229,6 +231,7 @@ class FC(layers.Layer):
else: else:
self._b = None self._b = None
# @profile
def forward(self, input): def forward(self, input):
tmp = self._helper.create_variable_for_type_inference(self._dtype) tmp = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op( self._helper.append_op(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册