提交 42e96a02 编写于 作者: M minqiyang

Accelerate CPU part

上级 4e8c03bd
......@@ -37,6 +37,16 @@ if(WIN32)
set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}")
# else()
# set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> --target elf64-x86-64 cr <TARGET> <LINK_FLAGS> <OBJECTS>")
# set(CMAKE_C_ARCHIVE_APPEND "<CMAKE_AR> --target elf64-x86-64 r <TARGET> <LINK_FLAGS> <OBJECTS>")
# # set(CMAKE_C_ARCHIVE_FINISH "<CMAKE_RANLIB> --enable-64-bit-archive <TARGET>")
# set(CMAKE_CXX_ARCHIVE_CREATE ${CMAKE_C_ARCHIVE_CREATE})
# set(CMAKE_CXX_ARCHIVE_APPEND ${CMAKE_C_ARCHIVE_APPEND})
# # set(CMAKE_CXX_ARCHIVE_FINISH ${CMAKE_C_ARCHIVE_FINISH})
# set(CMAKE_Fortran_ARCHIVE_CREATE ${CMAKE_C_ARCHIVE_CREATE})
# set(CMAKE_Fortran_ARCHIVE_APPEND ${CMAKE_C_ARCHIVE_APPEND})
# # set(CMAKE_Fortran_ARCHIVE_FINISH ${CMAKE_C_ARCHIVE_FINISH})
endif(WIN32)
find_package(CUDA QUIET)
......
......@@ -55,7 +55,10 @@ class GradOpDescMakerBase {
std::back_inserter(ret_val),
[this](const std::string& fwd_var_name) -> std::string {
auto g_name = GradVarName(fwd_var_name);
if (no_grad_set_.count(g_name)) {
if (no_grad_set_.empty()) {
(*this->grad_to_var_)[g_name] = fwd_var_name;
return g_name;
} else if (no_grad_set_.count(g_name)) {
return kEmptyVarName;
} else {
(*this->grad_to_var_)[g_name] = fwd_var_name;
......
......@@ -2,4 +2,5 @@ if(WITH_PYTHON)
cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind)
cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind)
cc_library(engine SRCS engine.cc)
cc_library(imperative_profiler SRCS profiler.cc)
endif()
......@@ -239,7 +239,7 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
VLOG(3) << "apply grad op " << grad_op_desc->Type();
// Allocate tmp grad output variable
for (auto it : grad_output_variable_map) {
for (const auto& it : grad_output_variable_map) {
auto& outputs = tmp_grad_outputs[k][it.first];
outputs.reserve(it.second.size());
for (size_t i = 0; i < it.second.size(); ++i) {
......@@ -273,9 +273,9 @@ std::map<std::string, std::vector<VarBase*>> OpBase::ApplyGrad() {
// Add tmp grad outputs to original grad vars
for (size_t k = 0; k < grad_output_vars_.size(); ++k) {
for (auto it : grad_output_vars_[k]) {
for (const auto& it : grad_output_vars_[k]) {
auto& outputs = tmp_grad_outputs[k][it.first];
auto& origin_outputs = it.second;
const auto& origin_outputs = it.second;
PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size());
for (size_t i = 0; i < outputs.size(); ++i) {
......
......@@ -294,17 +294,23 @@ class PYBIND11_HIDDEN OpBase {
void InvokeBackwardHooks();
void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) {
void TrackPreOp(const std::string& inp_name,
const std::vector<VarBase*>& inputs) {
auto& pre_ops_list = pre_ops_[inp_name];
pre_ops_list.reserve(inputs.size());
auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name];
for (VarBase* inp_var : inputs) {
if (inp_var->PreOp() && !inp_var->IsStopGradient()) {
VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot "
<< inp_name;
pre_ops_[inp_name].push_back(inp_var->PreOp());
pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx());
pre_ops_list.emplace_back(inp_var->PreOp());
pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx());
} else {
VLOG(3) << "no pre op in slot " << inp_name
<< " input var stop_gradient: " << inp_var->IsStopGradient();
pre_ops_[inp_name].push_back(nullptr);
// pre_ops_out_idx_[inp_name].push_back(-1);
pre_ops_list.emplace_back(nullptr);
// pre_ops_out_idx_list.push_back(-1);
}
}
}
......
......@@ -23,23 +23,9 @@
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif
DEFINE_string(
tracer_profile_fname, "",
"Profiler filename for imperative tracer, which generated by gperftools."
"Only valid when compiled `WITH_PROFILER=ON`. Empty if disable.");
namespace paddle {
namespace imperative {
static std::once_flag gTracerProfileOnce;
#ifdef WITH_GPERFTOOLS
static bool gTracerProfilerStarted = false;
#endif
void CreateGradOp(const framework::OpDesc& op_desc,
const std::unordered_set<std::string>& no_grad_set,
const std::vector<framework::BlockDesc*>& grad_sub_block,
......@@ -146,17 +132,6 @@ framework::VariableNameMap CreateOutputVarNameMap(
}
Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {
if (!FLAGS_tracer_profile_fname.empty()) {
std::call_once(gTracerProfileOnce, [] {
#ifdef WITH_GPERFTOOLS
ProfilerStart(FLAGS_tracer_profile_fname.c_str());
gTracerProfilerStarted = true;
#else
LOG(WARNING) << "Paddle is not compiled with gperftools. "
"FLAGS_tracer_profile_fname will be ignored";
#endif
});
}
}
std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
......@@ -164,12 +139,6 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
framework::AttributeMap attrs_map,
const platform::Place expected_place,
const bool stop_gradient) {
#ifdef WITH_GPERFTOOLS
if (gTracerProfilerStarted) {
ProfilerFlush();
}
#endif
framework::VariableValueMap invars_map;
framework::VariableValueMap outvars_map;
......@@ -184,7 +153,6 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
inp->Name());
invars.emplace_back(inp->var_);
op->TrackPreOp(inp, it.first);
if (!stop_gradient) {
current_vars_map[inp->Name()] = inp;
}
......@@ -192,6 +160,7 @@ std::set<std::string> Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs,
<< " inited: " << inp->var_->IsInitialized()
<< " stop_grad: " << inp->IsStopGradient();
}
op->TrackPreOp(it.first, it.second);
}
op->output_vars_ = outputs;
......@@ -319,9 +288,7 @@ std::vector<VarBase*> Tracer::PyTrace(OpBase* op,
std::vector<framework::Variable*> ret_vars =
PyLayer::Apply(op->forward_id_, inputs);
for (VarBase* inp : inputs) {
op->TrackPreOp(inp, PyLayer::kFwdInp);
}
op->TrackPreOp(PyLayer::kFwdInp, inputs);
std::vector<VarBase*>& outputs = op->output_vars_[PyLayer::kFwdOut];
outputs.reserve(ret_vars.size());
......
......@@ -56,6 +56,10 @@ class Tracer {
std::vector<VarBase*> PyTrace(OpBase* op, const std::vector<VarBase*>& inputs,
bool stop_gradient = false);
static void StartProfile();
static void StopProfile();
private:
platform::Place GetPlace(const VarBasePtrMap& inputs);
......
......@@ -29,6 +29,7 @@ endif(WIN32)
if(WIN32)
sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
else(WIN32)
#set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> --target elf64-x86-64 cr paddle_fluid_origin ${fluid_modules} paddle_fluid_api")
cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
endif(WIN32)
......
set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune
feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
tracer analysis_predictor)
tracer analysis_predictor imperative_profiler)
if(WITH_PYTHON)
list(APPEND PYBIND_DEPS py_func_op)
......
......@@ -36,6 +36,7 @@ limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/version.h"
#include "paddle/fluid/imperative/layer.h"
#include "paddle/fluid/imperative/profiler.h"
#include "paddle/fluid/memory/allocation/allocator_strategy.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/operators/activation_op.h"
......@@ -148,6 +149,12 @@ PYBIND11_MODULE(core, m) {
m.def("print_mem_usage",
[]() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); });
m.def("start_imperative_profiler",
[]() { imperative::StartProfile(); });
m.def("stop_imperative_profiler",
[]() { imperative::StopProfile(); });
py::class_<imperative::VarBase>(m, "VarBase", R"DOC()DOC")
.def(
py::init<const std::string &, paddle::framework::proto::VarType::Type,
......
......@@ -132,7 +132,8 @@ def __bootstrap__():
'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'inner_op_parallelism', 'enable_parallel_graph',
'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize'
'multiple_of_cupti_buffer_size', 'enable_subgraph_optimize',
'tracer_profile_fname'
]
if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory')
......
......@@ -290,6 +290,7 @@ class Variable(object):
dtype='float32')
"""
# @profile
def __init__(self,
block,
type=core.VarDesc.VarType.LOD_TENSOR,
......@@ -645,6 +646,7 @@ class Operator(object):
'checkpoint_notify', 'gen_nccl_id'
}
# @profile
def __init__(self,
block,
desc,
......@@ -1239,6 +1241,7 @@ class Block(object):
return (item[1] for item in six.iteritems(self.vars)
if isinstance(item[1], Parameter))
# @profile
def create_var(self, *args, **kwargs):
var = Variable(block=self, *args, **kwargs)
if 'initializer' in kwargs:
......@@ -1347,6 +1350,7 @@ class Block(object):
initializer(param, self)
return param
# @profile
def append_op(self, *args, **kwargs):
"""
Appends a new Operator according to the giving arguments.
......
......@@ -26,8 +26,12 @@ from .nn import *
from . import tracer
from .tracer import *
from . import profiler
from .profiler import *
__all__ = []
__all__ += layers.__all__
__all__ += base.__all__
__all__ += nn.__all__
__all__ += tracer.__all__
__all__ += profiler.__all__
......@@ -97,6 +97,7 @@ class Conv2D(layers.Layer):
dtype=self._dtype,
is_bias=True)
# @profile
def forward(self, input):
pre_bias = self._helper.create_variable_for_type_inference(
dtype=self._dtype)
......@@ -169,6 +170,7 @@ class Pool2D(layers.Layer):
self._exclusive = exclusive
self._l_type = 'pool2d'
# @profile
def forward(self, input):
pool_out = self._helper.create_variable_for_type_inference(self._dtype)
......@@ -229,6 +231,7 @@ class FC(layers.Layer):
else:
self._b = None
# @profile
def forward(self, input):
tmp = self._helper.create_variable_for_type_inference(self._dtype)
self._helper.append_op(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册