提交 b2898c0f 编写于 作者: L luotao1

Merge branch 'develop' into runtime_context

test=develop
...@@ -293,6 +293,7 @@ paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords= ...@@ -293,6 +293,7 @@ paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=
paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3')) paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b')) paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6')) paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6'))
paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9'))
paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7')) paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13')) paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27')) paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27'))
...@@ -300,6 +301,8 @@ paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None ...@@ -300,6 +301,8 @@ paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None
paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad')) paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973')) paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2')) paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2'))
paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '920a47734482276c069ba24c61c26b25'))
paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf4ee2c9b9d7293556f8c5173dfb5d2c'))
paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4')) paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4'))
paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2')) paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2'))
paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26')) paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26'))
......
...@@ -34,11 +34,11 @@ limitations under the License. */ ...@@ -34,11 +34,11 @@ limitations under the License. */
#ifdef PADDLE_WITH_NGRAPH #ifdef PADDLE_WITH_NGRAPH
#include "paddle/fluid/operators/ngraph/ngraph_engine.h" #include "paddle/fluid/operators/ngraph/ngraph_engine.h"
DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
#endif #endif
DECLARE_bool(benchmark); DECLARE_bool(benchmark);
DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run"); DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -194,9 +194,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -194,9 +194,6 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool force_disable_gc) { bool force_disable_gc) {
platform::RecordBlock b(block_id); platform::RecordBlock b(block_id);
if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc);
#ifdef PADDLE_WITH_NGRAPH
if (FLAGS_use_ngraph) operators::NgraphEngine::EnableNgraph(pdesc);
#endif
auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc);
RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars);
} }
...@@ -372,6 +369,12 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare( ...@@ -372,6 +369,12 @@ std::unique_ptr<ExecutorPrepareContext> Executor::Prepare(
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc));
} }
#ifdef PADDLE_WITH_NGRAPH
if (FLAGS_use_ngraph) {
paddle::operators::NgraphEngine::FuseNgraphOps(
ctx->prog_.Block(ctx->block_id_), &ctx->ops_);
}
#endif
return ctx; return ctx;
} }
......
...@@ -934,8 +934,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -934,8 +934,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
dev_ctx = pool.Get(expected_kernel_key.place_); dev_ctx = pool.Get(expected_kernel_key.place_);
} }
if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx_); RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx_);
this->InferShape(&infer_shape_ctx); this->InferShape(&infer_shape_ctx);
}
// TODO(panyx0718): ExecutionContext should only depend on RuntimeContext // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
// not Scope. Imperative mode only pass inputs and get outputs. // not Scope. Imperative mode only pass inputs and get outputs.
kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx,
......
...@@ -70,6 +70,15 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@"; ...@@ -70,6 +70,15 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@";
/// execution to save the elapsed time. /// execution to save the elapsed time.
constexpr char kEnableRuntimeContext[] = "@ENABLE_RUNTIME_CONTEXT@"; constexpr char kEnableRuntimeContext[] = "@ENABLE_RUNTIME_CONTEXT@";
/// If an Op has this attribute, all its kernels should calculate output
/// variable's shape in the corresponding Compute() function. And
/// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
/// function in its runtime for speedup.
/// TODO(luotao): Note that this temporal attribute would be deleted after all
/// ops contain it.
constexpr char kAllKernelsMustComputeRuntimeShape[] =
"@ALL_KERNELS_MUST_COMPUTE_RUNTIME_SHAPE@";
// define some kernel priority // define some kernel priority
/* Define multiple kernel type fallback order*/ /* Define multiple kernel type fallback order*/
extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority; extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
......
...@@ -181,12 +181,13 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() { ...@@ -181,12 +181,13 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
return member_->local_scopes_; return member_->local_scopes_;
} }
ParallelExecutor::ParallelExecutor( ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const std::vector<platform::Place> &places, const std::vector<std::string> &bcast_vars,
const std::unordered_set<std::string> &bcast_vars, const std::string &loss_var_name,
const std::string &loss_var_name, Scope *scope, Scope *scope,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy,
ir::Graph *graph) ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places)) { : member_(new ParallelExecutorPrivate(places)) {
member_->global_scope_ = scope; member_->global_scope_ = scope;
...@@ -254,9 +255,23 @@ ParallelExecutor::ParallelExecutor( ...@@ -254,9 +255,23 @@ ParallelExecutor::ParallelExecutor(
PADDLE_THROW("Not compiled with CUDA"); PADDLE_THROW("Not compiled with CUDA");
#endif #endif
} }
if (member_->local_scopes_.size() != 1 && local_scopes.empty()) { // broadcast parameters from the 0th device to others:
BCastParamsToDevices(bcast_vars); auto need_broadcast = [&]() -> bool {
if (build_strategy.num_trainers_ > 1) {
// 1. num_tariners would be grater than 1 for nccl distributed training.
return true;
} else if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
// 2. Only one trainer process, but ParallelExecutor hold multiple
// devices.
return true;
} }
return false;
};
if (need_broadcast()) {
BCastParamsToDevices(bcast_vars, build_strategy.trainer_id_);
}
// Startup Program has been run. All local scopes has correct parameters. // Startup Program has been run. All local scopes has correct parameters.
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
...@@ -338,7 +353,7 @@ ParallelExecutor::ParallelExecutor( ...@@ -338,7 +353,7 @@ ParallelExecutor::ParallelExecutor(
} }
void ParallelExecutor::BCastParamsToDevices( void ParallelExecutor::BCastParamsToDevices(
const std::unordered_set<std::string> &vars) const { const std::vector<std::string> &vars, int trainer_id) const {
// the initializing bcast, all vars would be bcast from device(0). // the initializing bcast, all vars would be bcast from device(0).
for (auto &var : vars) { for (auto &var : vars) {
framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var);
...@@ -362,7 +377,7 @@ void ParallelExecutor::BCastParamsToDevices( ...@@ -362,7 +377,7 @@ void ParallelExecutor::BCastParamsToDevices(
auto place = member_->places_[i]; auto place = member_->places_[i];
void *buffer; void *buffer;
if (i == 0) { if (i == 0 && trainer_id == 0) {
buffer = const_cast<void *>(main_tensor.data<void>()); buffer = const_cast<void *>(main_tensor.data<void>());
} else { } else {
auto local_scope = member_->local_scopes_[i]; auto local_scope = member_->local_scopes_[i];
......
...@@ -14,9 +14,11 @@ limitations under the License. */ ...@@ -14,9 +14,11 @@ limitations under the License. */
#pragma once #pragma once
#include <memory>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/build_strategy.h"
...@@ -45,7 +47,7 @@ class ParallelExecutor { ...@@ -45,7 +47,7 @@ class ParallelExecutor {
public: public:
explicit ParallelExecutor(const std::vector<platform::Place> &places, explicit ParallelExecutor(const std::vector<platform::Place> &places,
const std::unordered_set<std::string> &bcast_vars, const std::vector<std::string> &bcast_vars,
const std::string &loss_var_name, Scope *scope, const std::string &loss_var_name, Scope *scope,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const ExecutionStrategy &exec_strategy, const ExecutionStrategy &exec_strategy,
...@@ -70,7 +72,10 @@ class ParallelExecutor { ...@@ -70,7 +72,10 @@ class ParallelExecutor {
const std::string &fetched_var_name); const std::string &fetched_var_name);
private: private:
void BCastParamsToDevices(const std::unordered_set<std::string> &vars) const; // broadcast the parameters from the 0th device.
// trainer_id the trainer index in nccl distributed training.
void BCastParamsToDevices(const std::vector<std::string> &vars,
int trainer_id = 0) const;
bool EnableParallelGraphExecution(const ir::Graph &graph, bool EnableParallelGraphExecution(const ir::Graph &graph,
const ExecutionStrategy &exec_strategy, const ExecutionStrategy &exec_strategy,
const BuildStrategy &build_strategy) const; const BuildStrategy &build_strategy) const;
......
...@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator) ...@@ -3,7 +3,7 @@ cc_library(cpu_allocator SRCS cpu_allocator.cc DEPS allocator)
cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator)
cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator)
cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator)
cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator) cc_library(legacy_allocator SRCS legacy_allocator.cc DEPS allocator buddy_allocator profiler)
cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS best_fit_allocator locked_allocator buffered_allocator cpu_allocator)
if (WITH_GPU) if (WITH_GPU)
......
...@@ -12,8 +12,6 @@ ...@@ -12,8 +12,6 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include <utility> #include <utility>
...@@ -24,9 +22,11 @@ ...@@ -24,9 +22,11 @@
#endif #endif
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/memory/allocation/legacy_allocator.h"
#include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/buddy_allocator.h"
#include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
#include "paddle/fluid/string/split.h" #include "paddle/fluid/string/split.h"
...@@ -329,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const { ...@@ -329,18 +329,22 @@ size_t Usage::operator()(const platform::CUDAPinnedPlace &cuda_pinned) const {
} // namespace legacy } // namespace legacy
namespace allocation { namespace allocation {
LegacyMemMonitor GPUMemMonitor; LegacyMemMonitor GPUMemMonitor;
Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) { Allocation *LegacyAllocator::AllocateImpl(size_t size, Allocator::Attr attr) {
void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_); void *ptr = boost::apply_visitor(legacy::AllocVisitor(size), place_);
return new Allocation(ptr, size, place_); auto *tmp_alloc = new Allocation(ptr, size, place_);
platform::MemEvenRecorder::Instance().PushMemRecord(
static_cast<void *>(tmp_alloc), place_, size);
return tmp_alloc;
} }
void LegacyAllocator::Free(Allocation *allocation) { void LegacyAllocator::Free(Allocation *allocation) {
boost::apply_visitor( boost::apply_visitor(
legacy::FreeVisitor(allocation->ptr(), allocation->size()), legacy::FreeVisitor(allocation->ptr(), allocation->size()),
allocation->place()); allocation->place());
platform::MemEvenRecorder::Instance().PopMemRecord(
static_cast<void *>(allocation), place_);
delete allocation; delete allocation;
} }
......
...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include <memory>
#include <string> #include <string>
#include <unordered_map>
#include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h" #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -269,6 +271,48 @@ $$out = \\frac{x}{1 + \|x\|}$$ ...@@ -269,6 +271,48 @@ $$out = \\frac{x}{1 + \|x\|}$$
)DOC"; )DOC";
class AcosOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of acos operator");
AddOutput("Out", "Output of acos operator");
AddComment(R"DOC(
Arccosine Activation Operator.
$$out = \cos^{-1}(x)$$
)DOC");
}
};
class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of asin operator");
AddOutput("Out", "Output of asin operator");
AddComment(R"DOC(
Arcsine Activation Operator.
$$out = \sin^{-1}(x)$$
)DOC");
}
};
class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "Input of atan operator");
AddOutput("Out", "Output of atan operator");
AddComment(R"DOC(
Arctanh Activation Operator.
$$out = \tanh^{-1}(x)$$
)DOC");
}
};
class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -543,7 +587,10 @@ namespace ops = paddle::operators; ...@@ -543,7 +587,10 @@ namespace ops = paddle::operators;
__macro(SoftShrink, softshrink); \ __macro(SoftShrink, softshrink); \
__macro(Abs, abs); \ __macro(Abs, abs); \
__macro(Cos, cos); \ __macro(Cos, cos); \
__macro(Acos, acos); \
__macro(Sin, sin); \ __macro(Sin, sin); \
__macro(Asin, asin); \
__macro(Atan, atan); \
__macro(Round, round); \ __macro(Round, round); \
__macro(Log, log); \ __macro(Log, log); \
__macro(Square, square); \ __macro(Square, square); \
......
...@@ -40,8 +40,7 @@ namespace operators { ...@@ -40,8 +40,7 @@ namespace operators {
*/ */
static std::unordered_set<std::string> InplaceOpSet = { static std::unordered_set<std::string> InplaceOpSet = {
"sigmoid", "exp", "relu", "tanh", "sqrt", "ceil", "sigmoid", "exp", "relu", "tanh", "sqrt", "ceil",
"floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid", "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid"};
};
static bool IsInplace(const std::string& op) { static bool IsInplace(const std::string& op) {
bool inplace = InplaceOpSet.count(op); bool inplace = InplaceOpSet.count(op);
...@@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor<T> { ...@@ -553,6 +552,101 @@ struct SinFunctor : public BaseActivationFunctor<T> {
} }
}; };
template <typename T>
struct Acos {
HOSTDEVICE T operator()(const T& val) const { return acos(val); }
};
template <>
struct Acos<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(acos(static_cast<float>(val)));
}
};
// Acos(x) = acos(x)
template <typename T>
struct AcosFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Acos<T>());
}
};
// acos'(x) = -1/sqrt(1-x^2)
template <typename T>
struct AcosGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) =
-dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
}
};
template <typename T>
struct Asin {
HOSTDEVICE T operator()(const T& val) const { return asin(val); }
};
template <>
struct Asin<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(asin(static_cast<float>(val)));
}
};
// Asin(x) = asin(x)
template <typename T>
struct AsinFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Asin<T>());
}
};
// asin'(x) = 1/sqrt(1-x^2)
template <typename T>
struct AsinGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) =
dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
}
};
template <typename T>
struct Atan {
HOSTDEVICE T operator()(const T& val) const { return atan(val); }
};
template <>
struct Atan<platform::float16> {
HOSTDEVICE platform::float16 operator()(const platform::float16& val) const {
return platform::float16(atan(static_cast<float>(val)));
}
};
// Atan(x) = atan(x)
template <typename T>
struct AtanFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.unaryExpr(Atan<T>());
}
};
// atan'(x) = 1 / (1 + x^2)
template <typename T>
struct AtanGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) + x.square());
}
};
// round(x) = [x] // round(x) = [x]
template <typename T> template <typename T>
struct RoundFunctor : public BaseActivationFunctor<T> { struct RoundFunctor : public BaseActivationFunctor<T> {
...@@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -1001,13 +1095,16 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
__macro(relu, ReluFunctor, ReluGradFunctor); \ __macro(relu, ReluFunctor, ReluGradFunctor); \
__macro(gelu, GeluFunctor, GeluGradFunctor); \ __macro(gelu, GeluFunctor, GeluGradFunctor); \
__macro(tanh, TanhFunctor, TanhGradFunctor); \ __macro(tanh, TanhFunctor, TanhGradFunctor); \
__macro(atan, AtanFunctor, AtanGradFunctor); \
__macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \
__macro(sqrt, SqrtFunctor, SqrtGradFunctor); \ __macro(sqrt, SqrtFunctor, SqrtGradFunctor); \
__macro(abs, AbsFunctor, AbsGradFunctor); \ __macro(abs, AbsFunctor, AbsGradFunctor); \
__macro(ceil, CeilFunctor, ZeroGradFunctor); \ __macro(ceil, CeilFunctor, ZeroGradFunctor); \
__macro(floor, FloorFunctor, ZeroGradFunctor); \ __macro(floor, FloorFunctor, ZeroGradFunctor); \
__macro(cos, CosFunctor, CosGradFunctor); \ __macro(cos, CosFunctor, CosGradFunctor); \
__macro(acos, AcosFunctor, AcosGradFunctor); \
__macro(sin, SinFunctor, SinGradFunctor); \ __macro(sin, SinFunctor, SinGradFunctor); \
__macro(asin, AsinFunctor, AsinGradFunctor); \
__macro(round, RoundFunctor, ZeroGradFunctor); \ __macro(round, RoundFunctor, ZeroGradFunctor); \
__macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
__macro(log, LogFunctor, LogGradFunctor); \ __macro(log, LogFunctor, LogGradFunctor); \
......
...@@ -20,7 +20,7 @@ namespace operators { ...@@ -20,7 +20,7 @@ namespace operators {
enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 }; enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
inline BoxCodeType GetBoxCodeType(const std::string& type) { inline BoxCodeType GetBoxCodeType(const std::string &type) {
if (type == "encode_center_size") { if (type == "encode_center_size") {
return BoxCodeType::kEncodeCenterSize; return BoxCodeType::kEncodeCenterSize;
} else if (type == "decode_center_size") { } else if (type == "decode_center_size") {
...@@ -32,24 +32,23 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) { ...@@ -32,24 +32,23 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class BoxCoderKernel : public framework::OpKernel<T> { class BoxCoderKernel : public framework::OpKernel<T> {
public: public:
void EncodeCenterSize(const framework::Tensor* target_box, void EncodeCenterSize(const framework::Tensor *target_box,
const framework::Tensor* prior_box, const framework::Tensor *prior_box,
const framework::Tensor* prior_box_var, const framework::Tensor *prior_box_var,
const bool normalized, const bool normalized,
const std::vector<float> variance, T* output) const { const std::vector<float> variance, T *output) const {
int64_t row = target_box->dims()[0]; int64_t row = target_box->dims()[0];
int64_t col = prior_box->dims()[0]; int64_t col = prior_box->dims()[0];
int64_t len = prior_box->dims()[1]; int64_t len = prior_box->dims()[1];
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = nullptr;
if (prior_box_var) prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
#endif #endif
for (int64_t i = 0; i < row; ++i) { for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) { for (int64_t j = 0; j < col; ++j) {
auto *target_box_data = target_box->data<T>();
auto *prior_box_data = prior_box->data<T>();
size_t offset = i * col * len + j * len;
T prior_box_width = prior_box_data[j * len + 2] - T prior_box_width = prior_box_data[j * len + 2] -
prior_box_data[j * len] + (normalized == false); prior_box_data[j * len] + (normalized == false);
T prior_box_height = prior_box_data[j * len + 3] - T prior_box_height = prior_box_data[j * len + 3] -
...@@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -69,7 +68,6 @@ class BoxCoderKernel : public framework::OpKernel<T> {
target_box_data[i * len + 1] + target_box_data[i * len + 1] +
(normalized == false); (normalized == false);
size_t offset = i * col * len + j * len;
output[offset] = output[offset] =
(target_box_center_x - prior_box_center_x) / prior_box_width; (target_box_center_x - prior_box_center_x) / prior_box_width;
output[offset + 1] = output[offset + 1] =
...@@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -78,44 +76,61 @@ class BoxCoderKernel : public framework::OpKernel<T> {
std::log(std::fabs(target_box_width / prior_box_width)); std::log(std::fabs(target_box_width / prior_box_width));
output[offset + 3] = output[offset + 3] =
std::log(std::fabs(target_box_height / prior_box_height)); std::log(std::fabs(target_box_height / prior_box_height));
}
}
if (prior_box_var) { if (prior_box_var) {
const T *prior_box_var_data = prior_box_var->data<T>();
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int k = 0; k < 4; ++k) {
size_t offset = i * col * len + j * len;
int prior_var_offset = j * len; int prior_var_offset = j * len;
output[offset] /= prior_box_var_data[prior_var_offset]; output[offset + k] /= prior_box_var_data[prior_var_offset + k];
output[offset + 1] /= prior_box_var_data[prior_var_offset + 1]; }
output[offset + 2] /= prior_box_var_data[prior_var_offset + 2]; }
output[offset + 3] /= prior_box_var_data[prior_var_offset + 3]; }
} else if (!(variance.empty())) { } else if (!(variance.empty())) {
#ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(3)
#endif
for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) {
for (int k = 0; k < 4; ++k) { for (int k = 0; k < 4; ++k) {
size_t offset = i * col * len + j * len;
output[offset + k] /= static_cast<T>(variance[k]); output[offset + k] /= static_cast<T>(variance[k]);
} }
} }
} }
} }
} }
template <int axis, int var_size> template <int axis, int var_size>
void DecodeCenterSize(const framework::Tensor* target_box, void DecodeCenterSize(const framework::Tensor *target_box,
const framework::Tensor* prior_box, const framework::Tensor *prior_box,
const framework::Tensor* prior_box_var, const framework::Tensor *prior_box_var,
const bool normalized, std::vector<float> variance, const bool normalized, std::vector<float> variance,
T* output) const { T *output) const {
int64_t row = target_box->dims()[0]; int64_t row = target_box->dims()[0];
int64_t col = target_box->dims()[1]; int64_t col = target_box->dims()[1];
int64_t len = target_box->dims()[2]; int64_t len = target_box->dims()[2];
auto* target_box_data = target_box->data<T>();
auto* prior_box_data = prior_box->data<T>();
const T* prior_box_var_data = nullptr;
if (var_size == 2) prior_box_var_data = prior_box_var->data<T>();
int prior_box_offset = 0;
T var_data[4] = {1., 1., 1., 1.};
T* var_ptr = var_data;
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
#pragma omp parallel for collapse(2) #pragma omp parallel for collapse(2)
#endif #endif
for (int64_t i = 0; i < row; ++i) { for (int64_t i = 0; i < row; ++i) {
for (int64_t j = 0; j < col; ++j) { for (int64_t j = 0; j < col; ++j) {
auto *target_box_data = target_box->data<T>();
auto *prior_box_data = prior_box->data<T>();
T var_data[4] = {1., 1., 1., 1.};
T *var_ptr = var_data;
size_t offset = i * col * len + j * len; size_t offset = i * col * len + j * len;
prior_box_offset = axis == 0 ? j * len : i * len; int prior_box_offset = axis == 0 ? j * len : i * len;
T prior_box_width = prior_box_data[prior_box_offset + 2] - T prior_box_width = prior_box_data[prior_box_offset + 2] -
prior_box_data[prior_box_offset] + prior_box_data[prior_box_offset] +
(normalized == false); (normalized == false);
...@@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -131,10 +146,10 @@ class BoxCoderKernel : public framework::OpKernel<T> {
T target_box_width = 0, target_box_height = 0; T target_box_width = 0, target_box_height = 0;
int prior_var_offset = axis == 0 ? j * len : i * len; int prior_var_offset = axis == 0 ? j * len : i * len;
if (var_size == 2) { if (var_size == 2) {
std::memcpy(var_ptr, prior_box_var_data + prior_var_offset, std::memcpy(var_ptr, prior_box_var->data<T>() + prior_var_offset,
4 * sizeof(T)); 4 * sizeof(T));
} else if (var_size == 1) { } else if (var_size == 1) {
var_ptr = reinterpret_cast<T*>(variance.data()); var_ptr = reinterpret_cast<T *>(variance.data());
} }
T box_var_x = *var_ptr; T box_var_x = *var_ptr;
T box_var_y = *(var_ptr + 1); T box_var_y = *(var_ptr + 1);
...@@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -162,11 +177,11 @@ class BoxCoderKernel : public framework::OpKernel<T> {
} }
} }
void Compute(const framework::ExecutionContext& context) const override { void Compute(const framework::ExecutionContext &context) const override {
auto* prior_box = context.Input<framework::Tensor>("PriorBox"); auto *prior_box = context.Input<framework::Tensor>("PriorBox");
auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar"); auto *prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
auto* target_box = context.Input<framework::LoDTensor>("TargetBox"); auto *target_box = context.Input<framework::LoDTensor>("TargetBox");
auto* output_box = context.Output<framework::Tensor>("OutputBox"); auto *output_box = context.Output<framework::Tensor>("OutputBox");
std::vector<float> variance = context.Attr<std::vector<float>>("variance"); std::vector<float> variance = context.Attr<std::vector<float>>("variance");
const int axis = context.Attr<int>("axis"); const int axis = context.Attr<int>("axis");
if (target_box->lod().size()) { if (target_box->lod().size()) {
...@@ -194,7 +209,7 @@ class BoxCoderKernel : public framework::OpKernel<T> { ...@@ -194,7 +209,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
output_box->mutable_data<T>({row, col, len}, context.GetPlace()); output_box->mutable_data<T>({row, col, len}, context.GetPlace());
T* output = output_box->data<T>(); T *output = output_box->data<T>();
if (code_type == BoxCodeType::kEncodeCenterSize) { if (code_type == BoxCodeType::kEncodeCenterSize) {
EncodeCenterSize(target_box, prior_box, prior_box_var, normalized, EncodeCenterSize(target_box, prior_box, prior_box_var, normalized,
variance, output); variance, output);
......
...@@ -23,9 +23,6 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel { ...@@ -23,9 +23,6 @@ class FusedEmbeddingSeqPoolOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
if (ctx->IsRuntime()) {
return;
}
PADDLE_ENFORCE(ctx->HasInput("W"), PADDLE_ENFORCE(ctx->HasInput("W"),
"Input W of FusedEmbeddingSeqPoolOp should not be null."); "Input W of FusedEmbeddingSeqPoolOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Ids"), PADDLE_ENFORCE(ctx->HasInput("Ids"),
...@@ -91,6 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -91,6 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker {
"(boolean, default false) " "(boolean, default false) "
"Sparse update.") "Sparse update.")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
FusedEmbeddingSeqPool Operator. FusedEmbeddingSeqPool Operator.
......
...@@ -121,6 +121,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> { ...@@ -121,6 +121,8 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
auto *ids = context.Input<LoDTensor>("Ids"); auto *ids = context.Input<LoDTensor>("Ids");
auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out")); auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W")); auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
// runtime shape
d_table->set_height(table_dim[0]);
auto *ids_data = ids->data<int64_t>(); auto *ids_data = ids->data<int64_t>();
int64_t ids_num = ids->numel(); int64_t ids_num = ids->numel();
......
...@@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel { ...@@ -26,9 +26,6 @@ class HashOp : public framework::OperatorWithKernel {
: OperatorWithKernel(type, inputs, outputs, attrs) {} : OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override { void InferShape(framework::InferShapeContext *ctx) const override {
if (ctx->IsRuntime()) {
return;
}
PADDLE_ENFORCE(ctx->HasInput("X"), PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of HashOp should not be null."); "Input(X) of HashOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
...@@ -57,6 +54,8 @@ $$Out = scale * X$$ ...@@ -57,6 +54,8 @@ $$Out = scale * X$$
)DOC"); )DOC");
AddAttr<int>("num_hash", "").SetDefault(1); AddAttr<int>("num_hash", "").SetDefault(1);
AddAttr<int>("mod_by", "").SetDefault(100000); AddAttr<int>("mod_by", "").SetDefault(100000);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
.SetDefault(true);
} }
}; };
......
...@@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
#define PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
#include <memory>
#include <set>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <unordered_set>
#include <vector> #include <vector>
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/var_desc.h"
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
...@@ -33,29 +39,47 @@ enum class OpState { /* nGraph support state on ops */ ...@@ -33,29 +39,47 @@ enum class OpState { /* nGraph support state on ops */
UNKNOWN /* Output all for debug purpose */ UNKNOWN /* Output all for debug purpose */
}; };
// cache engine repetitives
struct EngineCache {
std::shared_ptr<ngraph::Function> ngraph_function;
std::set<std::string> persistables;
std::vector<std::string> var_in;
std::vector<std::string> var_out;
std::vector<size_t> var_in_updates;
bool is_test = true;
};
// perform graph build through bridge and execute computation // perform graph build through bridge and execute computation
class NgraphEngine { class NgraphEngine {
public: public:
explicit NgraphEngine(const framework::Scope& scope, explicit NgraphEngine(const framework::Scope& scope,
const platform::Place& place, const platform::Place& place,
const std::string& serialized_graph, const framework::ExecutionContext& ctx);
const std::vector<int>& interval);
void Run(const framework::Scope& scope, const platform::Place& place) const; void Run(const framework::Scope& scope, const platform::Place& place) const;
static void EnableNgraph(const framework::ProgramDesc& program); static const framework::BlockDesc* p_bdesc;
static std::vector<std::string> feed_vars, fetch_vars;
static void FuseNgraphOps(
const framework::BlockDesc& prog,
std::vector<std::unique_ptr<framework::OperatorBase>>* ops);
private: private:
static std::unordered_map<std::string, std::shared_ptr<ngraph::Function>> static std::unordered_map<std::string, EngineCache> engine_cache;
func_cache_; static std::unordered_map<
std::string, std::vector<std::shared_ptr<ngraph::runtime::Tensor>>>
t_in_cache_;
static framework::Variable* pre_var_ptr;
const framework::Scope& scope_; const framework::Scope& scope_;
const platform::Place& place_; const platform::Place& place_;
std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_; std::vector<std::shared_ptr<framework::OperatorBase>> fused_ops_;
std::unordered_map<std::string, ngraph::element::Type> var_type_map_; std::unordered_map<std::string, ngraph::element::Type> var_type_map_;
std::unordered_set<std::string> persistables_; std::set<std::string> persistables_;
std::unordered_set<std::string> fetches_;
std::unordered_set<std::string> post_op_inputs_; std::unordered_set<std::string> post_op_inputs_;
OpState ng_op_state_ = OpState::UNKNOWN; OpState op_state_ = OpState::UNKNOWN;
bool is_test_{true};
std::string func_cache_key_; std::string func_cache_key_;
// ngraph backend eg. CPU // ngraph backend eg. CPU
...@@ -66,6 +90,8 @@ class NgraphEngine { ...@@ -66,6 +90,8 @@ class NgraphEngine {
std::vector<std::string> var_in_; std::vector<std::string> var_in_;
// var_name of outputs from fetch in order // var_name of outputs from fetch in order
std::vector<std::string> var_out_; std::vector<std::string> var_out_;
// non-persitable var_in
std::vector<size_t> var_in_updates_;
// map input vars to nodes // map input vars to nodes
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
...@@ -74,20 +100,23 @@ class NgraphEngine { ...@@ -74,20 +100,23 @@ class NgraphEngine {
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
var_node_map_; var_node_map_;
// prepare info for nraph engine // prepare info for ngraph engine need
void Prepare(const framework::BlockDesc& block, void Prepare(const std::vector<int>& interval);
// get ngraph engine input and output list
void BuildNgIO(const std::vector<framework::OpDesc*>& op_descs,
const std::vector<int>& interval); const std::vector<int>& interval);
// get ngraph input and define ngraph input parameters // get ngraph input and define ngraph input parameters
void GetNgInputShape(std::shared_ptr<framework::OperatorBase> op); void GetNgInputShape();
// Call ngraph bridge to map ops // Call ngraph bridge to map ops
void BuildNgNodes(); void BuildNgNodes();
// get the ngraph input and output var list // run paddle RuntimeInferShape to get the tensor shape
void BuildNgIO(); void RunInferShape();
// build ngraph function call // build ngraph function call
void BuildNgFunction(); void BuildNgFunction(const std::vector<int>& interval);
// Check cache for ngraph function or otherwise build the function // Check cache for ngraph function or otherwise build the function
void GetNgFunction(); void GetNgFunction(std::string engine_key, const std::vector<int>& interval);
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
#endif // PADDLE_FLUID_OPERATORS_NGRAPH_NGRAPH_ENGINE_H_
...@@ -29,6 +29,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -29,6 +29,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Xs", "A list of inputs.").AsDispensable(); AddInput("Xs", "A list of inputs.").AsDispensable();
AddOutput("Ys", "A list of outputs").AsDispensable(); AddOutput("Ys", "A list of outputs").AsDispensable();
AddAttr<std::string>("graph", "the graph."); AddAttr<std::string>("graph", "the graph.");
AddAttr<std::string>("engine_key", "the engine hash key.");
AddAttr<std::vector<int>>("interval", "op interval supported by ngraph"); AddAttr<std::vector<int>>("interval", "op interval supported by ngraph");
AddComment("ngraph engine operator."); AddComment("ngraph engine operator.");
} }
......
...@@ -46,10 +46,8 @@ class NgraphEngineKernel : public framework::OpKernel<T> { ...@@ -46,10 +46,8 @@ class NgraphEngineKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto& scope = ctx.scope(); auto& scope = ctx.scope();
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
std::string serialized_graph = ctx.Attr<std::string>("graph");
auto interval = ctx.Attr<std::vector<int>>("interval");
NgraphEngine ngraph_engine(scope, place, serialized_graph, interval); NgraphEngine ngraph_engine(scope, place, ctx);
ngraph_engine.Run(scope, place); ngraph_engine.Run(scope, place);
} }
}; };
......
...@@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel { ...@@ -22,9 +22,6 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
if (ctx->IsRuntime()) {
return;
}
PADDLE_ENFORCE( PADDLE_ENFORCE(
ctx->HasInput("X"), ctx->HasInput("X"),
"Input(X) of SequecceEnumerate operator should not be null."); "Input(X) of SequecceEnumerate operator should not be null.");
...@@ -62,6 +59,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -62,6 +59,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker {
}); });
AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.") AddAttr<int>("pad_value", "(int) The enumerate sequence padding value.")
.SetDefault(0); .SetDefault(0);
AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape, "")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
Sequence Enumerate Operator. Sequence Enumerate Operator.
......
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include <deque> #include <deque>
#include <forward_list> #include <forward_list>
...@@ -30,6 +29,8 @@ limitations under the License. */ ...@@ -30,6 +29,8 @@ limitations under the License. */
#include "glog/logging.h" #include "glog/logging.h"
#include "google/protobuf/text_format.h" #include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
...@@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -317,6 +318,24 @@ class DeviceTracerImpl : public DeviceTracer {
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &alloc_in,
const std::string &free_in, int64_t thread_id) {
if (0 == start_ns || 0 == end_ns) {
VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced.";
return;
}
thread_local std::forward_list<MemInfoRecord> *local_mem_info_record =
nullptr;
if (local_mem_info_record == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
mem_info_record_.emplace_front();
local_mem_info_record = &mem_info_record_.front();
}
local_mem_info_record->emplace_front(MemInfoRecord{
start_ns, end_ns, bytes, place, thread_id, alloc_in, free_in});
}
void AddActiveKindRecords(const std::string &anno, uint64_t start_ns, void AddActiveKindRecords(const std::string &anno, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
int64_t thread_id, uint32_t correlation_id) { int64_t thread_id, uint32_t correlation_id) {
...@@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -409,6 +428,7 @@ class DeviceTracerImpl : public DeviceTracer {
correlations_.clear(); correlations_.clear();
for (auto &tmp : correlations_pairs) tmp.clear(); for (auto &tmp : correlations_pairs) tmp.clear();
for (auto &tmp : cpu_records_) tmp.clear(); for (auto &tmp : cpu_records_) tmp.clear();
for (auto &tmp : mem_info_record_) tmp.clear();
for (auto &tmp : active_kind_records_) tmp.clear(); for (auto &tmp : active_kind_records_) tmp.clear();
} }
...@@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -440,9 +460,12 @@ class DeviceTracerImpl : public DeviceTracer {
proto::Profile profile_pb; proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_); profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_); profile_pb.set_end_ns(end_ns_);
if (correlations_.empty()) if (correlations_.empty()) {
for (auto &tmp : correlations_pairs) for (auto &tmp : correlations_pairs) {
for (auto &pair : tmp) correlations_[pair.first] = pair.second; for (auto &pair : tmp) correlations_[pair.first] = pair.second;
}
}
for (const KernelRecord &r : kernel_records_) { for (const KernelRecord &r : kernel_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel); event->set_type(proto::Event::GPUKernel);
...@@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -462,6 +485,7 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find; VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
for (auto &tmp : cpu_records_) { for (auto &tmp : cpu_records_) {
for (const CPURecord &r : tmp) { for (const CPURecord &r : tmp) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
...@@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -473,6 +497,7 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
} }
for (auto &tmp : active_kind_records_) { for (auto &tmp : active_kind_records_) {
for (const ActiveKindRecord &r : tmp) { for (const ActiveKindRecord &r : tmp) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
...@@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -510,6 +535,31 @@ class DeviceTracerImpl : public DeviceTracer {
event->mutable_memcopy()->set_bytes(r.bytes); event->mutable_memcopy()->set_bytes(r.bytes);
} }
VLOG(1) << "MemRecord event miss: " << miss << " find: " << find; VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
for (auto &tmp : mem_info_record_) {
for (const auto &r : tmp) {
auto *event = profile_pb.add_mem_events();
event->set_device_id(0);
if (platform::is_cpu_place(r.place)) {
event->set_place(proto::MemEvent::CPUPlace);
} else if (platform::is_gpu_place(r.place)) {
event->set_place(proto::MemEvent::CUDAPlace);
event->set_device_id(
boost::get<platform::CUDAPlace>(r.place).GetDeviceId());
} else if (platform::is_cuda_pinned_place(r.place)) {
event->set_place(proto::MemEvent::CUDAPinnedPlace);
} else {
PADDLE_THROW("The current place is not supported.");
}
event->set_alloc_in(r.alloc_in);
event->set_free_in(r.free_in);
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_bytes(r.bytes);
event->set_thread_id(r.thread_id);
}
}
std::ofstream profile_f; std::ofstream profile_f;
profile_f.open(profile_path, profile_f.open(profile_path,
std::ios::out | std::ios::trunc | std::ios::binary); std::ios::out | std::ios::trunc | std::ios::binary);
...@@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -553,6 +603,7 @@ class DeviceTracerImpl : public DeviceTracer {
std::forward_list<KernelRecord> kernel_records_; std::forward_list<KernelRecord> kernel_records_;
std::forward_list<MemRecord> mem_records_; std::forward_list<MemRecord> mem_records_;
std::forward_list<std::forward_list<CPURecord>> cpu_records_; std::forward_list<std::forward_list<CPURecord>> cpu_records_;
std::forward_list<std::forward_list<MemInfoRecord>> mem_info_record_;
std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_; std::forward_list<std::forward_list<ActiveKindRecord>> active_kind_records_;
std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>> std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
correlations_pairs; correlations_pairs;
...@@ -575,7 +626,7 @@ Event *CurAnnotation() { ...@@ -575,7 +626,7 @@ Event *CurAnnotation() {
return annotation_stack.back(); return annotation_stack.back();
} }
std::string CurAnnotationName() { std::string CurAnnotationName() {
if (annotation_stack.empty()) return ""; if (annotation_stack.empty()) return "Unknown";
return annotation_stack.back()->name(); return annotation_stack.back()->name();
} }
......
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
...@@ -47,6 +48,7 @@ class DeviceTracer { ...@@ -47,6 +48,7 @@ class DeviceTracer {
int64_t stream_id; int64_t stream_id;
uint32_t correlation_id; uint32_t correlation_id;
}; };
struct CPURecord { struct CPURecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
...@@ -54,6 +56,7 @@ class DeviceTracer { ...@@ -54,6 +56,7 @@ class DeviceTracer {
int64_t device_id; int64_t device_id;
int64_t thread_id; int64_t thread_id;
}; };
struct MemRecord { struct MemRecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
...@@ -63,6 +66,17 @@ class DeviceTracer { ...@@ -63,6 +66,17 @@ class DeviceTracer {
uint32_t correlation_id; uint32_t correlation_id;
uint64_t bytes; uint64_t bytes;
}; };
struct MemInfoRecord {
uint64_t start_ns;
uint64_t end_ns;
size_t bytes;
Place place;
int64_t thread_id;
std::string alloc_in;
std::string free_in;
};
struct ActiveKindRecord { struct ActiveKindRecord {
std::string name; std::string name;
uint64_t start_ns; uint64_t start_ns;
...@@ -71,6 +85,7 @@ class DeviceTracer { ...@@ -71,6 +85,7 @@ class DeviceTracer {
int64_t thread_id; int64_t thread_id;
uint32_t correlation_id; uint32_t correlation_id;
}; };
virtual ~DeviceTracer() {} virtual ~DeviceTracer() {}
// Needs to be called once before use. // Needs to be called once before use.
virtual void Enable() = 0; virtual void Enable() = 0;
...@@ -97,6 +112,12 @@ class DeviceTracer { ...@@ -97,6 +112,12 @@ class DeviceTracer {
int64_t thread_id, int64_t thread_id,
uint32_t correlation_id) = 0; uint32_t correlation_id) = 0;
virtual void AddMemInfoRecord(uint64_t start_ns, uint64_t end_ns,
size_t bytes, const Place& place,
const std::string& alloc_in,
const std::string& free_in,
int64_t thread_id) = 0;
// Add a cuda kernel stats. `correlation_id` will be mapped to annotation // Add a cuda kernel stats. `correlation_id` will be mapped to annotation
// added before for human readability. // added before for human readability.
virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end, virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end,
......
...@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h> #include <cuda_runtime.h>
#endif #endif
#include "paddle/fluid/platform/place.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
...@@ -64,5 +66,36 @@ class Event { ...@@ -64,5 +66,36 @@ class Event {
#endif #endif
#endif #endif
}; };
class MemEvent {
public:
MemEvent(EventType type, uint64_t start_ns, uint64_t end_ns, size_t bytes,
Place place, int64_t thread_id, const std::string& annotation)
: type_(type),
start_ns_(start_ns),
end_ns_(end_ns),
bytes_(bytes),
place_(place),
thread_id_(thread_id),
annotation_(annotation) {}
const EventType& type() const { return type_; }
uint64_t start_ns() const { return start_ns_; }
uint64_t end_ns() const { return end_ns_; }
size_t bytes() const { return bytes_; }
Place place() const { return place_; }
int64_t thread_id() const { return thread_id_; }
const std::string& annotation() const { return annotation_; }
private:
EventType type_;
uint64_t start_ns_ = 0;
uint64_t end_ns_ = 0;
size_t bytes_;
Place place_;
int64_t thread_id_;
std::string annotation_;
};
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include <algorithm> #include <algorithm>
#include <iomanip> #include <iomanip>
#include <limits> #include <limits>
...@@ -21,6 +20,8 @@ limitations under the License. */ ...@@ -21,6 +20,8 @@ limitations under the License. */
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <random> #include <random>
#include <string> #include <string>
#include <vector>
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include <cuda.h> #include <cuda.h>
#endif // PADDLE_WITH_CUDA #endif // PADDLE_WITH_CUDA
...@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); ...@@ -36,8 +37,6 @@ DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
namespace paddle { namespace paddle {
namespace platform { namespace platform {
struct EventList;
static int64_t profiler_lister_id = 0; static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false; static bool should_send_profile_state = false;
std::mutex profiler_mu; std::mutex profiler_mu;
...@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0; ...@@ -53,43 +52,15 @@ static uint32_t g_next_thread_id = 0;
// The global mutex // The global mutex
static std::mutex g_all_event_lists_mutex; static std::mutex g_all_event_lists_mutex;
// The total event lists of all threads // The total event lists of all threads
static std::list<std::shared_ptr<EventList>> g_all_event_lists; static std::list<std::shared_ptr<EventList<Event>>> g_all_event_lists;
// The thread local event list only can be accessed by the specific thread // The thread local event list only can be accessed by the specific thread
static thread_local std::shared_ptr<EventList> g_event_list; static thread_local std::shared_ptr<EventList<Event>> g_event_list;
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(Event);
constexpr static size_t kEventAlign = alignof(Event);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
Event* Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
return &event_blocks.front().back();
}
std::vector<Event> Reduce() {
std::vector<Event> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<Event>> event_blocks; static std::list<std::shared_ptr<EventList<MemEvent>>> g_all_mem_event_lists;
}; static thread_local std::shared_ptr<EventList<MemEvent>> g_mem_event_list;
static std::mutex g_all_mem_event_lists_mutex;
static thread_local int32_t g_mem_thread_id;
static uint32_t g_mem_next_thread_id = 0;
inline uint64_t GetTimeInNsec() { inline uint64_t GetTimeInNsec() {
using clock = std::conditional<std::chrono::high_resolution_clock::is_steady, using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
...@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id) ...@@ -105,13 +76,13 @@ Event::Event(EventType type, std::string name, uint32_t thread_id)
cpu_ns_ = GetTimeInNsec(); cpu_ns_ = GetTimeInNsec();
} }
const EventType& Event::type() const { return type_; } const EventType &Event::type() const { return type_; }
double Event::CpuElapsedMs(const Event& e) const { double Event::CpuElapsedMs(const Event &e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000000.0); return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
} }
double Event::CudaElapsedMs(const Event& e) const { double Event::CudaElapsedMs(const Event &e) const {
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
return gpu_ns_ / 1000000.0; return gpu_ns_ / 1000000.0;
#else #else
...@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const { ...@@ -120,10 +91,32 @@ double Event::CudaElapsedMs(const Event& e) const {
#endif #endif
} }
inline EventList& GetEventList() { inline EventList<MemEvent> &GetMemEventList() {
if (!g_mem_event_list) {
g_mem_event_list = std::make_shared<EventList<MemEvent>>();
std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
g_mem_thread_id = g_mem_next_thread_id++;
g_all_mem_event_lists.emplace_front(g_mem_event_list);
}
return *g_mem_event_list;
}
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
place, g_mem_thread_id, annotation);
}
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
g_mem_thread_id, annotation);
}
inline EventList<Event> &GetEventList() {
if (!g_event_list) { if (!g_event_list) {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
g_event_list = std::make_shared<EventList>(); g_event_list = std::make_shared<EventList<Event>>();
g_thread_id = g_next_thread_id++; g_thread_id = g_next_thread_id++;
g_all_event_lists.emplace_front(g_event_list); g_all_event_lists.emplace_front(g_event_list);
RecoreCurThreadId(g_thread_id); RecoreCurThreadId(g_thread_id);
...@@ -131,26 +124,26 @@ inline EventList& GetEventList() { ...@@ -131,26 +124,26 @@ inline EventList& GetEventList() {
return *g_event_list; return *g_event_list;
} }
void Mark(const std::string& name) { void Mark(const std::string &name) {
GetEventList().Record(EventType::kMark, name, g_thread_id); GetEventList().Record(EventType::kMark, name, g_thread_id);
} }
Event* PushEvent(const std::string& name) { Event *PushEvent(const std::string &name) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id); return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
} }
void PopEvent(const std::string& name) { void PopEvent(const std::string &name) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id); GetEventList().Record(EventType::kPopRange, name, g_thread_id);
} }
RecordEvent::RecordEvent(const std::string& name) RecordEvent::RecordEvent(const std::string &name)
: is_enabled_(false), start_ns_(PosixInNsec()) { : is_enabled_(false), start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
is_enabled_ = true; is_enabled_ = true;
name_ = name; name_ = name;
Event* e = PushEvent(name_); Event *e = PushEvent(name_);
// Maybe need the same push/pop behavior. // Maybe need the same push/pop behavior.
SetCurAnnotation(e); SetCurAnnotation(e);
} }
...@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name) ...@@ -158,7 +151,7 @@ RecordEvent::RecordEvent(const std::string& name)
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(), tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
BlockDepth(), g_thread_id); BlockDepth(), g_thread_id);
...@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() { ...@@ -167,7 +160,56 @@ RecordEvent::~RecordEvent() {
PopEvent(name_); PopEvent(name_);
} }
RecordRPCEvent::RecordRPCEvent(const std::string& name) { MemEvenRecorder MemEvenRecorder::recorder;
void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
size_t size) {
if (g_state == ProfilerState::kDisabled) return;
std::lock_guard<std::mutex> guard(mtx_);
auto &events = address_memevent_[place];
PADDLE_ENFORCE(events.count(ptr) == 0, "");
events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
new MemEvenRecorder::RecordMemEvent(place, size)));
}
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
if (g_state == ProfilerState::kDisabled) return;
std::lock_guard<std::mutex> guard(mtx_);
auto &events = address_memevent_[place];
auto iter = events.find(ptr);
// The ptr maybe not in address_memevent
if (iter != events.end()) {
events.erase(iter);
}
}
void MemEvenRecorder::Flush() {
std::lock_guard<std::mutex> guard(mtx_);
address_memevent_.clear();
}
MemEvenRecorder::RecordMemEvent::RecordMemEvent(const Place &place,
size_t bytes)
: place_(place),
bytes_(bytes),
start_ns_(PosixInNsec()),
alloc_in_(CurAnnotationName()) {
PushMemEvent(start_ns_, end_ns_, bytes_, place_, alloc_in_);
}
MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
DeviceTracer *tracer = GetDeviceTracer();
end_ns_ = PosixInNsec();
auto annotation_free = CurAnnotationName();
if (tracer) {
tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_,
annotation_free, g_mem_thread_id);
}
PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
}
RecordRPCEvent::RecordRPCEvent(const std::string &name) {
if (FLAGS_enable_rpc_profiler) { if (FLAGS_enable_rpc_profiler) {
event_.reset(new platform::RecordEvent(name)); event_.reset(new platform::RecordEvent(name));
} }
...@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id) ...@@ -185,7 +227,7 @@ RecordBlock::RecordBlock(int block_id)
RecordBlock::~RecordBlock() { RecordBlock::~RecordBlock() {
// lock is not needed, the code below is thread-safe // lock is not needed, the code below is thread-safe
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
// We try to put all blocks at the same nested depth in the // We try to put all blocks at the same nested depth in the
// same timeline lane. and distinguish the using thread_id. // same timeline lane. and distinguish the using thread_id.
...@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) { ...@@ -232,11 +274,16 @@ void EnableProfiler(ProfilerState state) {
void ResetProfiler() { void ResetProfiler() {
SynchronizeAllDevice(); SynchronizeAllDevice();
GetDeviceTracer()->Reset(); GetDeviceTracer()->Reset();
MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) { ++it) {
(*it)->Clear(); (*it)->Clear();
} }
for (auto it = g_all_mem_event_lists.begin();
it != g_all_mem_event_lists.end(); ++it) {
(*it)->Clear();
}
} }
std::vector<std::vector<Event>> GetAllEvents() { std::vector<std::vector<Event>> GetAllEvents() {
...@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() { ...@@ -249,6 +296,15 @@ std::vector<std::vector<Event>> GetAllEvents() {
return result; return result;
} }
std::vector<std::vector<MemEvent>> GetMemEvents() {
std::lock_guard<std::mutex> guard(g_all_mem_event_lists_mutex);
std::vector<std::vector<MemEvent>> result;
for (auto &it : g_all_mem_event_lists) {
result.emplace_back((*it).Reduce());
}
return result;
}
// The information of each event given in the profiling report // The information of each event given in the profiling report
struct EventItem { struct EventItem {
std::string name; std::string name;
...@@ -263,8 +319,8 @@ struct EventItem { ...@@ -263,8 +319,8 @@ struct EventItem {
}; };
// Print results // Print results
void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, void PrintProfiler(const std::vector<std::vector<EventItem>> &events_table,
const std::string& sorted_domain, const size_t name_width, const std::string &sorted_domain, const size_t name_width,
const size_t data_width, bool merge_thread) { const size_t data_width, bool merge_thread) {
// Output header information // Output header information
std::cout << "\n------------------------->" std::cout << "\n------------------------->"
...@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -302,7 +358,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
<< std::setw(data_width) << "Ratio." << std::endl; << std::setw(data_width) << "Ratio." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) { for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) { for (size_t j = 0; j < events_table[i].size(); ++j) {
const EventItem& event_item = events_table[i][j]; const EventItem &event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls << std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time; << std::setw(data_width) << event_item.total_time;
...@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -326,54 +382,54 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
} }
// Parse the event list and output the profiling report // Parse the event list and output the profiling report
void ParseEvents(const std::vector<std::vector<Event>>& events, void ParseEvents(const std::vector<std::vector<Event>> &events,
bool merge_thread, bool merge_thread,
EventSortingKey sorted_by = EventSortingKey::kDefault) { EventSortingKey sorted_by = EventSortingKey::kDefault) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
if (merge_thread && events.size() < 2) return; if (merge_thread && events.size() < 2) return;
std::string sorted_domain; std::string sorted_domain;
std::function<bool(const EventItem&, const EventItem&)> sorted_func; std::function<bool(const EventItem &, const EventItem &)> sorted_func;
switch (sorted_by) { switch (sorted_by) {
case EventSortingKey::kCalls: case EventSortingKey::kCalls:
sorted_domain = "number of calls"; sorted_domain = "number of calls";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.calls > b.calls; return a.calls > b.calls;
}; };
break; break;
case EventSortingKey::kTotal: case EventSortingKey::kTotal:
sorted_domain = "total time"; sorted_domain = "total time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.total_time > b.total_time; return a.total_time > b.total_time;
}; };
break; break;
case EventSortingKey::kMin: case EventSortingKey::kMin:
sorted_domain = "minimum time"; sorted_domain = "minimum time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.min_time > b.min_time; return a.min_time > b.min_time;
}; };
break; break;
case EventSortingKey::kMax: case EventSortingKey::kMax:
sorted_domain = "maximum time"; sorted_domain = "maximum time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.max_time > b.max_time; return a.max_time > b.max_time;
}; };
break; break;
case EventSortingKey::kAve: case EventSortingKey::kAve:
sorted_domain = "average time"; sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.ave_time > b.ave_time; return a.ave_time > b.ave_time;
}; };
break; break;
case EventSortingKey::kGPUTime: case EventSortingKey::kGPUTime:
sorted_domain = "average time"; sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.gpu_time > b.gpu_time; return a.gpu_time > b.gpu_time;
}; };
break; break;
case EventSortingKey::kCPUTime: case EventSortingKey::kCPUTime:
sorted_domain = "average time"; sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) { sorted_func = [](const EventItem &a, const EventItem &b) {
return a.cpu_time > b.cpu_time; return a.cpu_time > b.cpu_time;
}; };
break; break;
...@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -381,7 +437,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
sorted_domain = "event first end time"; sorted_domain = "event first end time";
} }
const std::vector<std::vector<Event>>* analyze_events; const std::vector<std::vector<Event>> *analyze_events;
std::vector<std::vector<Event>> merged_events_list; std::vector<std::vector<Event>> merged_events_list;
if (merge_thread) { if (merge_thread) {
std::vector<Event> merged_events; std::vector<Event> merged_events;
...@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -469,7 +525,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
} }
} }
// average time // average time
for (auto& item : event_items) { for (auto &item : event_items) {
item.ave_time = item.total_time / item.calls; item.ave_time = item.total_time / item.calls;
item.ratio = item.total_time / total; item.ratio = item.total_time / total;
} }
...@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -493,15 +549,77 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
merge_thread); merge_thread);
} }
struct MemoryProfierReport {
size_t alloc_times{0};
size_t alloc_size{0};
size_t free_times{0};
size_t free_size{0};
};
// Print results
void PrintMemProfiler(
const std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
&annotation_report,
const size_t name_width, const size_t data_width) {
// Output header information
std::cout << "\n------------------------->"
<< " Memory Profiling Report "
<< "<-------------------------\n\n";
// Output events table
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Alloc Calls" << std::setw(data_width) << "Size(MB)"
<< std::setw(data_width) << "Free Calls" << std::setw(data_width)
<< "Size(MB)" << std::endl;
for (auto &tmp : annotation_report) {
for (auto &e : tmp.second) {
auto event_name = string::Sprintf("%s:%s", tmp.first, e.first);
std::cout << std::setw(name_width) << event_name;
std::cout << std::setw(data_width) << e.second.alloc_times;
std::cout << std::setw(data_width)
<< e.second.alloc_size / (1024.0 * 1024.0);
std::cout << std::setw(data_width) << e.second.free_times;
std::cout << std::setw(data_width)
<< e.second.free_size / (1024.0 * 1024.0) << std::endl;
}
}
std::cout << std::endl;
}
// parse memory events
void ParseMemEvents(const std::vector<std::vector<MemEvent>> &events) {
if (g_state == ProfilerState::kDisabled) return;
// place, annotation, alloc times, alloc size
std::map<Place, std::unordered_map<std::string, MemoryProfierReport>>
annotation_report;
for (auto &tmp : events) {
for (auto &e : tmp) {
if (e.type() == EventType::kPushRange) {
annotation_report[e.place()][e.annotation()].alloc_times += 1;
annotation_report[e.place()][e.annotation()].alloc_size += e.bytes();
} else if (e.type() == EventType::kPopRange) {
annotation_report[e.place()][e.annotation()].free_times += 1;
annotation_report[e.place()][e.annotation()].free_size += e.bytes();
}
}
}
PrintMemProfiler(annotation_report, 55, 18);
}
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) { const std::string &profile_path) {
SynchronizeAllDevice(); SynchronizeAllDevice();
MemEvenRecorder::Instance().Flush();
std::lock_guard<std::mutex> l(profiler_mu); std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop. // Mark the profiling stop.
Mark("_stop_profiler_"); Mark("_stop_profiler_");
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer *tracer = GetDeviceTracer();
if (tracer->IsEnabled()) { if (tracer->IsEnabled()) {
tracer->Disable(); tracer->Disable();
tracer->GenProfile(profile_path); tracer->GenProfile(profile_path);
...@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key, ...@@ -511,6 +629,11 @@ void DisableProfiler(EventSortingKey sorted_key,
std::vector<std::vector<Event>> all_events = GetAllEvents(); std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, true, sorted_key); ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key); ParseEvents(all_events, false, sorted_key);
if (VLOG_IS_ON(5)) {
std::vector<std::vector<MemEvent>> all_mem_events = GetMemEvents();
ParseMemEvents(all_mem_events);
}
ResetProfiler(); ResetProfiler();
g_state = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
should_send_profile_state = true; should_send_profile_state = true;
......
...@@ -15,10 +15,17 @@ limitations under the License. */ ...@@ -15,10 +15,17 @@ limitations under the License. */
#pragma once #pragma once
#include <forward_list> #include <forward_list>
#include <list> #include <list>
#include <map>
#include <memory>
#include <mutex> // NOLINT
#include <string> #include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/event.h"
#include "paddle/fluid/platform/place.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#endif #endif
...@@ -34,8 +41,41 @@ enum ProfilerState { ...@@ -34,8 +41,41 @@ enum ProfilerState {
void Mark(const std::string& name); void Mark(const std::string& name);
Event* PushEvent(const std::string& name); void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place);
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place);
struct MemEvenRecorder {
public:
void PushMemRecord(const void* ptr, const Place& place, size_t size);
void PopMemRecord(const void* ptr, const Place& place);
void Flush();
static MemEvenRecorder& Instance() { return recorder; }
private:
struct RecordMemEvent {
RecordMemEvent(const Place& place, size_t bytes);
~RecordMemEvent();
Place place_;
size_t bytes_;
uint64_t start_ns_;
uint64_t end_ns_;
std::string alloc_in_;
std::string free_in_;
};
static MemEvenRecorder recorder;
std::map<Place,
std::unordered_map<const void*, std::unique_ptr<RecordMemEvent>>>
address_memevent_;
std::mutex mtx_;
MemEvenRecorder() {}
DISABLE_COPY_AND_ASSIGN(MemEvenRecorder);
};
Event* PushEvent(const std::string& name);
void PopEvent(const std::string& name); void PopEvent(const std::string& name);
struct RecordEvent { struct RecordEvent {
...@@ -87,6 +127,41 @@ enum EventSortingKey { ...@@ -87,6 +127,41 @@ enum EventSortingKey {
kGPUTime kGPUTime
}; };
template <typename T>
struct EventList {
constexpr static size_t kMB = 1024 * 1024;
constexpr static size_t kEventBlockSize = 16 * kMB;
constexpr static size_t kEventSize = sizeof(T);
constexpr static size_t kEventAlign = alignof(T);
constexpr static size_t kNumBlock =
kEventBlockSize /
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
T* Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
return &event_blocks.front().back();
}
std::vector<T> Reduce() {
std::vector<T> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
return result;
}
void Clear() { event_blocks.clear(); }
std::forward_list<std::vector<T>> event_blocks;
};
// Enable the profiling function. // Enable the profiling function.
void EnableProfiler(ProfilerState state); void EnableProfiler(ProfilerState state);
......
...@@ -34,8 +34,25 @@ message Event { ...@@ -34,8 +34,25 @@ message Event {
optional string detail_info = 9; optional string detail_info = 9;
} }
message MemEvent {
enum Place {
CUDAPlace = 0;
CPUPlace = 1;
CUDAPinnedPlace = 2;
}
optional uint64 start_ns = 1;
optional uint64 end_ns = 2;
optional uint64 bytes = 3;
optional Place place = 4;
optional uint64 thread_id = 5;
optional uint32 device_id = 6;
optional string alloc_in = 7;
optional string free_in = 8;
}
message Profile { message Profile {
repeated Event events = 1; repeated Event events = 1;
optional uint64 start_ns = 2; optional uint64 start_ns = 2;
optional uint64 end_ns = 3; optional uint64 end_ns = 3;
repeated MemEvent mem_events = 4;
} }
\ No newline at end of file
...@@ -94,6 +94,14 @@ bool IsCompiledWithMKLDNN() { ...@@ -94,6 +94,14 @@ bool IsCompiledWithMKLDNN() {
#endif #endif
} }
bool IsCompiledWithNGRAPH() {
#ifndef PADDLE_WITH_NGRAPH
return false;
#else
return true;
#endif
}
bool IsCompiledWithBrpc() { bool IsCompiledWithBrpc() {
#ifndef PADDLE_WITH_DISTRIBUTE #ifndef PADDLE_WITH_DISTRIBUTE
return false; return false;
...@@ -874,6 +882,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -874,6 +882,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("init_devices", m.def("init_devices",
[](bool init_p2p) { framework::InitDevices(init_p2p); }); [](bool init_p2p) { framework::InitDevices(init_p2p); });
m.def("is_compiled_with_ngraph", IsCompiledWithNGRAPH);
m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN); m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_brpc", IsCompiledWithBrpc);
...@@ -1242,7 +1251,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1242,7 +1251,7 @@ All parameter, weight, gradient are variables in Paddle.
cannot be updated after being finalized.)DOC"); cannot be updated after being finalized.)DOC");
pe.def(py::init<const std::vector<platform::Place> &, pe.def(py::init<const std::vector<platform::Place> &,
const std::unordered_set<std::string> &, const std::string &, const std::vector<std::string> &, const std::string &,
Scope *, std::vector<Scope *> &, const ExecutionStrategy &, Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
const BuildStrategy &, ir::Graph *>()) const BuildStrategy &, ir::Graph *>())
// NOTE: even we return a vec<Scope*>* to Python use reference policy. // NOTE: even we return a vec<Scope*>* to Python use reference policy.
......
...@@ -455,7 +455,11 @@ function assert_api_spec_approvals() { ...@@ -455,7 +455,11 @@ function assert_api_spec_approvals() {
# NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable. # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308` python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695`
if [ "${APPROVALS}" == "TRUE" ];then
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
fi
else else
APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803` python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803`
...@@ -463,7 +467,7 @@ function assert_api_spec_approvals() { ...@@ -463,7 +467,7 @@ function assert_api_spec_approvals() {
echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
if [ "${APPROVALS}" == "FALSE" ]; then if [ "${APPROVALS}" == "FALSE" ]; then
if [ "$API_FILE" == "paddle/fluid/API.spec" ];then if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
echo "You must have panyx0718 and shanyi15 approval for the api change! ${API_FILE}" echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}"
else else
echo "You must have panyx0718 approval for the api change! ${API_FILE}" echo "You must have panyx0718 approval for the api change! ${API_FILE}"
fi fi
......
...@@ -125,7 +125,7 @@ def __bootstrap__(): ...@@ -125,7 +125,7 @@ def __bootstrap__():
os.environ['OMP_NUM_THREADS'] = str(num_threads) os.environ['OMP_NUM_THREADS'] = str(num_threads)
sysstr = platform.system() sysstr = platform.system()
read_env_flags = [ read_env_flags = [
'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_ngraph', 'check_nan_inf', 'benchmark', 'eager_delete_scope',
'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory',
'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb', 'paddle_num_threads', "dist_threadpool_size", 'eager_delete_tensor_gb',
'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion', 'fast_eager_deletion_mode', 'memory_fraction_of_eager_deletion',
...@@ -143,6 +143,9 @@ def __bootstrap__(): ...@@ -143,6 +143,9 @@ def __bootstrap__():
if core.is_compiled_with_mkldnn(): if core.is_compiled_with_mkldnn():
read_env_flags.append('use_mkldnn') read_env_flags.append('use_mkldnn')
if core.is_compiled_with_ngraph():
read_env_flags.append('use_ngraph')
if core.is_compiled_with_dist(): if core.is_compiled_with_dist():
read_env_flags.append('rpc_deadline') read_env_flags.append('rpc_deadline')
read_env_flags.append('rpc_server_profile_path') read_env_flags.append('rpc_server_profile_path')
......
...@@ -230,13 +230,17 @@ class CompiledProgram(object): ...@@ -230,13 +230,17 @@ class CompiledProgram(object):
self._persistable_vars.append(cpt.to_text(node.name())) self._persistable_vars.append(cpt.to_text(node.name()))
places = list(map(_place_obj, self._places)) places = list(map(_place_obj, self._places))
# ParallelExecutor would broadcast all the parameters during initializing.
return core.ParallelExecutor(places, # The parameters of each process should be in the same ordered for the data-parallelism
set(self._persistable_vars), # distributed training to keep the broadcast correct.
self._persistable_vars = list(set(self._persistable_vars))
self._persistable_vars.sort()
return core.ParallelExecutor(
places, self._persistable_vars,
cpt.to_text(self._loss_name) cpt.to_text(self._loss_name)
if self._loss_name else six.u(''), scope, if self._loss_name else six.u(''), self._scope, self._local_scopes,
self._local_scopes, self._exec_strategy, self._exec_strategy, self._build_strategy, self._graph)
self._build_strategy, self._graph)
def _compile_inference(self): def _compile_inference(self):
return core.create_paddle_predictor(self._infer_config) return core.create_paddle_predictor(self._infer_config)
......
...@@ -23,6 +23,7 @@ __activations_noattr__ = [ ...@@ -23,6 +23,7 @@ __activations_noattr__ = [
'logsigmoid', 'logsigmoid',
'exp', 'exp',
'tanh', 'tanh',
'atan',
'tanh_shrink', 'tanh_shrink',
'softshrink', 'softshrink',
'sqrt', 'sqrt',
...@@ -30,6 +31,8 @@ __activations_noattr__ = [ ...@@ -30,6 +31,8 @@ __activations_noattr__ = [
'ceil', 'ceil',
'floor', 'floor',
'cos', 'cos',
'acos',
'asin',
'sin', 'sin',
'round', 'round',
'reciprocal', 'reciprocal',
......
...@@ -100,6 +100,23 @@ class TestTanh(TestActivation): ...@@ -100,6 +100,23 @@ class TestTanh(TestActivation):
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestAtan(TestActivation):
def setUp(self):
self.op_type = "atan"
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
out = np.arctan(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestTanhShrink(TestActivation): class TestTanhShrink(TestActivation):
def setUp(self): def setUp(self):
self.op_type = "tanh_shrink" self.op_type = "tanh_shrink"
...@@ -248,6 +265,23 @@ class TestCos(TestActivation): ...@@ -248,6 +265,23 @@ class TestCos(TestActivation):
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestAcos(TestActivation):
def setUp(self):
self.op_type = "acos"
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.arccos(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestSin(TestActivation): class TestSin(TestActivation):
def setUp(self): def setUp(self):
self.op_type = "sin" self.op_type = "sin"
...@@ -265,6 +299,23 @@ class TestSin(TestActivation): ...@@ -265,6 +299,23 @@ class TestSin(TestActivation):
self.check_grad(['X'], 'Out', max_relative_error=0.007) self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestAsin(TestActivation):
def setUp(self):
self.op_type = "asin"
self.init_dtype()
x = np.random.uniform(-1, 1, [4, 4]).astype(self.dtype)
out = np.arcsin(x)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
def test_check_grad(self):
if self.dtype == np.float16:
return
self.check_grad(['X'], 'Out', max_relative_error=0.007)
class TestRound(TestActivation): class TestRound(TestActivation):
def setUp(self): def setUp(self):
self.op_type = "round" self.op_type = "round"
...@@ -665,7 +716,10 @@ create_test_act_fp16_class(TestAbs) ...@@ -665,7 +716,10 @@ create_test_act_fp16_class(TestAbs)
create_test_act_fp16_class(TestCeil, grad_check=False) create_test_act_fp16_class(TestCeil, grad_check=False)
create_test_act_fp16_class(TestFloor, grad_check=False) create_test_act_fp16_class(TestFloor, grad_check=False)
create_test_act_fp16_class(TestCos, grad_atol=0.85) create_test_act_fp16_class(TestCos, grad_atol=0.85)
create_test_act_fp16_class(TestAcos, grad_atol=0.85)
create_test_act_fp16_class(TestSin) create_test_act_fp16_class(TestSin)
create_test_act_fp16_class(TestAsin)
create_test_act_fp16_class(TestAtan)
create_test_act_fp16_class(TestRound, grad_check=False) create_test_act_fp16_class(TestRound, grad_check=False)
create_test_act_fp16_class(TestRelu) create_test_act_fp16_class(TestRelu)
create_test_act_fp16_class(TestGelu) create_test_act_fp16_class(TestGelu)
......
...@@ -51,6 +51,8 @@ def visit_member(parent_name, member): ...@@ -51,6 +51,8 @@ def visit_member(parent_name, member):
all = (args, doc) all = (args, doc)
member_dict[cur_name] = all member_dict[cur_name] = all
except TypeError: # special for PyBind method except TypeError: # special for PyBind method
if cur_name in check_modules_list:
return
member_dict[cur_name] = " ".join([ member_dict[cur_name] = " ".join([
line.strip() for line in pydoc.render_doc(member).split('\n') line.strip() for line in pydoc.render_doc(member).split('\n')
if "->" in line if "->" in line
...@@ -78,6 +80,7 @@ def visit_all_module(mod): ...@@ -78,6 +80,7 @@ def visit_all_module(mod):
visit_member(mod.__name__, instance) visit_member(mod.__name__, instance)
check_modules_list = ["paddle.reader.ComposeNotAligned.__init__"]
modules = sys.argv[1].split(",") modules = sys.argv[1].split(",")
for m in modules: for m in modules:
visit_all_module(importlib.import_module(m)) visit_all_module(importlib.import_module(m))
......
...@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object): ...@@ -95,6 +95,22 @@ class _ChromeTraceFormatter(object):
event['args'] = args event['args'] = args
self._events.append(event) self._events.append(event)
def emit_counter(self, category, name, pid, timestamp, counter, value):
"""Emits a record for a single counter.
Args:
category: The event category as string
name: The event name as string
pid: Identifier of the process generating this event as integer
timestamp: The timestamps of this event as long integer
counter: Name of the counter as string
value: Value of the counter as integer
tid: Thread id of the allocation as integer
"""
event = self._create_event('C', category, name, pid, 0, timestamp)
event['args'] = {counter: value}
self._events.append(event)
def format_to_string(self, pretty=False): def format_to_string(self, pretty=False):
"""Formats the chrome trace to a string. """Formats the chrome trace to a string.
...@@ -117,6 +133,7 @@ class Timeline(object): ...@@ -117,6 +133,7 @@ class Timeline(object):
self._profile_dict = profile_dict self._profile_dict = profile_dict
self._pid = 0 self._pid = 0
self._devices = dict() self._devices = dict()
self._mem_devices = dict()
self._chrome_trace = _ChromeTraceFormatter() self._chrome_trace = _ChromeTraceFormatter()
def _allocate_pid(self): def _allocate_pid(self):
...@@ -143,6 +160,45 @@ class Timeline(object): ...@@ -143,6 +160,45 @@ class Timeline(object):
self._devices[(k, event.device_id, "GPUKernel")] = pid self._devices[(k, event.device_id, "GPUKernel")] = pid
self._chrome_trace.emit_pid("%s:gpu:%d" % self._chrome_trace.emit_pid("%s:gpu:%d" %
(k, event.device_id), pid) (k, event.device_id), pid)
for mevent in profile_pb.mem_events:
if mevent.place == profiler_pb2.MemEvent.CUDAPlace:
if (k, mevent.device_id, "GPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "GPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:gpu:%d" % (k, mevent.device_id),
pid)
elif mevent.place == profiler_pb2.MemEvent.CPUPlace:
if (k, mevent.device_id, "CPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id, "CPU")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cpu:%d" % (k, mevent.device_id),
pid)
elif mevent.place == profiler_pb2.MemEvent.CUDAPinnedPlace:
if (k, mevent.device_id, "CUDAPinnedPlace"
) not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, mevent.device_id,
"CUDAPinnedPlace")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" %
(k, mevent.device_id), pid)
if (k, 0, "CPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "CPU")] = pid
self._chrome_trace.emit_pid("memory usage on %s:cpu:%d" %
(k, 0), pid)
if (k, 0, "GPU") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "GPU")] = pid
self._chrome_trace.emit_pid("memory usage on %s:gpu:%d" %
(k, 0), pid)
if (k, 0, "CUDAPinnedPlace") not in self._mem_devices:
pid = self._allocate_pid()
self._mem_devices[(k, 0, "CUDAPinnedPlace")] = pid
self._chrome_trace.emit_pid(
"memory usage on %s:cudapinnedplace:%d" % (k, 0), pid)
def _allocate_events(self): def _allocate_events(self):
for k, profile_pb in six.iteritems(self._profile_dict): for k, profile_pb in six.iteritems(self._profile_dict):
...@@ -163,9 +219,57 @@ class Timeline(object): ...@@ -163,9 +219,57 @@ class Timeline(object):
event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid, event.start_ns, (event.end_ns - event.start_ns) / 1.0, pid,
event.sub_device_id, 'Op', event.name, args) event.sub_device_id, 'Op', event.name, args)
def _allocate_memory_event(self):
place_to_str = {
profiler_pb2.MemEvent.CPUPlace: "CPU",
profiler_pb2.MemEvent.CUDAPlace: "GPU",
profiler_pb2.MemEvent.CUDAPinnedPlace: "CUDAPinnedPlace"
}
for k, profile_pb in six.iteritems(self._profile_dict):
mem_list = []
end_profiler = 0
for mevent in profile_pb.mem_events:
crt_info = dict()
crt_info['time'] = mevent.start_ns
crt_info['size'] = mevent.bytes
if mevent.place in place_to_str:
place = place_to_str[mevent.place]
else:
place = "UnDefine"
crt_info['place'] = place
pid = self._mem_devices[(k, mevent.device_id, place)]
crt_info['pid'] = pid
crt_info['thread_id'] = mevent.thread_id
crt_info['device_id'] = mevent.device_id
mem_list.append(crt_info)
crt_info = dict()
crt_info['place'] = place
crt_info['pid'] = pid
crt_info['thread_id'] = mevent.thread_id
crt_info['device_id'] = mevent.device_id
crt_info['time'] = mevent.end_ns
crt_info['size'] = -mevent.bytes
mem_list.append(crt_info)
end_profiler = max(end_profiler, crt_info['time'])
mem_list.sort(key=lambda tmp: (tmp.get('time', 0)))
i = 0
total_size = 0
while i < len(mem_list):
total_size += mem_list[i]['size']
while i < len(mem_list) - 1 and mem_list[i]['time'] == mem_list[
i + 1]['time']:
total_size += mem_list[i + 1]['size']
i += 1
self._chrome_trace.emit_counter(
"Memory", "Memory", mem_list[i]['pid'], mem_list[i]['time'],
0, total_size)
i += 1
def generate_chrome_trace(self): def generate_chrome_trace(self):
self._allocate_pids() self._allocate_pids()
self._allocate_events() self._allocate_events()
self._allocate_memory_event()
return self._chrome_trace.format_to_string() return self._chrome_trace.format_to_string()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册