Unverified commit 8dd0a3b9, authored by chenjian, committed by GitHub

record memory and op supplement info (#43550)

* record memory and op supplement info

* update

* update

* fix a bug

* fix memory recording

* fix a bug

* update

* update

* fix a bug

* update

* fix a bug

* fix a bug

* fix a bug

* Revert "fix a bug"

This reverts commit c1d4df52762ba9ae7c7e27cd2ba4fc3a7ed9c7a5.

* fix a bug

* fix format

* fix
Parent e64823c1
@@ -24,6 +24,7 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/core/kernel_context.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
@@ -558,6 +559,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
      op_with_kernel->Info().infer_shape_(
          instr_node.InnerInferShapeContext().get());
    }
    infershape_event.End();
    platform::RecordOpInfoSupplement(op->Type(),
                                     op->Attrs(),
                                     *(instr_node.InnerInferShapeContext()),
                                     *(instr_node.InnerRuntimeContext()));
  }
}
...
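In the hunk above, the interpreter ends the `infershape_event` span explicitly and only then records the op supplement, so capturing input shapes and dtypes is not billed to shape inference. A minimal sketch of the pattern, assuming the event was opened with the same arguments as the `infer_shape` event in RunImpl further down (the `shape_ctx`/`runtime_ctx` names are stand-ins for the instruction's contexts):

    // Sketch only; shape_ctx and runtime_ctx stand in for
    // instr_node.InnerInferShapeContext() / InnerRuntimeContext().
    platform::RecordEvent infershape_event(
        "infer_shape", platform::TracerEventType::OperatorInner, 1,
        platform::EventRole::kInnerOp);
    op_with_kernel->Info().infer_shape_(shape_ctx.get());  // run InferShape
    infershape_event.End();            // close the timing span first
    platform::RecordOpInfoSupplement(  // then attach the supplement record
        op->Type(), op->Attrs(), *shape_ctx, *runtime_ctx);

The explicit End() call matters because RecordEvent normally closes in its destructor; ending it early keeps the supplement bookkeeping out of the measured infer-shape interval.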
@@ -31,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/kernel_context.h"
@@ -70,7 +71,8 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
    std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
};

static DDim GetDimsDebug(const ScopeBase& scope,
                         const std::string& name,
                         bool get_actual_dim = false) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) {
@@ -264,7 +266,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
        Type(), platform::TracerEventType::Operator, 1);
    auto op_name = platform::OpName(outputs_, Type());
    platform::RecordEvent op_name_record_event(
        op_name,
        platform::TracerEventType::Operator,
        FLAGS_enable_host_event_recorder_hook ? 20 : 1,
        platform::EventRole::kUniqueOp);
    RunImpl(scope, place);
@@ -293,9 +296,11 @@ bool OperatorBase::HasInputs(const std::string& name) const {
std::string OperatorBase::Input(const std::string& name) const {
  auto& ins = Inputs(name);
  PADDLE_ENFORCE_LE(
      ins.size(),
      1UL,
      platform::errors::InvalidArgument(
          "Operator %s's input %s should contain only one variable.",
          type_,
          name));
  return ins.empty() ? kEmptyVarName : ins[0];
}
@@ -304,9 +309,10 @@ const std::vector<std::string>& OperatorBase::Inputs(
    const std::string& name) const {
  auto it = inputs_.find(name);
  PADDLE_ENFORCE_NE(
      it,
      inputs_.end(),
      platform::errors::NotFound(
          "Operator %s does not have the input %s.", type_, name));
  return it->second;
}
@@ -321,9 +327,11 @@ bool OperatorBase::HasOutputs(const std::string& name) const {
std::string OperatorBase::Output(const std::string& name) const {
  auto& outs = Outputs(name);
  PADDLE_ENFORCE_LE(
      outs.size(),
      1UL,
      platform::errors::InvalidArgument(
          "Operator %s's output %s should contain only one variable.",
          type_,
          name));
  return outs.empty() ? kEmptyVarName : outs[0];
}
@@ -332,7 +340,8 @@ const std::vector<std::string>& OperatorBase::Outputs(
    const std::string& name) const {
  auto it = outputs_.find(name);
  PADDLE_ENFORCE_NE(
      it,
      outputs_.end(),
      platform::errors::NotFound(
          "Operator %s does not have an output called %s.", type_, name));
  return it->second;
@@ -480,18 +489,20 @@ void OperatorBase::CheckAllInputOutputSet() const {
  for (auto& in : info_->Proto().inputs()) {
    if (!in.dispensable() && !in.extra()) {
      PADDLE_ENFORCE_NE(
          inputs_.find(in.name()),
          inputs_.end(),
          platform::errors::NotFound(
              "Operator %s's input (%s) is not set.", Type(), in.name()));
    }
  }
  for (auto& out : info_->Proto().outputs()) {
    if (!out.dispensable() && !out.extra()) {
      PADDLE_ENFORCE_NE(
          outputs_.find(out.name()),
          outputs_.end(),
          platform::errors::NotFound(
              "Operator %s's output (%s) is not set.", Type(), out.name()));
    }
  }
}
@@ -564,10 +575,12 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const {
  if (it == ctx_.inputs.end()) return nullptr;
  PADDLE_ENFORCE_LE(
      it->second.size(),
      1UL,
      platform::errors::InvalidArgument(
          "Operator %s's input %s should contain only one variable.",
          op_.Type(),
          name));
  return it->second.empty() ? nullptr : it->second[0];
}
@@ -576,10 +589,12 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const {
  if (it == ctx_.outputs.end()) return nullptr;
  PADDLE_ENFORCE_LE(
      it->second.size(),
      1UL,
      platform::errors::InvalidArgument(
          "Operator %s's output %s should contain only one variable.",
          op_.Type(),
          name));
  return it->second.empty() ? nullptr : it->second[0];
}
@@ -594,10 +609,13 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
  }
  std::vector<const Tensor*> res;
  res.reserve(vars.size());
  std::transform(vars.begin(),
                 vars.end(),
                 std::back_inserter(res),
                 [&](const Variable* var) -> const Tensor* {
                   if (var == nullptr) return nullptr;
                   PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(),
                                     true,
                                     platform::errors::InvalidArgument(
                                         "Input variable should be LoDTensor, "
                                         "but the received type is %s.",
@@ -617,7 +635,9 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
  }
  std::vector<Tensor*> res;
  res.reserve(vars.size());
  std::transform(vars.begin(),
                 vars.end(),
                 std::back_inserter(res),
                 [&](Variable* var) -> Tensor* {
                   return var == nullptr ? nullptr
                                         : var->GetMutable<LoDTensor>();
@@ -675,7 +695,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
    const auto& in = it->second;
    if (in.size() == 0) return false;
    PADDLE_ENFORCE_EQ(
        in.size(),
        1UL,
        platform::errors::InvalidArgument(
            "Input %s should not contain more than one inputs.", name));
    return in[0] != nullptr;
@@ -693,7 +714,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
      return false;
    }
    PADDLE_ENFORCE_EQ(
        out.size(),
        1UL,
        platform::errors::InvalidArgument(
            "Output %s should not contain more than one outputs.", name));
    return out[0] != nullptr;
@@ -750,11 +772,14 @@ class RuntimeInferShapeContext : public InferShapeContext {
  std::string GetInputNameByIdx(size_t idx) const override {
    auto& op_proto =
        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
    PADDLE_ENFORCE_LT(idx,
                      op_proto->inputs().size(),
                      platform::errors::OutOfRange(
                          "The index should be less than the size of inputs of "
                          "operator %s, but got index is %d and size is %d",
                          op_.Type(),
                          idx,
                          op_proto->inputs().size()));
    return op_proto->inputs()[idx].name();
  }
@@ -762,42 +787,55 @@ class RuntimeInferShapeContext : public InferShapeContext {
    auto& op_proto =
        paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
    PADDLE_ENFORCE_LT(
        idx,
        op_proto->outputs().size(),
        platform::errors::OutOfRange(
            "The index should be less than the size of outputs of "
            "operator %s, but got index is %d and size is %d",
            op_.Type(),
            idx,
            op_proto->outputs().size()));
    return op_proto->outputs()[idx].name();
  }

  void ShareDim(const std::string& in,
                const std::string& out,
                size_t i = 0,
                size_t j = 0) override {
    auto in_it = ctx_.inputs.find(in);
    auto out_it = ctx_.outputs.find(out);
    PADDLE_ENFORCE_NE(
        in_it,
        ctx_.inputs.end(),
        platform::errors::NotFound("Input %s does not exist.", in));
    PADDLE_ENFORCE_NE(
        out_it,
        ctx_.outputs.end(),
        platform::errors::NotFound("Output %s does not exist.", out));
    PADDLE_ENFORCE_LT(i,
                      in_it->second.size(),
                      platform::errors::InvalidArgument(
                          "The index of input dimension is out of range, "
                          "excepted index less than %zu, but received %zu.",
                          in_it->second.size(),
                          i));
    PADDLE_ENFORCE_LT(j,
                      out_it->second.size(),
                      platform::errors::InvalidArgument(
                          "The index of output dimension is out of range, "
                          "excepted index less than %zu, but received %zu.",
                          out_it->second.size(),
                          j));

    Variable* in_var = in_it->second[i];
    Variable* out_var = out_it->second[j];
    PADDLE_ENFORCE_EQ(
        in_var->Type(),
        out_var->Type(),
        platform::errors::InvalidArgument(
            "The type of input (%s) and output (%s) are inconsistent.",
            in,
            out));

    if (in_var->IsType<phi::SelectedRows>()) {
@@ -821,19 +859,22 @@ class RuntimeInferShapeContext : public InferShapeContext {
                      const std::string& out) const override {
    auto in_it = ctx_.inputs.find(in);
    auto out_it = ctx_.outputs.find(out);
    PADDLE_ENFORCE_NE(in_it,
                      ctx_.inputs.end(),
                      platform::errors::NotFound(
                          "Input [%s] found error in Op [%s]", in, op_.Type()));
    PADDLE_ENFORCE_NE(
        out_it,
        ctx_.outputs.end(),
        platform::errors::NotFound(
            "Output [%s] found error in Op [%s]", out, op_.Type()));

    auto& in_var_list = in_it->second;
    auto& out_var_list = out_it->second;
    PADDLE_ENFORCE_EQ(
        in_var_list.size(),
        out_var_list.size(),
        platform::errors::PreconditionNotMet(
            "Op [%s]: Input var size should be equal with output var size",
            op_.Type()));
@@ -848,10 +889,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
      Variable* in_var = in_var_list[i];
      if (!in_var->IsType<LoDTensor>()) return;
      Variable* out_var = out_var_list[i];
      PADDLE_ENFORCE_EQ(out_var->IsType<LoDTensor>(),
                        true,
                        platform::errors::PreconditionNotMet(
                            "The %d-th output of Output(%s) must be LoDTensor.",
                            i,
                            out_var_names[i]));
      auto& in_tensor = in_var->Get<LoDTensor>();
      auto* out_tensor = out_var->GetMutable<LoDTensor>();
      out_tensor->set_lod(in_tensor.lod());
@@ -862,32 +905,41 @@ class RuntimeInferShapeContext : public InferShapeContext {
    }
  }

  void ShareLoD(const std::string& in,
                const std::string& out,
                size_t i = 0,
                size_t j = 0) const override {
    auto in_it = ctx_.inputs.find(in);
    auto out_it = ctx_.outputs.find(out);
    PADDLE_ENFORCE_NE(
        in_it,
        ctx_.inputs.end(),
        platform::errors::NotFound("Input %s does not exist.", in));
    PADDLE_ENFORCE_NE(
        out_it,
        ctx_.outputs.end(),
        platform::errors::NotFound("Output %s does not exist.", out));
    PADDLE_ENFORCE_LT(i,
                      in_it->second.size(),
                      platform::errors::InvalidArgument(
                          "The index of input dimension is out of range, "
                          "excepted index less than %zu, but received %zu.",
                          in_it->second.size(),
                          i));
    PADDLE_ENFORCE_LT(j,
                      out_it->second.size(),
                      platform::errors::InvalidArgument(
                          "The index of output dimension is out of range, "
                          "excepted index less than %zu, but received %zu.",
                          out_it->second.size(),
                          j));

    Variable* in_var = in_it->second.at(i);
    if (!in_var->IsType<LoDTensor>()) return;
    Variable* out_var = out_it->second.at(j);
    PADDLE_ENFORCE_EQ(
        out_var->IsType<LoDTensor>(),
        true,
        platform::errors::InvalidArgument(
            "The %zu-th output of Output(%s) must be LoDTensor.", j, out));
    auto& in_tensor = in_var->Get<LoDTensor>();
@@ -922,7 +974,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
        "set in the runtime kernel."));
  }

  void SetLoDLevel(const std::string& out,
                   int32_t lod_level,
                   size_t j = 0) const override {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "SetLoDLevel is only used in compile time. The calculation of "
@@ -965,10 +1018,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
  DDim GetInputDim(const std::string& name) const override {
    const std::vector<Variable*>& vars = InputVars(name);
    PADDLE_ENFORCE_EQ(
        vars.size(),
        1UL,
        platform::errors::InvalidArgument(
            "Input(%s) should hold one element, but now it holds %zu elements.",
            name,
            vars.size()));
    return this->GetDim(vars[0]);
  }
@@ -994,10 +1049,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
  void SetOutputDim(const std::string& name, const DDim& dim) override {
    auto& vars = OutputVars(name);
    PADDLE_ENFORCE_EQ(
        vars.size(),
        1UL,
        platform::errors::InvalidArgument("Output(%s) should hold one element, "
                                          "but now it holds %zu elements.",
                                          name,
                                          vars.size()));
    SetDim(vars[0], dim);
  }
@@ -1034,7 +1091,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
  std::vector<DDim> GetDims(const std::vector<Variable*>& vars) const {
    std::vector<DDim> ret;
    ret.reserve(vars.size());
    std::transform(vars.begin(),
                   vars.end(),
                   std::back_inserter(ret),
                   [this](Variable* var) { return this->GetDim(var); });
    return ret;
  }
@@ -1060,12 +1119,14 @@ class RuntimeInferShapeContext : public InferShapeContext {
  void SetDims(const std::vector<Variable*>& vars,
               const std::vector<DDim>& dims) {
    size_t length = vars.size();
    PADDLE_ENFORCE_EQ(length,
                      dims.size(),
                      platform::errors::InvalidArgument(
                          "The number of input variables do not match the "
                          "number of input dimensions, the number of variables "
                          "is %zu, the number of dimensions is %zu.",
                          length,
                          dims.size()));
    for (size_t i = 0; i < length; ++i) {
      if (vars[i] == nullptr) {
        continue;
@@ -1084,9 +1145,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
      const std::vector<Variable*>& vars) const {
    std::vector<proto::VarType::Type> retv;
    retv.resize(vars.size());
    std::transform(vars.begin(),
                   vars.end(),
                   retv.begin(),
                   std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType),
                             this,
                             std::placeholders::_1));
    return retv;
  }
@@ -1098,7 +1162,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
  const std::vector<Variable*>& InputVars(const std::string& name) const {
    auto it = ctx_.inputs.find(name);
    PADDLE_ENFORCE_NE(
        it,
        ctx_.inputs.end(),
        platform::errors::NotFound(
            "Operator (%s) does not have the input (%s).", op_.Type(), name));
    return it->second;
@@ -1107,7 +1172,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
  const std::vector<Variable*>& OutputVars(const std::string& name) const {
    auto it = ctx_.outputs.find(name);
    PADDLE_ENFORCE_NE(
        it,
        ctx_.outputs.end(),
        platform::errors::NotFound(
            "Operator (%s) does not have the outputs (%s).", op_.Type(), name));
    return it->second;
@@ -1143,20 +1209,23 @@ static void CheckTensorNANOrInf(const std::string& op_type,
    return;
  }
  PADDLE_ENFORCE_NE(
      framework::TensorContainsInf(tensor),
      true,
      platform::errors::Fatal(
          "Operator %s output Tensor %s contains Inf.", op_type, name));
  PADDLE_ENFORCE_NE(
      framework::TensorContainsNAN(tensor),
      true,
      platform::errors::Fatal(
          "Operator %s output Tensor %s contains NAN.", op_type, name));
}

bool OperatorWithKernel::SupportGPU() const {
  auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap(
      phi::TransToPhiKernelName(type_));
  auto has_phi_kernel =
      std::any_of(phi_kernels.begin(),
                  phi_kernels.end(),
                  [](phi::KernelKeyMap::const_reference kern_pair) {
                    return kern_pair.first.backend() == phi::Backend::GPU;
                  });
@@ -1169,7 +1238,8 @@ bool OperatorWithKernel::SupportGPU() const {
  } else {
    auto& op_kernels = kernel_iter->second;
    return std::any_of(
        op_kernels.begin(),
        op_kernels.end(),
        [](OpKernelMap::const_reference kern_pair) {
          return platform::is_gpu_place(kern_pair.first.place_);
        });
@@ -1181,7 +1251,8 @@ bool OperatorWithKernel::SupportNPU() const {
  auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap(
      phi::TransToPhiKernelName(type_));
  auto has_phi_kernel =
      std::any_of(phi_kernels.begin(),
                  phi_kernels.end(),
                  [](phi::KernelKeyMap::const_reference kern_pair) {
                    return kern_pair.first.backend() == phi::Backend::NPU;
                  });
@@ -1194,7 +1265,8 @@ bool OperatorWithKernel::SupportNPU() const {
  } else {
    auto& op_kernels = kernel_iter->second;
    return std::any_of(
        op_kernels.begin(),
        op_kernels.end(),
        [](OpKernelMap::const_reference kern_pair) {
          return platform::is_npu_place(kern_pair.first.place_);
        });
@@ -1214,7 +1286,8 @@ bool OperatorWithKernel::SupportsMKLDNN(
    return false;
  }
  auto& op_kernels = op_kernel_iter->second;
  return std::any_of(op_kernels.begin(),
                     op_kernels.end(),
                     [data_type](OpKernelMap::const_reference kern_pair) {
                       return platform::is_cpu_place(kern_pair.first.place_) &&
                              kern_pair.first.library_type_ ==
@@ -1496,10 +1569,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  {
    platform::RecordEvent record_event("prepare_data",
                                       platform::TracerEventType::OperatorInner,
                                       1,
                                       platform::EventRole::kInnerOp);
    if (need_prepare_data_) {
      transfer_scope = PrepareData(
          scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);
    }
  }
  // exec scope is the scope that kernel actually executed on.
@@ -1509,9 +1583,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  if (!all_kernels_must_compute_runtime_shape_) {
    platform::RecordEvent record_event("infer_shape",
                                       platform::TracerEventType::OperatorInner,
                                       1,
                                       platform::EventRole::kInnerOp);
    RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
    this->Info().infer_shape_(&infer_shape_ctx);
    record_event.End();
    platform::RecordOpInfoSupplement(
        Type(), Attrs(), infer_shape_ctx, *runtime_ctx);
  }

  if (FLAGS_enable_unused_var_check) {
@@ -1523,7 +1601,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
  {
    platform::RecordEvent record_event("compute",
                                       platform::TracerEventType::OperatorInner,
                                       1,
                                       platform::EventRole::kInnerOp);
    if (run_phi_kernel_) {
      phi::KernelContext pt_kernel_context;
      // Do data transform before building KernelContext
@@ -1663,7 +1742,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
  auto& all_op_kernels = AllOpKernels();
  auto kernels_iter = all_op_kernels.find(type_);
  PADDLE_ENFORCE_NE(
      kernels_iter,
      all_op_kernels.end(),
      platform::errors::Unavailable(
          "There are no kernels which are registered in the %s operator.",
          type_));
@@ -1785,9 +1865,11 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
    kernel_iter = kernels.find(expected_kernel_key);
  }
#endif
  PADDLE_ENFORCE_NE(
      kernel_iter,
      kernels.end(),
      platform::errors::NotFound("Operator (%s) does not have kernel for %s.",
                                 type_,
                                 KernelTypeToString(expected_kernel_key)));

  std::lock_guard<std::mutex> lock(cache_update_mutex_);
@@ -1798,7 +1880,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
}

void OperatorWithKernel::TransferInplaceVarsBack(
    const Scope& scope,
    const std::vector<std::string>& inplace_vars,
    const Scope& transfer_scope) const {
  for (auto& var_name : inplace_vars) {
    VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
@@ -1809,7 +1892,8 @@ void OperatorWithKernel::TransferInplaceVarsBack(
    auto* original_tensor =
        GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var);
    auto* var = transfer_scope.FindVar(var_name);
    PADDLE_ENFORCE_NOT_NULL(var,
                            platform::errors::InvalidArgument(
                                "The variable[%s] is nullptr.", var_name));
    auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
    auto original_dims = original_tensor->dims();
@@ -1890,7 +1974,8 @@ void OperatorWithKernel::HandleComplexGradToRealGrad(
}

Scope* OperatorWithKernel::PrepareData(
    const Scope& scope,
    const OpKernelType& expected_kernel_key,
    std::vector<std::string>* transfered_inplace_vars,
    RuntimeContext* ctx) const {
  Scope* new_scope = nullptr;
@@ -1947,8 +2032,8 @@ Scope* OperatorWithKernel::PrepareData(
        input_vars[i] = trans_var;
        auto out = trans_var->GetMutable<LoDTensor>();
        out->Resize(tensor_in->dims());
        platform::MatchShapeToLayout(
            out, tensor_in->layout(), DataLayout::kNHWC);
        VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , "
                   "but kNHWC layout"
                << var_name_item.first << " in Operator " << type_;
@@ -1995,8 +2080,8 @@ Scope* OperatorWithKernel::PrepareData(
      if (!run_by_executor_ &&
          (platform::is_gpu_place(kernel_type_for_var.place_) ||
           platform::is_gpu_place(expected_kernel_key.place_))) {
        new_scope = TryCreateTransferScope(
            kernel_type_for_var, expected_kernel_key, &scope);
        enable_cache_transfer_scope_ = true;
      }
      if (!new_scope) {
@@ -2058,7 +2143,8 @@ Scope* OperatorWithKernel::PrepareData(
}

void OperatorWithKernel::ParseInputDataType(
    const Variable* var,
    const std::string& name,
    proto::VarType::Type* data_type) const {
  if (var != nullptr) {
    const Tensor* t = nullptr;
@@ -2078,17 +2164,20 @@ void OperatorWithKernel::ParseInputDataType(
    }
    if (t != nullptr) {
      PADDLE_ENFORCE_EQ(
          t->IsInitialized(),
          true,
          platform::errors::InvalidArgument("The %s Op's Input Variable `%s` "
                                            "contains uninitialized Tensor.",
                                            Type(),
                                            name));
      *data_type = paddle::framework::TransToProtoVarType(t->dtype());
    }
  }
}

void OperatorWithKernel::ParseMultiInputDataType(
    const std::vector<Variable*>& vars,
    const std::string& name,
    proto::VarType::Type* data_type) const {
  proto::VarType::Type default_data_type =
      static_cast<proto::VarType::Type>(-1);
@@ -2112,10 +2201,12 @@ void OperatorWithKernel::ParseMultiInputDataType(
    }
    if (t != nullptr) {
      PADDLE_ENFORCE_EQ(
          t->IsInitialized(),
          true,
          platform::errors::InvalidArgument("The %s Op's Input Variable `%s` "
                                            "contains uninitialized Tensor.",
                                            Type(),
                                            name));
      proto::VarType::Type tmp =
          paddle::framework::TransToProtoVarType(t->dtype());
      PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type,
@@ -2125,7 +2216,9 @@ void OperatorWithKernel::ParseMultiInputDataType(
                         "consistent or reigster GetExpectedKernelType. The "
                         "current variable type is (%s), but the "
                         "previous variable type is (%s).",
                         Type(),
                         name,
                         DataTypeToString(tmp),
                         DataTypeToString(*data_type)));
      *data_type = tmp;
    }
@@ -2146,7 +2239,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
    }
  }
  PADDLE_ENFORCE_NE(
      data_type,
      dafault_data_type,
      platform::errors::NotFound(
          "DataType should be indicated by input Variable at %s.", Type()));
  return data_type;
@@ -2163,12 +2257,14 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType(
    ParseMultiInputDataType(ctx.MultiInputVar(name), name, &data_type);
  }
  PADDLE_ENFORCE_NE(
      data_type,
      dafault_data_type,
      platform::errors::InvalidArgument(
          "The Input Variable(%s) of (%s) Operator used to determine kernel "
          "data type is empty or not LoDTensor or SelectedRows or "
          "LoDTensorArray.",
          name,
          Type()));
  return data_type;
}
@@ -2200,11 +2296,14 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely(
      t,
      platform::errors::InvalidArgument(
          "The Tensor of variable %s is nullptr when promote complex types."));
  PADDLE_ENFORCE_EQ(t->IsInitialized(),
                    true,
                    platform::errors::InvalidArgument(
                        "The Tensor in the %s Op's Input Variable %s(%s) is "
                        "not initialized.",
                        Type(),
                        name,
                        ctx.InputName(name)));
  return t;
}
@@ -2216,7 +2315,8 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely(
 * the kernel data type.
 */
proto::VarType::Type OperatorWithKernel::IndicateOrPromoteVarDataTypes(
    const ExecutionContext& ctx,
    const std::string& name1,
    const std::string& name2) const {
  // 1. Get tensor
  auto* tensor_a = GetTensorFormInputSafely(ctx, name1);
@@ -2238,10 +2338,11 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType(
}

OpKernelType OperatorWithKernel::GetKernelTypeForVar(
    const std::string& var_name,
    const Tensor& tensor,
    const OpKernelType& expected_kernel_type) const {
  return OpKernelType(
      expected_kernel_type.data_type_, tensor.place(), tensor.layout());
}

phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs(
@@ -2264,16 +2365,19 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs(
}

Scope* OperatorWithKernel::PreparePhiData(
    const Scope& scope,
    const phi::Kernel& pt_kernel,
    const phi::KernelSignature& pt_kernel_signature,
    RuntimeContext* ctx) const {
  const auto& input_names = pt_kernel_signature.input_names;
  auto input_defs = pt_kernel.args_def().input_defs();
  PADDLE_ENFORCE_EQ(input_names.size(),
                    input_defs.size(),
                    platform::errors::InvalidArgument(
                        "The size of inputs_args names (%d) must be equal to "
                        "the size of kernel input_defs (%d).",
                        input_names.size(),
                        input_defs.size()));
  Scope* new_scope = nullptr;
  auto& name_map = Inputs();
  const std::unordered_set<std::string>* no_buffer_ins = nullptr;
@@ -2362,7 +2466,8 @@ Scope* OperatorWithKernel::PreparePhiData(
}

void OperatorWithKernel::BuildPhiKernelContext(
    const RuntimeContext& ctx,
    platform::DeviceContext* dev_ctx,
    phi::KernelContext* pt_kernel_context) const {
  pt_kernel_context->SetDeviceContext(dev_ctx);
@@ -2374,23 +2479,29 @@ void OperatorWithKernel::BuildPhiKernelContext(
  auto attr_defs = pt_kernel_->args_def().attribute_defs();
  auto output_defs = pt_kernel_->args_def().output_defs();

  PADDLE_ENFORCE_EQ(input_names.size(),
                    input_defs.size(),
                    platform::errors::InvalidArgument(
                        "The size of inputs_args names (%d) must be equal to "
                        "the size of kernel input_defs (%d).",
                        input_names.size(),
                        input_defs.size()));

  PADDLE_ENFORCE_EQ(output_names.size(),
                    output_defs.size(),
                    platform::errors::InvalidArgument(
                        "The size of outputs_args names (%d) must be equal to "
                        "the size of kernel output_defs (%d).",
                        output_names.size(),
                        output_defs.size()));

  PADDLE_ENFORCE_EQ(attr_names.size(),
                    attr_defs.size(),
                    platform::errors::InvalidArgument(
                        "The size of attribute_args names (%d) must be equal "
                        "to the size of kernel attribute_defs (%d).",
                        attr_names.size(),
                        attr_defs.size()));

  for (size_t i = 0; i < input_names.size(); ++i) {
    auto it = ctx.inputs.find(input_names[i]);
@@ -2572,7 +2683,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
        break;
      case phi::AttributeType::SCALARS: {
        PADDLE_ENFORCE_NE(
            attr_iter,
            Attrs().end(),
            platform::errors::NotFound("(%s) is not found in AttributeMap when "
                                       "buildind static KernelContext.",
                                       attr_names[i]));
@@ -2636,7 +2748,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
      } break;
      default: {
        PADDLE_ENFORCE_NE(
            attr_iter,
            Attrs().end(),
            platform::errors::NotFound("(%s) is not found in AttributeMap when "
                                       "buildind static KernelContext.",
                                       attr_names[i]));
...
cc_library(
  allocator
  SRCS allocator.cc
  DEPS place stats profiler)
cc_library(
  cpu_allocator
  SRCS cpu_allocator.cc
@@ -21,7 +21,7 @@ cc_library(
cc_library(
  naive_best_fit_allocator
  SRCS naive_best_fit_allocator.cc
  DEPS allocator buddy_allocator)
cc_test(
  naive_best_fit_allocator_test
  SRCS naive_best_fit_allocator_test.cc
...
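Note the paired change in the two hunks above: the `profiler` dependency moves from `naive_best_fit_allocator` to the base `allocator` target, presumably because memory recording (per the commit title) now happens in the core allocator itself. Read together, the post-change targets look like this (a sketch, not the full file):

    cc_library(
      allocator
      SRCS allocator.cc
      DEPS place stats profiler)  # profiler is now a direct dependency

    cc_library(
      naive_best_fit_allocator
      SRCS naive_best_fit_allocator.cc
      DEPS allocator buddy_allocator)  # picks up profiler transitively via allocator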
...@@ -32,7 +32,8 @@ ...@@ -32,7 +32,8 @@
#endif #endif
PADDLE_DEFINE_EXPORTED_bool( PADDLE_DEFINE_EXPORTED_bool(
init_allocated_mem, false, init_allocated_mem,
false,
"It is a mistake that the values of the memory allocated by " "It is a mistake that the values of the memory allocated by "
"BuddyAllocator are always zeroed in some op's implementation. " "BuddyAllocator are always zeroed in some op's implementation. "
"To find this error in time, we use init_allocated_mem to indicate " "To find this error in time, we use init_allocated_mem to indicate "
...@@ -77,7 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() { ...@@ -77,7 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
std::call_once(init_flag, []() { std::call_once(init_flag, []() {
a = new detail::BuddyAllocator( a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator), std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize()); platform::CpuMinChunkSize(),
platform::CpuMaxChunkSize());
}); });
return a; return a;
...@@ -95,7 +97,8 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) { ...@@ -95,7 +97,8 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
} }
template <> template <>
void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p, void Free<platform::CPUPlace>(const platform::CPUPlace &place,
void *p,
size_t size) { size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p); GetCPUBuddyAllocator()->Free(p);
...@@ -125,7 +128,8 @@ void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) { ...@@ -125,7 +128,8 @@ void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
return p; return p;
} }
template <> template <>
void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p, void Free<platform::IPUPlace>(const platform::IPUPlace &place,
void *p,
size_t size) { size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p); GetCPUBuddyAllocator()->Free(p);
...@@ -154,7 +158,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) { ...@@ -154,7 +158,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
ret = xpu_malloc(reinterpret_cast<void **>(&p), size); ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
} }
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS, ret,
XPU_SUCCESS,
platform::errors::External( platform::errors::External(
"XPU API return wrong value[%d], no enough memory", ret)); "XPU API return wrong value[%d], no enough memory", ret));
if (FLAGS_init_allocated_mem) { if (FLAGS_init_allocated_mem) {
...@@ -171,7 +176,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) { ...@@ -171,7 +176,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
} }
template <> template <>
void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p, void Free<platform::XPUPlace>(const platform::XPUPlace &place,
void *p,
size_t size) { size_t size) {
#ifdef PADDLE_WITH_XPU #ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
...@@ -234,11 +240,13 @@ class NPUBuddyAllocatorList { ...@@ -234,11 +240,13 @@ class NPUBuddyAllocatorList {
BuddyAllocator *Get(int npu_id) { BuddyAllocator *Get(int npu_id) {
auto pos = std::distance( auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id)); devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(), PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange( platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of " "The index exceeds the size of devices, the size of "
"devices is %d, the index is %d", "devices is %d, the index is %d",
devices_.size(), pos)); devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] { std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]); platform::SetNPUDeviceId(devices_[pos]);
...@@ -246,7 +254,8 @@ class NPUBuddyAllocatorList { ...@@ -246,7 +254,8 @@ class NPUBuddyAllocatorList {
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>( new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])), new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(), platform::NPUMinChunkSize(),
platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE)); platform::NPUMaxChunkSize(),
EXTRA_PADDING_SIZE));
VLOG(10) << "\n\nNOTE:\n" VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable " << "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' " << "'FLAGS_fraction_of_gpu_memory_to_use' "
...@@ -312,8 +321,10 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) { ...@@ -312,8 +321,10 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
PADDLE_THROW(platform::errors::ResourceExhausted( PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize " "Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize "
"%s, NpuMaxChunkSize %s, NPU memory used: %s.", "%s, NpuMaxChunkSize %s, NPU memory used: %s.",
string::HumanReadableSize(size), place.device, string::HumanReadableSize(size),
string::HumanReadableSize(avail), string::HumanReadableSize(total), place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()), string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()), string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::NPUPlace>(place)))); string::HumanReadableSize(Used<platform::NPUPlace>(place))));
@@ -331,7 +342,8 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
 }
 
 template <>
-void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
+void Free<platform::NPUPlace>(const platform::NPUPlace &place,
+                              void *p,
                               size_t size) {
 #ifdef PADDLE_WITH_ASCEND_CL
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
@@ -384,7 +396,8 @@ void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
 template <>
 void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
-                                    void *p, size_t size) {
+                                    void *p,
+                                    size_t size) {
 #ifdef PADDLE_WITH_ASCEND_CL
   GetNPUPinnedBuddyAllocator()->Free(p);
 #else
@@ -430,18 +443,21 @@ class GPUBuddyAllocatorList {
   BuddyAllocator *Get(int gpu_id) {
     auto pos = std::distance(
         devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id));
-    PADDLE_ENFORCE_LT(pos, devices_.size(),
+    PADDLE_ENFORCE_LT(pos,
+                      devices_.size(),
                       platform::errors::OutOfRange(
                           "The index exceeds the size of devices, the size of "
                           "devices is %d, the index is %d",
-                          devices_.size(), pos));
+                          devices_.size(),
+                          pos));
 
     std::call_once(*init_flags_[pos], [this, pos] {
       platform::SetDeviceId(devices_[pos]);
-      allocators_[pos].reset(new BuddyAllocator(
-          std::unique_ptr<detail::SystemAllocator>(
-              new detail::GPUAllocator(devices_[pos])),
-          platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
+      allocators_[pos].reset(
+          new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                                 new detail::GPUAllocator(devices_[pos])),
+                             platform::GpuMinChunkSize(),
+                             platform::GpuMaxChunkSize()));
       VLOG(10) << "\n\nNOTE:\n"
                << "You can set GFlags environment variable "
                << "'FLAGS_fraction_of_gpu_memory_to_use' "
@@ -493,8 +509,10 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
     PADDLE_THROW(platform::errors::ResourceExhausted(
         "Cannot allocate %s in GPU %d, available %s, total %s, GpuMinChunkSize "
         "%s, GpuMaxChunkSize %s, GPU memory used: %s.",
-        string::HumanReadableSize(size), place.device,
-        string::HumanReadableSize(avail), string::HumanReadableSize(total),
+        string::HumanReadableSize(size),
+        place.device,
+        string::HumanReadableSize(avail),
+        string::HumanReadableSize(total),
         string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
         string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
         string::HumanReadableSize(Used<platform::CUDAPlace>(place))));
@@ -515,7 +533,8 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
 }
 
 template <>
-void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
+void Free<platform::CUDAPlace>(const platform::CUDAPlace &place,
+                               void *p,
                                size_t size) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   GetGPUBuddyAllocator(place.device)->Free(p);
@@ -584,7 +603,8 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
 template <>
 void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
-                                     void *p, size_t size) {
+                                     void *p,
+                                     size_t size) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   GetCUDAPinnedBuddyAllocator()->Free(p);
 #else
@@ -630,18 +650,21 @@ class MLUBuddyAllocatorList {
   BuddyAllocator *Get(int mlu_id) {
     auto pos = std::distance(
         devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id));
-    PADDLE_ENFORCE_LT(pos, devices_.size(),
+    PADDLE_ENFORCE_LT(pos,
+                      devices_.size(),
                       platform::errors::OutOfRange(
                           "The index exceeds the size of devices, the size of "
                           "devices is %d, the index is %d",
-                          devices_.size(), pos));
+                          devices_.size(),
+                          pos));
 
     std::call_once(*init_flags_[pos], [this, pos] {
       platform::SetMLUDeviceId(devices_[pos]);
-      allocators_[pos].reset(new BuddyAllocator(
-          std::unique_ptr<detail::SystemAllocator>(
-              new detail::MLUAllocator(devices_[pos])),
-          platform::MLUMinChunkSize(), platform::MLUMaxChunkSize()));
+      allocators_[pos].reset(
+          new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
+                                 new detail::MLUAllocator(devices_[pos])),
+                             platform::MLUMinChunkSize(),
+                             platform::MLUMaxChunkSize()));
       VLOG(10) << "\n\nNOTE:\n"
                << "You can set GFlags environment variable "
                << "(mlu reuse gpu GFlags) "
@@ -693,8 +716,10 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
     PADDLE_THROW(platform::errors::ResourceExhausted(
         "Cannot allocate %s in MLU %d, available %s, total %s, MLUMinChunkSize "
         "%s, MLUMaxChunkSize %s, MLU memory used: %s.",
-        string::HumanReadableSize(size), place.device,
-        string::HumanReadableSize(avail), string::HumanReadableSize(total),
+        string::HumanReadableSize(size),
+        place.device,
+        string::HumanReadableSize(avail),
+        string::HumanReadableSize(total),
         string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
         string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
         string::HumanReadableSize(Used<platform::MLUPlace>(place))));
@@ -711,7 +736,8 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
 }
 
 template <>
-void Free<platform::MLUPlace>(const platform::MLUPlace &place, void *p,
+void Free<platform::MLUPlace>(const platform::MLUPlace &place,
+                              void *p,
                               size_t size) {
 #ifdef PADDLE_WITH_MLU
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
@@ -759,10 +785,12 @@ class BuddyAllocatorList {
   }
 
   BuddyAllocator *Get(int dev_id) {
-    PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(),
+    PADDLE_ENFORCE_NE(init_flags_.find(dev_id),
+                      init_flags_.end(),
                       platform::errors::OutOfRange(
                           "Cannot find %s %d, please check visible devices.",
-                          device_type_, dev_id));
+                          device_type_,
+                          dev_id));
 
     std::call_once(*init_flags_[dev_id], [this, dev_id] {
       phi::DeviceManager::SetDevice(device_type_, dev_id);
@@ -773,7 +801,8 @@ class BuddyAllocatorList {
                       new detail::CustomAllocator(device_type_, dev_id)),
                   phi::DeviceManager::GetMinChunkSize(place),
                   phi::DeviceManager::GetMaxChunkSize(place),
-                  phi::DeviceManager::GetExtraPaddingSize(place), device_type_));
+                  phi::DeviceManager::GetExtraPaddingSize(place),
+                  device_type_));
     });
 
     return allocators_[dev_id].get();
@@ -813,8 +842,11 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
     PADDLE_THROW(platform::errors::ResourceExhausted(
         "Cannot allocate %s in %s:%d, available %s, total %s, used "
         "%s. ",
-        string::HumanReadableSize(size), place.GetDeviceType(), place.device,
-        string::HumanReadableSize(avail), string::HumanReadableSize(total),
+        string::HumanReadableSize(size),
+        place.GetDeviceType(),
+        place.device,
+        string::HumanReadableSize(avail),
+        string::HumanReadableSize(total),
         string::HumanReadableSize(total - avail)));
   } else {
     if (FLAGS_init_allocated_mem) {
@@ -830,7 +862,8 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
 }
 
 template <>
-void Free<platform::CustomPlace>(const platform::CustomPlace &place, void *p,
+void Free<platform::CustomPlace>(const platform::CustomPlace &place,
+                                 void *p,
                                  size_t size) {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
@@ -922,8 +955,6 @@ namespace allocation {
 phi::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
   void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size));
   auto *tmp_alloc = new Allocation(ptr, size, place_);
-  platform::MemEvenRecorder::Instance().PushMemRecord(
-      static_cast<void *>(tmp_alloc), place_, size);
   return tmp_alloc;
 }
@@ -931,8 +962,6 @@ void NaiveBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
   paddle::platform::VisitPlace(
       allocation->place(),
       legacy::FreeVisitor(allocation->ptr(), allocation->size()));
-  platform::MemEvenRecorder::Instance().PopMemRecord(
-      static_cast<void *>(allocation), place_);
   delete allocation;
 }
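Note on the two deletions above: NaiveBestFitAllocator stops pairing a PushMemRecord with a PopMemRecord per allocation; with this PR the tracing for these paths presumably comes from the RecordMemEvent calls added in the files below, so each alloc/free is reported as a single profiler event rather than through stateful push/pop bookkeeping. A minimal sketch contrasting the two styles; both signatures are simplified stand-ins, not Paddle's real MemEvenRecorder or RecordMemEvent interfaces:

// Hypothetical contrast of the two recording styles (simplified).
#include <cstddef>
#include <cstdio>
#include <map>

// Old style sketch: a stateful recorder pairs a push at allocation time
// with a pop at free time, keyed by the allocation pointer.
struct PushPopRecorder {
  std::map<void*, size_t> live;
  void PushMemRecord(void* p, size_t n) { live[p] = n; }
  void PopMemRecord(void* p) { live.erase(p); }
};

// New style sketch: each alloc/free immediately emits one event for the
// profiler; no pairing state is kept by the caller.
enum class TracerMemEventType { Allocate, Free };
void RecordMemEvent(void* p, size_t n, TracerMemEventType t) {
  std::printf("%s ptr=%p size=%zu\n",
              t == TracerMemEventType::Allocate ? "Allocate" : "Free", p, n);
}

int main() {
  int buf = 0;
  PushPopRecorder recorder;
  recorder.PushMemRecord(&buf, sizeof buf);  // old: record on alloc ...
  recorder.PopMemRecord(&buf);               // ... and erase on free

  RecordMemEvent(&buf, sizeof buf, TracerMemEventType::Allocate);  // new
  RecordMemEvent(&buf, sizeof buf, TracerMemEventType::Free);
  return 0;
}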
...
@@ -15,6 +15,7 @@
 #include "paddle/fluid/memory/allocation/pinned_allocator.h"
 
 #include "paddle/fluid/memory/stats.h"
+#include "paddle/fluid/platform/profiler/mem_tracing.h"
 
 namespace paddle {
 namespace memory {
 namespace allocation {
@@ -26,6 +27,10 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
   PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr()));
 #endif
   HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size());
+  platform::RecordMemEvent(allocation->ptr(),
+                           allocation->place(),
+                           allocation->size(),
+                           platform::TracerMemEventType::ReservedFree);
   delete allocation;
 }
 
 phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
@@ -36,6 +41,10 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
   PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
 #endif
   HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
+  platform::RecordMemEvent(ptr,
+                           platform::CUDAPinnedPlace(),
+                           size,
+                           platform::TracerMemEventType::ReservedAllocate);
   return new Allocation(ptr, size, platform::CUDAPinnedPlace());
 }
 }  // namespace allocation
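The pinned allocator emits ReservedAllocate/ReservedFree next to the existing HOST_MEMORY_STAT_UPDATE(Reserved, ...) calls: cudaHostAlloc and cudaFreeHost change how much host memory is reserved from the system, which this PR appears to track separately from what the framework later hands out as Allocate/Free (see the StatAllocator changes below). A toy model of the two counters, under that assumed semantics:

// Two-level accounting sketch (assumption): "reserved" counts bytes obtained
// from the OS/driver, "allocated" counts bytes handed to framework consumers.
// Each event type in the diff feeds exactly one of the two counters.
#include <cstdint>
#include <cstdio>

struct MemCounters {
  int64_t reserved = 0;
  int64_t allocated = 0;
};

int main() {
  MemCounters c;
  c.reserved += 1 << 20;  // ReservedAllocate: e.g. a 1 MiB cudaHostAlloc
  c.allocated += 4096;    // Allocate: a tensor takes 4 KiB out of the pool
  c.allocated -= 4096;    // Free: the tensor is released back to the pool
  c.reserved -= 1 << 20;  // ReservedFree: the pinned block returns to the OS
  std::printf("reserved=%lld allocated=%lld\n",
              static_cast<long long>(c.reserved),
              static_cast<long long>(c.allocated));
  return 0;
}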
...
@@ -16,6 +16,7 @@
 #include "paddle/fluid/memory/allocation/allocator.h"
 #include "paddle/fluid/memory/stats.h"
+#include "paddle/fluid/platform/profiler/mem_tracing.h"
 
 namespace paddle {
 namespace memory {
@@ -30,14 +31,18 @@ class StatAllocator : public Allocator {
  protected:
   void FreeImpl(phi::Allocation* allocation) override {
-    if (platform::is_cpu_place(allocation->place())) {
-      HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-                              -allocation->size());
+    if (platform::is_cpu_place(allocation->place()) ||
+        platform::is_cuda_pinned_place(allocation->place())) {
+      HOST_MEMORY_STAT_UPDATE(
+          Allocated, allocation->place().GetDeviceId(), -allocation->size());
     } else {
-      DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-                                -allocation->size());
+      DEVICE_MEMORY_STAT_UPDATE(
+          Allocated, allocation->place().GetDeviceId(), -allocation->size());
     }
+    platform::RecordMemEvent(allocation->ptr(),
+                             allocation->place(),
+                             allocation->size(),
+                             platform::TracerMemEventType::Free);
     underlying_allocator_->Free(allocation);
   }
@@ -48,12 +53,16 @@ class StatAllocator : public Allocator {
     const platform::Place& place = allocation->place();
     if (platform::is_cpu_place(place) ||
         platform::is_cuda_pinned_place(place)) {
-      HOST_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(),
-                              allocation->size());
+      HOST_MEMORY_STAT_UPDATE(
+          Allocated, place.GetDeviceId(), allocation->size());
     } else {
-      DEVICE_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(),
-                                allocation->size());
+      DEVICE_MEMORY_STAT_UPDATE(
+          Allocated, place.GetDeviceId(), allocation->size());
     }
+    platform::RecordMemEvent(allocation->ptr(),
+                             allocation->place(),
+                             allocation->size(),
+                             platform::TracerMemEventType::Allocate);
     return allocation.release();
   }
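StatAllocator wraps another allocator and, with this change, reports a TracerMemEventType::Allocate/Free event around every call it forwards, so any underlying allocator gets traced without modifying its own code. A standalone sketch of that decorator shape; Allocator, RecordMemEvent and the event enum here are simplified stand-ins, not the real interfaces:

// Decorator-allocator sketch: tracing added by wrapping, not by editing
// each concrete allocator.
#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <memory>
#include <utility>

enum class TracerMemEventType { Allocate, Free };
void RecordMemEvent(void* p, size_t n, TracerMemEventType t) {
  std::printf("%s ptr=%p size=%zu\n",
              t == TracerMemEventType::Allocate ? "Allocate" : "Free", p, n);
}

struct Allocator {
  virtual ~Allocator() = default;
  virtual void* Alloc(size_t n) = 0;
  virtual void Free(void* p, size_t n) = 0;
};

// A concrete allocator that knows nothing about tracing.
struct MallocAllocator : Allocator {
  void* Alloc(size_t n) override { return std::malloc(n); }
  void Free(void* p, size_t) override { std::free(p); }
};

// The decorator: one event per forwarded call, mirroring how StatAllocator
// updates stats and records an event around underlying_allocator_.
struct StatAllocator : Allocator {
  explicit StatAllocator(std::unique_ptr<Allocator> u)
      : underlying_(std::move(u)) {}
  void* Alloc(size_t n) override {
    void* p = underlying_->Alloc(n);
    RecordMemEvent(p, n, TracerMemEventType::Allocate);
    return p;
  }
  void Free(void* p, size_t n) override {
    RecordMemEvent(p, n, TracerMemEventType::Free);
    underlying_->Free(p, n);
  }
  std::unique_ptr<Allocator> underlying_;
};

int main() {
  StatAllocator alloc(std::make_unique<MallocAllocator>());
  void* p = alloc.Alloc(256);
  alloc.Free(p, 256);
  return 0;
}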
...
@@ -41,6 +41,7 @@ limitations under the License. */
 #endif
 #include "paddle/fluid/platform/device/device_wrapper.h"
+#include "paddle/fluid/platform/profiler/mem_tracing.h"
 
 DECLARE_bool(use_pinned_memory);
 DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -64,11 +65,13 @@ void* AlignedMalloc(size_t size) {
 #else
   int error = posix_memalign(&p, alignment, size);
   PADDLE_ENFORCE_EQ(
-      error, 0,
+      error,
+      0,
       platform::errors::ResourceExhausted(
           "Fail to alloc memory of %ld size, error code is %d.", size, error));
 #endif
-  PADDLE_ENFORCE_NOT_NULL(p, platform::errors::ResourceExhausted(
-                                 "Fail to alloc memory of %ld size.", size));
+  PADDLE_ENFORCE_NOT_NULL(p,
+                          platform::errors::ResourceExhausted(
+                              "Fail to alloc memory of %ld size.", size));
   return p;
 }
@@ -95,7 +98,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
   }
 
   HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
+  platform::RecordMemEvent(
+      p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
   return p;
 }
@@ -114,6 +118,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
 #endif
 
   HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
+  platform::RecordMemEvent(
+      p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
 }
 
 bool CPUAllocator::UseGpu() const { return false; }
@@ -146,7 +152,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
         "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
         "maximum GPU memory usage is limited to %d MB.\n"
         "       The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
-        limit_size, limit_size);
+        limit_size,
+        limit_size);
   }
 
   PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
@@ -161,21 +168,29 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
       "please set it to a higher value but less than 1.0.\n"
       "      The command is "
       "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
-      gpu_id_, string::HumanReadableSize(size), gpu_id_,
-      string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
-      gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg));
+      gpu_id_,
+      string::HumanReadableSize(size),
+      gpu_id_,
+      string::HumanReadableSize(allocated),
+      string::HumanReadableSize(avail),
+      gpu_id_,
+      FLAGS_fraction_of_gpu_memory_to_use,
+      err_msg));
   }
 }
 
 void GPUAllocator::Free(void* p, size_t size, size_t index) {
-  PADDLE_ENFORCE_EQ(index, 0,
+  PADDLE_ENFORCE_EQ(index,
+                    0,
                     platform::errors::InvalidArgument(
                         "The index should be 0, index is %d", index));
-  PADDLE_ENFORCE_GE(gpu_alloc_size_, size,
+  PADDLE_ENFORCE_GE(gpu_alloc_size_,
+                    size,
                     platform::errors::InvalidArgument(
                         "The size of memory (%d) to free exceeds the size of "
                         "allocated gpu memory (%d)",
-                        size, gpu_alloc_size_));
+                        size,
+                        gpu_alloc_size_));
   gpu_alloc_size_ -= size;
   platform::RecordedGpuFree(p, size, gpu_id_);
@@ -213,6 +228,8 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
     *index = 1;  // PINNED memory
     cuda_pinnd_alloc_size_ += size;
     HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
+    platform::RecordMemEvent(
+        p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
     return p;
   } else {
     LOG(WARNING) << "cudaHostAlloc failed.";
@@ -224,21 +241,25 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
 void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
   gpuError_t err;
-  PADDLE_ENFORCE_EQ(index, 1,
+  PADDLE_ENFORCE_EQ(index,
+                    1,
                     platform::errors::InvalidArgument(
                         "The index should be 1, but got %d", index));
 
-  PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size,
+  PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_,
+                    size,
                     platform::errors::InvalidArgument(
                         "The size of memory (%d) to free exceeds the size of "
                         "allocated cuda pinned memory (%d)",
-                        size, cuda_pinnd_alloc_size_));
+                        size,
+                        cuda_pinnd_alloc_size_));
   cuda_pinnd_alloc_size_ -= size;
 #ifdef PADDLE_WITH_HIP
   err = hipHostFree(p);
   if (err != hipErrorDeinitialized) {
     PADDLE_ENFORCE_EQ(
-        err, hipSuccess,
+        err,
+        hipSuccess,
         platform::errors::Fatal(
             "hipFreeHost failed in GPUPinnedAllocator, error code is %d", err));
   }
@@ -252,13 +273,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
   // cudaFreeHost succeeds.
   if (err != cudaErrorCudartUnloading) {
     PADDLE_ENFORCE_EQ(
-        err, 0,
+        err,
+        0,
         platform::errors::Fatal(
             "cudaFreeHost failed in GPUPinnedAllocator, error code is %d",
             err));
   }
 #endif
   HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
+  platform::RecordMemEvent(
+      p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
 }
 
 bool CUDAPinnedAllocator::UseGpu() const { return false; }
@@ -289,7 +313,8 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
         "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
         "maximum GPU memory usage is limited to %d MB.\n"
         "       The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
-        limit_size, limit_size);
+        limit_size,
+        limit_size);
   }
 
   PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
@@ -304,22 +329,29 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
       "please set it to a higher value but less than 1.0.\n"
      "      The command is "
      "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
-      npu_id_, string::HumanReadableSize(size), npu_id_,
-      string::HumanReadableSize(avail), npu_id_,
-      FLAGS_fraction_of_gpu_memory_to_use, err_msg));
+      npu_id_,
+      string::HumanReadableSize(size),
+      npu_id_,
+      string::HumanReadableSize(avail),
+      npu_id_,
+      FLAGS_fraction_of_gpu_memory_to_use,
+      err_msg));
   }
 }
 
 void NPUAllocator::Free(void* p, size_t size, size_t index) {
   VLOG(4) << "Free " << p << " size " << size;
-  PADDLE_ENFORCE_EQ(index, 0,
+  PADDLE_ENFORCE_EQ(index,
+                    0,
                     platform::errors::InvalidArgument(
                         "The index should be 0, index is %d", index));
-  PADDLE_ENFORCE_GE(npu_alloc_size_, size,
+  PADDLE_ENFORCE_GE(npu_alloc_size_,
+                    size,
                     platform::errors::InvalidArgument(
                         "The size of memory (%d) to free exceeds the size of "
                         "allocated gpu memory (%d)",
-                        size, npu_alloc_size_));
+                        size,
+                        npu_alloc_size_));
   npu_alloc_size_ -= size;
   platform::RecordedNPUFree(p, size, npu_id_);
@@ -358,21 +390,25 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
 void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
   aclError err;
-  PADDLE_ENFORCE_EQ(index, 1,
+  PADDLE_ENFORCE_EQ(index,
+                    1,
                     platform::errors::InvalidArgument(
                         "The index should be 1, but got %d", index));
 
-  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size,
+  PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_,
+                    size,
                     platform::errors::InvalidArgument(
                         "The size of memory (%d) to free exceeds the size of "
                         "allocated npu pinned memory (%d)",
-                        size, npu_pinnd_alloc_size_));
+                        size,
+                        npu_pinnd_alloc_size_));
   npu_pinnd_alloc_size_ -= size;
   err = platform::NPUHostFree(p);
 
   if (err != ACL_ERROR_NONE) {
     PADDLE_ENFORCE_EQ(
-        err, 0,
+        err,
+        0,
         platform::errors::Fatal(
             "NPUHostFree failed in NPUPinnedAllocator, error code is %d", err));
   }
@@ -407,7 +443,8 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
         "larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
         "maximum MLU memory usage is limited to %d MB.\n"
         "       The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
-        limit_size, limit_size);
+        limit_size,
+        limit_size);
   }
 
   PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
@@ -422,21 +459,29 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
      "please set it to a higher value but less than 1.0.\n"
      "      The command is "
      "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
-      mlu_id_, string::HumanReadableSize(size), mlu_id_,
-      string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
-      mlu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg));
+      mlu_id_,
+      string::HumanReadableSize(size),
+      mlu_id_,
+      string::HumanReadableSize(allocated),
+      string::HumanReadableSize(avail),
+      mlu_id_,
+      FLAGS_fraction_of_gpu_memory_to_use,
+      err_msg));
   }
 }
 
 void MLUAllocator::Free(void* p, size_t size, size_t index) {
-  PADDLE_ENFORCE_EQ(index, 0,
+  PADDLE_ENFORCE_EQ(index,
+                    0,
                     platform::errors::InvalidArgument(
                         "The index should be 0, index is %d", index));
-  PADDLE_ENFORCE_GE(mlu_alloc_size_, size,
+  PADDLE_ENFORCE_GE(mlu_alloc_size_,
+                    size,
                     platform::errors::InvalidArgument(
                         "The size of memory (%d) to free exceeds the size of "
                         "allocated gpu memory (%d)",
-                        size, mlu_alloc_size_));
+                        size,
+                        mlu_alloc_size_));
   mlu_alloc_size_ -= size;
   platform::RecordedMLUFree(p, size, mlu_id_);
@@ -465,7 +510,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
         "\n\nOut of memory error on %s %d. "
         "total memory is %s, used memory is %s, "
         "available memory is only %s.\n\n",
-        dev_type_, dev_id_, string::HumanReadableSize(total),
+        dev_type_,
+        dev_id_,
+        string::HumanReadableSize(total),
         string::HumanReadableSize(total - avail),
         string::HumanReadableSize(avail)));
   }
@@ -474,14 +521,17 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
 void CustomAllocator::Free(void* p, size_t size, size_t index) {
   VLOG(4) << "CustomAllocator::Free " << p << " size " << size;
-  PADDLE_ENFORCE_EQ(index, 0,
+  PADDLE_ENFORCE_EQ(index,
+                    0,
                     platform::errors::InvalidArgument(
                         "The index should be 0, index is %d", index));
-  PADDLE_ENFORCE_GE(plug_alloc_size, size,
+  PADDLE_ENFORCE_GE(plug_alloc_size,
+                    size,
                     platform::errors::InvalidArgument(
                         "The size of memory (%d) to free exceeds the size of "
                         "allocated gpu memory (%d)",
-                        size, plug_alloc_size));
+                        size,
+                        plug_alloc_size));
   plug_alloc_size -= size;
   auto place = platform::CustomPlace(dev_type_, dev_id_);
   auto device = phi::DeviceManager::GetDeviceWithPlace(place);
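Most of the system_allocator.cc hunks above are mechanical one-argument-per-line rewrapping of existing PADDLE_ENFORCE_* checks; the substantive additions are the RecordMemEvent(..., ReservedAllocate/ReservedFree) calls placed beside each HOST_MEMORY_STAT_UPDATE(Reserved, ...). The enforce checks in every Free() guard the same simple invariant: an allocator must not free more bytes than it still has outstanding. A compact sketch of that bookkeeping, with assert standing in for the enforce macro:

// Mirrors the pattern `PADDLE_ENFORCE_GE(alloc_size_, size, ...);
// alloc_size_ -= size;` used by the GPU/NPU/MLU/Custom Free() paths above.
#include <cassert>
#include <cstddef>

class TrackedAllocatorSize {
 public:
  void OnAlloc(size_t n) { outstanding_ += n; }
  void OnFree(size_t n) {
    assert(n <= outstanding_ && "freeing more than was allocated");
    outstanding_ -= n;
  }
  size_t outstanding() const { return outstanding_; }

 private:
  size_t outstanding_ = 0;
};

int main() {
  TrackedAllocatorSize gpu_alloc_size;
  gpu_alloc_size.OnAlloc(1024);
  gpu_alloc_size.OnFree(512);
  gpu_alloc_size.OnFree(512);  // back to zero; a larger free would assert
  return gpu_alloc_size.outstanding() == 0 ? 0 : 1;
}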
...
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/profiler/event_tracing.h"
 #include "paddle/phi/common/place.h"
 
 #ifdef PADDLE_WITH_XPU
@@ -33,8 +33,12 @@ namespace memory {
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
 template <>
 void Copy<platform::CPUPlace, platform::CustomPlace>(
-    platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place,
-    const void* src, size_t num, void* stream) {
+    platform::CPUPlace dst_place,
+    void* dst,
+    platform::CustomPlace src_place,
+    const void* src,
+    size_t num,
+    void* stream) {
   if (UNLIKELY(num == 0)) return;
 
   auto src_type = platform::PlaceHelper::GetDeviceType(src_place);
@@ -52,8 +56,12 @@ void Copy<platform::CPUPlace, platform::CustomPlace>(
 template <>
 void Copy<platform::CustomPlace, platform::CPUPlace>(
-    platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place,
-    const void* src, size_t num, void* stream) {
+    platform::CustomPlace dst_place,
+    void* dst,
+    platform::CPUPlace src_place,
+    const void* src,
+    size_t num,
+    void* stream) {
   if (UNLIKELY(num == 0)) return;
 
   auto src_type = platform::PlaceHelper::GetDeviceType(src_place);
   auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place);
@@ -70,8 +78,12 @@ void Copy<platform::CustomPlace, platform::CPUPlace>(
 template <>
 void Copy<platform::CustomPlace, platform::CustomPlace>(
-    platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place,
-    const void* src, size_t num, void* stream) {
+    platform::CustomPlace dst_place,
+    void* dst,
+    platform::CustomPlace src_place,
+    const void* src,
+    size_t num,
+    void* stream) {
   if (UNLIKELY(num == 0)) return;
 
   auto src_type = platform::PlaceHelper::GetDeviceType(src_place);
@@ -102,9 +114,11 @@ void Copy<platform::CustomPlace, platform::CustomPlace>(
 #endif  // PADDLE_WITH_CUSTOM_DEVICE
 
 template <>
-void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
+void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace,
+                                                  void* dst,
                                                   platform::CPUPlace,
-                                                  const void* src, size_t num) {
+                                                  const void* src,
+                                                  size_t num) {
   if (UNLIKELY(num == 0)) return;
   VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num;
   std::memcpy(dst, src, num);
@@ -115,7 +129,8 @@ template <>
 void Copy<platform::IPUPlace, platform::CPUPlace>(platform::IPUPlace dst_place,
                                                   void* dst,
                                                   platform::CPUPlace src_place,
-                                                  const void* src, size_t num) {
+                                                  const void* src,
+                                                  size_t num) {
   if (UNLIKELY(num == 0)) return;
   std::memcpy(dst, src, num);
 }
@@ -123,7 +138,8 @@ template <>
 void Copy<platform::CPUPlace, platform::IPUPlace>(platform::CPUPlace dst_place,
                                                   void* dst,
                                                   platform::IPUPlace src_place,
-                                                  const void* src, size_t num) {
+                                                  const void* src,
+                                                  size_t num) {
   if (UNLIKELY(num == 0)) return;
   std::memcpy(dst, src, num);
 }
@@ -131,15 +147,18 @@ template <>
 void Copy<platform::IPUPlace, platform::IPUPlace>(platform::IPUPlace dst_place,
                                                   void* dst,
                                                   platform::IPUPlace src_place,
-                                                  const void* src, size_t num) {
+                                                  const void* src,
+                                                  size_t num) {
   if (UNLIKELY(num == 0)) return;
   std::memcpy(dst, src, num);
 }
 
 // NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace).
 template <>
-void Copy<phi::IPUPlace, phi::Place>(phi::IPUPlace dst_place, void* dst,
-                                     phi::Place src_place, const void* src,
+void Copy<phi::IPUPlace, phi::Place>(phi::IPUPlace dst_place,
+                                     void* dst,
+                                     phi::Place src_place,
+                                     const void* src,
                                      size_t num) {
   if (src_place.GetType() == phi::AllocationType::CPU) {
     platform::CPUPlace place_src;
@@ -152,8 +171,10 @@ void Copy<phi::IPUPlace, phi::Place>(phi::IPUPlace dst_place, void* dst,
 // NOTE: only for (IPUPlace) -> (CPUPlace and IPUPlace).
 template <>
-void Copy<phi::Place, phi::IPUPlace>(phi::Place dst_place, void* dst,
-                                     phi::IPUPlace src_place, const void* src,
+void Copy<phi::Place, phi::IPUPlace>(phi::Place dst_place,
+                                     void* dst,
+                                     phi::IPUPlace src_place,
+                                     const void* src,
                                      size_t num) {
   if (dst_place.GetType() == phi::AllocationType::CPU) {
     platform::CPUPlace place_dst;
@@ -170,7 +191,8 @@ template <>
 void Copy<platform::XPUPlace, platform::CPUPlace>(platform::XPUPlace dst_place,
                                                   void* dst,
                                                   platform::CPUPlace src_place,
-                                                  const void* src, size_t num) {
+                                                  const void* src,
+                                                  size_t num) {
   if (num <= 0) {
     VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
     return;
@@ -182,7 +204,8 @@ template <>
 void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
                                                   void* dst,
                                                   platform::XPUPlace src_place,
-                                                  const void* src, size_t num) {
+                                                  const void* src,
+                                                  size_t num) {
   if (num <= 0) {
     VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
     return;
@@ -194,7 +217,8 @@ template <>
 void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
                                                   void* dst,
                                                   platform::XPUPlace src_place,
-                                                  const void* src, size_t num) {
+                                                  const void* src,
+                                                  size_t num) {
   if (num <= 0) {
     VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
     return;
@@ -204,8 +228,10 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
 // NOTE: only for (CPUPlace and XPUPlace) -> (XPUPlace).
 template <>
-void Copy<phi::XPUPlace, phi::Place>(phi::XPUPlace dst_place, void* dst,
-                                     phi::Place src_place, const void* src,
+void Copy<phi::XPUPlace, phi::Place>(phi::XPUPlace dst_place,
+                                     void* dst,
+                                     phi::Place src_place,
+                                     const void* src,
                                      size_t num) {
   if (src_place.GetType() == phi::AllocationType::CPU) {
     platform::CPUPlace place_src;
@@ -218,8 +244,10 @@ void Copy<phi::XPUPlace, phi::Place>(phi::XPUPlace dst_place, void* dst,
 // NOTE: only for (XPUPlace) -> (CPUPlace and XPUPlace).
 template <>
-void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place, void* dst,
-                                     phi::XPUPlace src_place, const void* src,
+void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place,
+                                     void* dst,
+                                     phi::XPUPlace src_place,
+                                     const void* src,
                                      size_t num) {
   if (dst_place.GetType() == phi::AllocationType::CPU) {
     platform::CPUPlace place_dst;
@@ -236,7 +264,8 @@ template <>
 void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
                                                   void* dst,
                                                   platform::CPUPlace src_place,
-                                                  const void* src, size_t num,
+                                                  const void* src,
+                                                  size_t num,
                                                   void* stream) {
   if (UNLIKELY(num == 0)) return;
@@ -248,7 +277,10 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
   if (stream) {
     platform::RecordEvent record_event(
         "NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE,
+    platform::NPUMemcpyAsync(dst,
+                             src,
+                             num,
+                             ACL_MEMCPY_HOST_TO_DEVICE,
                              reinterpret_cast<aclrtStream>(stream));
   } else {
     // On NPU, async operation after sync operation is ok, while sync operation
@@ -267,7 +299,8 @@ template <>
 void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
                                                   void* dst,
                                                   platform::NPUPlace src_place,
-                                                  const void* src, size_t num,
+                                                  const void* src,
+                                                  size_t num,
                                                   void* stream) {
   if (UNLIKELY(num == 0)) return;
@@ -279,7 +312,10 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
   if (stream) {
     platform::RecordEvent record_event(
         "NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
-    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST,
+    platform::NPUMemcpyAsync(dst,
+                             src,
+                             num,
+                             ACL_MEMCPY_DEVICE_TO_HOST,
                              reinterpret_cast<aclrtStream>(stream));
   } else {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
@@ -295,7 +331,8 @@ template <>
 void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
                                                   void* dst,
                                                   platform::NPUPlace src_place,
-                                                  const void* src, size_t num,
+                                                  const void* src,
+                                                  size_t num,
                                                   void* stream) {
   if (UNLIKELY(num == 0)) return;
@@ -307,7 +344,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
     platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU",
                                        platform::TracerEventType::UserDefined,
                                        1);
-    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
+    platform::NPUMemcpyAsync(dst,
+                             src,
+                             num,
+                             ACL_MEMCPY_DEVICE_TO_DEVICE,
                              reinterpret_cast<aclrtStream>(stream));
   } else {
     platform::DeviceContextPool& pool =
@@ -329,7 +369,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
     platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU",
                                        platform::TracerEventType::UserDefined,
                                        1);
-    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
+    platform::NPUMemcpyAsync(dst,
+                             src,
+                             num,
+                             ACL_MEMCPY_DEVICE_TO_DEVICE,
                              reinterpret_cast<aclrtStream>(stream));
   } else {
     platform::DeviceContextPool& pool =
@@ -346,8 +389,11 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
 template <>
 void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
-    platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place,
-    const void* src, size_t num) {
+    platform::CPUPlace dst_place,
+    void* dst,
+    platform::NPUPinnedPlace src_place,
+    const void* src,
+    size_t num) {
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
           << dst_place;
   if (UNLIKELY(num == 0)) return;
@@ -356,8 +402,11 @@ void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
 template <>
 void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
-    platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place,
-    const void* src, size_t num) {
+    platform::NPUPinnedPlace dst_place,
+    void* dst,
+    platform::CPUPlace src_place,
+    const void* src,
+    size_t num) {
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
           << dst_place;
   if (UNLIKELY(num == 0)) return;
@@ -366,8 +415,11 @@ void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
 template <>
 void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
-    platform::NPUPinnedPlace dst_place, void* dst,
-    platform::NPUPinnedPlace src_place, const void* src, size_t num) {
+    platform::NPUPinnedPlace dst_place,
+    void* dst,
+    platform::NPUPinnedPlace src_place,
+    const void* src,
+    size_t num) {
   VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
           << dst_place;
   if (UNLIKELY(num == 0)) return;
@@ -376,8 +428,12 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
 template <>
 void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
-    platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place,
-    const void* src, size_t num, void* stream) {
+    platform::NPUPinnedPlace dst_place,
+    void* dst,
+    platform::NPUPlace src_place,
+    const void* src,
+    size_t num,
+    void* stream) {
   if (UNLIKELY(num == 0)) return;
 
   platform::SetNPUDeviceId(src_place.device);
@@ -389,7 +445,10 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
     platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned",
                                        platform::TracerEventType::UserDefined,
                                        1);
-    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST,
+    platform::NPUMemcpyAsync(dst,
+                             src,
+                             num,
+                             ACL_MEMCPY_DEVICE_TO_HOST,
                              reinterpret_cast<aclrtStream>(stream));
   } else {
     platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
@@ -404,8 +463,12 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
 template <>
 void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
-    platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place,
-    const void* src, size_t num, void* stream) {
+    platform::NPUPlace dst_place,
+    void* dst,
+    platform::NPUPinnedPlace src_place,
+    const void* src,
+    size_t num,
+    void* stream) {
   if (UNLIKELY(num == 0)) return;
 
   platform::SetNPUDeviceId(dst_place.device);
@@ -417,7 +480,10 @@ void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
     platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU",
                                        platform::TracerEventType::UserDefined,
                                        1);
-    platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE,
+    platform::NPUMemcpyAsync(dst,
+                             src,
+                             num,
+                             ACL_MEMCPY_HOST_TO_DEVICE,
                              reinterpret_cast<aclrtStream>(stream));
   } else {
     // On NPU, async operation after sync operation is ok, while sync operation
@@ -435,9 +501,12 @@ void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
 // NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace.
 template <>
-void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
-                                  phi::Place src_place, const void* src,
-                                  size_t num, aclrtStream stream) {
+void Copy<phi::Place, phi::Place>(phi::Place dst_place,
+                                  void* dst,
+                                  phi::Place src_place,
+                                  const void* src,
+                                  size_t num,
+                                  aclrtStream stream) {
   if (src_place.GetType() == phi::AllocationType::CPU &&
       dst_place.GetType() == phi::AllocationType::CPU) {
     platform::CPUPlace place_dst, place_src;
@@ -504,52 +573,76 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
 // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace).
 template <>
-void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
-                                     phi::Place src_place, const void* src,
-                                     size_t num, aclrtStream stream) {
+void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
+                                     void* dst,
+                                     phi::Place src_place,
+                                     const void* src,
+                                     size_t num,
+                                     aclrtStream stream) {
   Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
 }
 
 // NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace).
 template <>
-void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
-                                     phi::CPUPlace src_place, const void* src,
-                                     size_t num, aclrtStream stream) {
+void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
+                                     void* dst,
+                                     phi::CPUPlace src_place,
+                                     const void* src,
+                                     size_t num,
+                                     aclrtStream stream) {
   Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
 }
 
 // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace)
 template <>
-void Copy<phi::NPUPlace, phi::Place>(phi::NPUPlace dst_place, void* dst,
-                                     phi::Place src_place, const void* src,
-                                     size_t num, aclrtStream stream) {
-  Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place,
-       src, num, stream);
+void Copy<phi::NPUPlace, phi::Place>(phi::NPUPlace dst_place,
+                                     void* dst,
+                                     phi::Place src_place,
+                                     const void* src,
+                                     size_t num,
+                                     aclrtStream stream) {
+  Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
+       dst,
+       src_place,
+       src,
+       num,
+       stream);
 }
 
 // NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace)
 template <>
-void Copy<phi::Place, phi::NPUPlace>(phi::Place dst_place, void* dst,
-                                     phi::NPUPlace src_place, const void* src,
-                                     size_t num, aclrtStream stream) {
-  Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()),
-       src, num, stream);
+void Copy<phi::Place, phi::NPUPlace>(phi::Place dst_place,
+                                     void* dst,
+                                     phi::NPUPlace src_place,
+                                     const void* src,
+                                     size_t num,
+                                     aclrtStream stream) {
+  Copy(dst_place,
+       dst,
+       phi::Place(src_place.GetType(), src_place.GetDeviceId()),
+       src,
+       num,
+       stream);
 }
 
 // NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace)
 template <>
 void Copy<phi::NPUPinnedPlace, phi::Place>(phi::NPUPinnedPlace dst_place,
-                                           void* dst, phi::Place src_place,
-                                           const void* src, size_t num,
+                                           void* dst,
+                                           phi::Place src_place,
+                                           const void* src,
+                                           size_t num,
                                            aclrtStream stream) {
   Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
 }
 
 // NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace)
 template <>
-void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place, void* dst,
+void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place,
+                                           void* dst,
                                            phi::NPUPinnedPlace src_place,
-                                           const void* src, size_t num,
+                                           const void* src,
+                                           size_t num,
                                            aclrtStream stream) {
   Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
 }
@@ -557,16 +650,20 @@ void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace) -> (NPUPinnedPlace)
template <>
void Copy<phi::NPUPinnedPlace, phi::Place>(phi::NPUPinnedPlace dst_place,
                                           void* dst,
                                           phi::Place src_place,
                                           const void* src,
                                           size_t num) {
  Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr);
}

// NOTE: only for (NPUPinnedPlace) -> (CPUPlace)
template <>
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place,
                                           void* dst,
                                           phi::NPUPinnedPlace src_place,
                                           const void* src,
                                           size_t num) {
  Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr);
}
#endif
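
Every NPU specialization above is a thin trampoline: it re-wraps its strongly typed place into a generic phi::Place (preserving the device id where one exists) and forwards to the single place-keyed Copy. A minimal standalone sketch of that trampoline pattern; all names below are invented for illustration and this is not Paddle code:

#include <cstddef>
#include <cstdio>

enum class AllocationType { CPU, NPU, NPUPINNED };

struct Place {  // generic runtime-tagged place, like phi::Place
  AllocationType type;
  int device_id = 0;
};

struct NPUPlace {  // strongly typed place, like phi::NPUPlace
  int device_id = 0;
};

// The one place-keyed implementation every typed wrapper funnels into.
void CopyImpl(Place dst, void* dst_ptr, Place src, const void* src_ptr,
              size_t num) {
  std::printf("copy %zu bytes: type %d:%d -> type %d:%d\n", num,
              static_cast<int>(src.type), src.device_id,
              static_cast<int>(dst.type), dst.device_id);
}

// Typed wrapper: erase NPUPlace into Place and forward, mirroring the
// Copy<phi::Place, phi::NPUPlace> specialization above.
void CopyFromNPU(Place dst, void* d, NPUPlace src, const void* s, size_t n) {
  CopyImpl(dst, d, Place{AllocationType::NPU, src.device_id}, s, n);
}

int main() {
  char src_buf[8] = {0}, dst_buf[8] = {0};
  CopyFromNPU(Place{AllocationType::CPU}, dst_buf, NPUPlace{0}, src_buf,
              sizeof(dst_buf));
}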
@@ -608,8 +705,12 @@ inline void SyncCUDAStream() {
template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
    platform::CPUPlace dst_place,
    void* dst,
    platform::CUDAPlace src_place,
    const void* src,
    size_t num,
    void* stream) {
  if (UNLIKELY(num == 0)) return;
  platform::SetDeviceId(src_place.device);
@@ -619,10 +720,16 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
    platform::RecordEvent record_event(
        "GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1);
#ifdef PADDLE_WITH_HIP
    platform::GpuMemcpyAsync(dst,
                             src,
                             num,
                             hipMemcpyDeviceToHost,
                             reinterpret_cast<gpuStream_t>(stream));
#else
    platform::GpuMemcpyAsync(dst,
                             src,
                             num,
                             cudaMemcpyDeviceToHost,
                             reinterpret_cast<gpuStream_t>(stream));
#endif
  } else {
@@ -642,8 +749,12 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
template <>
void Copy<platform::CUDAPlace, platform::CPUPlace>(
    platform::CUDAPlace dst_place,
    void* dst,
    platform::CPUPlace src_place,
    const void* src,
    size_t num,
    void* stream) {
  if (UNLIKELY(num == 0)) return;
  platform::SetDeviceId(dst_place.device);
@@ -653,10 +764,16 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
    platform::RecordEvent record_event(
        "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1);
#ifdef PADDLE_WITH_HIP
    platform::GpuMemcpyAsync(dst,
                             src,
                             num,
                             hipMemcpyHostToDevice,
                             reinterpret_cast<gpuStream_t>(stream));
#else
    platform::GpuMemcpyAsync(dst,
                             src,
                             num,
                             cudaMemcpyHostToDevice,
                             reinterpret_cast<gpuStream_t>(stream));
#endif
  } else {
@@ -676,8 +793,12 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
template <>
void Copy<platform::CUDAPlace, platform::CUDAPlace>(
    platform::CUDAPlace dst_place,
    void* dst,
    platform::CUDAPlace src_place,
    const void* src,
    size_t num,
    void* stream) {
  if (UNLIKELY(num == 0)) return;
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
@@ -689,10 +810,16 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
                                       platform::TracerEventType::UserDefined,
                                       1);
#ifdef PADDLE_WITH_HIP
      platform::GpuMemcpyAsync(dst,
                               src,
                               num,
                               hipMemcpyDeviceToDevice,
                               reinterpret_cast<gpuStream_t>(stream));
#else
      platform::GpuMemcpyAsync(dst,
                               src,
                               num,
                               cudaMemcpyDeviceToDevice,
                               reinterpret_cast<gpuStream_t>(stream));
#endif
    } else {
@@ -710,22 +837,29 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
      platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU",
                                         platform::TracerEventType::UserDefined,
                                         1);
      platform::GpuMemcpyPeerAsync(dst,
                                   dst_place.device,
                                   src,
                                   src_place.device,
                                   num,
                                   reinterpret_cast<gpuStream_t>(stream));
    } else {
      platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU",
                                         platform::TracerEventType::UserDefined,
                                         1);
      platform::GpuMemcpyPeerSync(
          dst, dst_place.device, src, src_place.device, num);
    }
  }
}
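
The GPU-to-GPU path above encodes a small decision tree: matching devices take a plain device-to-device memcpy, mismatched devices take a peer copy, and in both arms a null stream selects the synchronous variant. A standalone sketch of that dispatch, with invented stand-in names (not Paddle code):

#include <cstddef>
#include <cstdio>

using Stream = void*;  // stand-in for gpuStream_t

void MemcpyD2DAsync(size_t n, Stream) { std::printf("d2d async %zu\n", n); }
void MemcpyD2DSync(size_t n) { std::printf("d2d sync %zu\n", n); }
void MemcpyPeerAsync(size_t n, Stream) { std::printf("peer async %zu\n", n); }
void MemcpyPeerSync(size_t n) { std::printf("peer sync %zu\n", n); }

// Mirrors the branch structure above: device match -> plain D2D memcpy,
// mismatch -> peer copy; a null stream downgrades async to sync.
void CopyGpuToGpu(int dst_dev, int src_dev, size_t n, Stream stream) {
  if (dst_dev == src_dev) {
    if (stream) {
      MemcpyD2DAsync(n, stream);
    } else {
      MemcpyD2DSync(n);
    }
  } else {
    if (stream) {
      MemcpyPeerAsync(n, stream);
    } else {
      MemcpyPeerSync(n);
    }
  }
}

int main() { CopyGpuToGpu(/*dst_dev=*/0, /*src_dev=*/1, 64, nullptr); }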
template <>
void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
    platform::CPUPlace dst_place,
    void* dst,
    platform::CUDAPinnedPlace src_place,
    const void* src,
    size_t num) {
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place;
  if (UNLIKELY(num == 0)) return;
@@ -734,8 +868,11 @@ void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
template <>
void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
    platform::CUDAPinnedPlace dst_place,
    void* dst,
    platform::CPUPlace src_place,
    const void* src,
    size_t num) {
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place;
  if (UNLIKELY(num == 0)) return;
@@ -744,8 +881,11 @@ void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
    platform::CUDAPinnedPlace dst_place,
    void* dst,
    platform::CUDAPinnedPlace src_place,
    const void* src,
    size_t num) {
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
          << dst_place;
  if (UNLIKELY(num == 0)) return;
@@ -754,8 +894,12 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
    platform::CUDAPinnedPlace dst_place,
    void* dst,
    platform::CUDAPlace src_place,
    const void* src,
    size_t num,
    void* stream) {
  if (UNLIKELY(num == 0)) return;
  platform::SetDeviceId(src_place.device);
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
@@ -765,10 +909,16 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
                                       platform::TracerEventType::UserDefined,
                                       1);
#ifdef PADDLE_WITH_HIP
    platform::GpuMemcpyAsync(dst,
                             src,
                             num,
                             hipMemcpyDeviceToHost,
                             reinterpret_cast<gpuStream_t>(stream));
#else
    platform::GpuMemcpyAsync(dst,
                             src,
                             num,
                             cudaMemcpyDeviceToHost,
                             reinterpret_cast<gpuStream_t>(stream));
#endif
  } else {
@@ -785,8 +935,11 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
template <>
void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
    platform::CUDAPlace dst_place,
    void* dst,
    platform::CUDAPinnedPlace src_place,
    const void* src,
    size_t num,
    void* stream) {
  if (UNLIKELY(num == 0)) return;
@@ -798,10 +951,16 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
                                       platform::TracerEventType::UserDefined,
                                       1);
#ifdef PADDLE_WITH_HIP
    platform::GpuMemcpyAsync(dst,
                             src,
                             num,
                             hipMemcpyHostToDevice,
                             reinterpret_cast<gpuStream_t>(stream));
#else
    platform::GpuMemcpyAsync(dst,
                             src,
                             num,
                             cudaMemcpyHostToDevice,
                             reinterpret_cast<gpuStream_t>(stream));
#endif
  } else {
@@ -818,9 +977,12 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
// NOTE: only for CPUPlace, CUDAPlace and CUDAPinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
                                  void* dst,
                                  phi::Place src_place,
                                  const void* src,
                                  size_t num,
                                  void* stream) {
  if (src_place.GetType() == phi::AllocationType::CPU &&
      dst_place.GetType() == phi::AllocationType::CPU) {
    platform::CPUPlace place_dst, place_src;
@@ -887,52 +1049,76 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CPUPlace).
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
                                     void* dst,
                                     phi::Place src_place,
                                     const void* src,
                                     size_t num,
                                     void* stream) {
  Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}

// NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
                                     void* dst,
                                     phi::CPUPlace src_place,
                                     const void* src,
                                     size_t num,
                                     void* stream) {
  Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}

// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPlace)
template <>
void Copy<phi::GPUPlace, phi::Place>(phi::GPUPlace dst_place,
                                     void* dst,
                                     phi::Place src_place,
                                     const void* src,
                                     size_t num,
                                     void* stream) {
  Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
       dst,
       src_place,
       src,
       num,
       stream);
}

// NOTE: only for (CUDAPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace)
template <>
void Copy<phi::Place, phi::GPUPlace>(phi::Place dst_place,
                                     void* dst,
                                     phi::GPUPlace src_place,
                                     const void* src,
                                     size_t num,
                                     void* stream) {
  Copy(dst_place,
       dst,
       phi::Place(src_place.GetType(), src_place.GetDeviceId()),
       src,
       num,
       stream);
}

// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPinnedPlace)
template <>
void Copy<phi::GPUPinnedPlace, phi::Place>(phi::GPUPinnedPlace dst_place,
                                           void* dst,
                                           phi::Place src_place,
                                           const void* src,
                                           size_t num,
                                           void* stream) {
  Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}

// NOTE: only for (CUDAPinnedPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace)
template <>
void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place,
                                           void* dst,
                                           phi::GPUPinnedPlace src_place,
                                           const void* src,
                                           size_t num,
                                           void* stream) {
  Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
@@ -940,16 +1126,20 @@ void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace) -> (CUDAPinnedPlace)
template <>
void Copy<phi::GPUPinnedPlace, phi::Place>(phi::GPUPinnedPlace dst_place,
                                           void* dst,
                                           phi::Place src_place,
                                           const void* src,
                                           size_t num) {
  Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr);
}

// NOTE: only for (CUDAPinnedPlace) -> (CPUPlace)
template <>
void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place,
                                           void* dst,
                                           phi::GPUPinnedPlace src_place,
                                           const void* src,
                                           size_t num) {
  Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr);
}
#endif
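
With the place-keyed overloads above, callers can pick source and destination at runtime instead of instantiating a device-specific template. A hedged usage sketch; it assumes a Paddle source build with CUDA enabled, and `CopyHostToGpu` is a name invented for this example:

#include <cstddef>

#include "paddle/fluid/memory/memcpy.h"
#include "paddle/phi/common/place.h"

// Copies `n` bytes from pageable host memory to device 0; the call resolves
// to Copy<phi::Place, phi::Place> above and is async iff `stream` != nullptr.
void CopyHostToGpu(void* dst, const void* src, size_t n, void* stream) {
  phi::Place gpu(phi::AllocationType::GPU, /*device_id=*/0);
  phi::Place cpu(phi::AllocationType::CPU);
  paddle::memory::Copy(gpu, dst, cpu, src, n, stream);
}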
@@ -959,7 +1149,8 @@ template <>
void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
                                                  void* dst,
                                                  platform::MLUPlace src_place,
                                                  const void* src,
                                                  size_t num,
                                                  void* stream) {
  if (UNLIKELY(num == 0)) return;
@@ -970,8 +1161,8 @@ void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
    platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU",
                                       platform::TracerEventType::UserDefined,
                                       1);
    platform::MLUMemcpyD2HAsync(
        dst, src, num, reinterpret_cast<mluStream>(stream));
  } else {
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
@@ -988,7 +1179,8 @@ template <>
void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
                                                  void* dst,
                                                  platform::CPUPlace src_place,
                                                  const void* src,
                                                  size_t num,
                                                  void* stream) {
  if (UNLIKELY(num == 0)) return;
@@ -999,8 +1191,8 @@ void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
    platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU",
                                       platform::TracerEventType::UserDefined,
                                       1);
    platform::MLUMemcpyH2DAsync(
        dst, src, num, reinterpret_cast<mluStream>(stream));
  } else {
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
@@ -1017,7 +1209,8 @@ template <>
void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
                                                  void* dst,
                                                  platform::MLUPlace src_place,
                                                  const void* src,
                                                  size_t num,
                                                  void* stream) {
  if (UNLIKELY(num == 0)) return;
@@ -1029,8 +1222,8 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
    platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU",
                                       platform::TracerEventType::UserDefined,
                                       1);
    platform::MLUMemcpyD2DAsync(
        dst, src, num, reinterpret_cast<mluStream>(stream));
  } else {
    platform::DeviceContextPool& pool =
        platform::DeviceContextPool::Instance();
@@ -1050,25 +1243,32 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
      platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU",
                                         platform::TracerEventType::UserDefined,
                                         1);
      platform::MLUMemcpyPeerAsync(dst,
                                   dst_place.device,
                                   src,
                                   src_place.device,
                                   num,
                                   reinterpret_cast<mluStream>(stream));
    } else {
      VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
              << " to " << dst_place;
      platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU",
                                         platform::TracerEventType::UserDefined,
                                         1);
      platform::MLUMemcpyPeerSync(
          dst, dst_place.device, src, src_place.device, num);
    }
  }
}

// NOTE: only for CPUPlace and MLUPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
                                  void* dst,
                                  phi::Place src_place,
                                  const void* src,
                                  size_t num,
                                  void* stream) {
  if (src_place.GetType() == phi::AllocationType::CPU &&
      dst_place.GetType() == phi::AllocationType::CPU) {
    platform::CPUPlace place_dst, place_src;
@@ -1110,35 +1310,55 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace)
template <>
void Copy<phi::MLUPlace, phi::Place>(phi::MLUPlace dst_place,
                                     void* dst,
                                     phi::Place src_place,
                                     const void* src,
                                     size_t num,
                                     void* stream) {
  Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
       dst,
       src_place,
       src,
       num,
       stream);
}

// NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace)
template <>
void Copy<phi::Place, phi::MLUPlace>(phi::Place dst_place,
                                     void* dst,
                                     phi::MLUPlace src_place,
                                     const void* src,
                                     size_t num,
                                     void* stream) {
  Copy(dst_place,
       dst,
       phi::Place(src_place.GetType(), src_place.GetDeviceId()),
       src,
       num,
       stream);
}

// NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream.
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
                                     void* dst,
                                     phi::Place src_place,
                                     const void* src,
                                     size_t num,
                                     void* stream) {
  Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}

// NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream.
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
                                     void* dst,
                                     phi::CPUPlace src_place,
                                     const void* src,
                                     size_t num,
                                     void* stream) {
  Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
@@ -1146,8 +1366,10 @@ void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
// NOTE: Only for CPUPlace, XPUPlace and PinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
                                  void* dst,
                                  phi::Place src_place,
                                  const void* src,
                                  size_t num) {
  if (UNLIKELY(num == 0)) return;
  VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
@@ -1224,16 +1446,20 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
// NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
                                     void* dst,
                                     phi::CPUPlace src_place,
                                     const void* src,
                                     size_t num) {
  Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num);
}

// NOTE: Only for (CPUPlace and PinnedPlace) -> (CPUPlace).
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
                                     void* dst,
                                     phi::Place src_place,
                                     const void* src,
                                     size_t num) {
  Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num);
}
@@ -1243,9 +1469,12 @@ void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
    !defined(PADDLE_WITH_MLU)
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
                                  void* dst,
                                  phi::Place src_place,
                                  const void* src,
                                  size_t num,
                                  void* stream) {
  if (src_place.GetType() == phi::AllocationType::CPU &&  // NOLINT
      dst_place.GetType() == phi::AllocationType::CUSTOM) {
    platform::CPUPlace place_src;
@@ -1265,17 +1494,23 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
}

template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
                                     void* dst,
                                     phi::Place src_place,
                                     const void* src,
                                     size_t num,
                                     void* stream) {
  Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}

// NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
                                     void* dst,
                                     phi::CPUPlace src_place,
                                     const void* src,
                                     size_t num,
                                     void* stream) {
  Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
#endif
...
@@ -354,7 +354,9 @@ if(WITH_GPU)
      enforce
      dynload_cuda
      new_profiler
      stats
      op_proto_maker
      shape_inference)
  nv_library(
    device_memory_aligment
    SRCS device_memory_aligment.cc
@@ -363,7 +365,14 @@ elseif(WITH_ROCM)
  hip_library(
    profiler
    SRCS profiler.cc profiler.cu
    DEPS os_info
         device_tracer
         gpu_info
         enforce
         new_profiler
         stats
         op_proto_maker
         shape_inference)
  hip_library(
    device_memory_aligment
    SRCS device_memory_aligment.cc
@@ -372,7 +381,13 @@ else()
  cc_library(
    profiler
    SRCS profiler.cc
    DEPS os_info
         device_tracer
         enforce
         new_profiler
         stats
         op_proto_maker
         shape_inference)
  cc_library(
    device_memory_aligment
    SRCS device_memory_aligment.cc
...
@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
#include "paddle/fluid/string/split.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
@@ -51,10 +52,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_uint64(gpu_memory_limit_mb);

PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log,
                            false,
                            "Whether to print the message of gpu memory usage "
                            "at exit, mainly used for UT and CI.");
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb,
                            true,
                            "Whether to print the message of gpu memory usage "
                            "MB as a unit of measurement.");
@@ -66,7 +69,10 @@ namespace platform {
void GpuMemoryUsage(size_t *available, size_t *total) {
  size_t actual_available, actual_total;
  RecordedGpuMemGetInfo(available,
                        total,
                        &actual_available,
                        &actual_total,
                        platform::GetCurrentDeviceId());
}
@@ -94,7 +100,8 @@ size_t GpuMaxAllocSize() {
static size_t GpuAllocSize(bool realloc) {
  size_t available_to_alloc = GpuAvailableMemToAlloc();
  PADDLE_ENFORCE_GT(
      available_to_alloc,
      0,
      platform::errors::ResourceExhausted("Not enough available GPU memory."));
  // If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
  // allocated by fraction
@@ -105,7 +112,8 @@ static size_t GpuAllocSize(bool realloc) {
          ? flag_mb << 20
          : available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
  PADDLE_ENFORCE_GE(
      available_to_alloc,
      alloc_bytes,
      platform::errors::ResourceExhausted("Not enough available GPU memory."));
  VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
           << " MiB, is it Re-alloc: " << realloc;
@@ -192,13 +200,16 @@ class RecordedGpuMallocHelper {
    });

    PADDLE_ENFORCE_GE(
        dev_id,
        0,
        platform::errors::OutOfRange(
            "Device id must be not less than 0, but got %d.", dev_id));
    PADDLE_ENFORCE_LT(
        dev_id,
        instances_.size(),
        platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.",
                                     dev_id,
                                     instances_.size()));
    return instances_[dev_id].get();
  }
@@ -207,7 +218,8 @@ class RecordedGpuMallocHelper {
   * or cudaSuccess would be returned, and the cudaGetLastError() flag
   * would be clear.
   */
  gpuError_t Malloc(void **ptr,
                    size_t size,
                    bool malloc_managed_memory = false) {
    LockGuardPtr<std::mutex> lock(mtx_);
    if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) {
@@ -236,7 +248,10 @@ class RecordedGpuMallocHelper {
      cur_size_.fetch_add(size);
      STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
      DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size);
      platform::RecordMemEvent(ptr,
                               GPUPlace(dev_id_),
                               size,
                               platform::TracerMemEventType::ReservedAllocate);
#ifdef PADDLE_WITH_TESTING
      gpu_ptrs.insert(*ptr);
#endif
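
Here a successful raw allocation bumps the per-device Reserved statistic and, new in this change, emits a ReservedAllocate event carrying the updated stats; the free path below mirrors it with ReservedFree. The current/peak bookkeeping behind such statistics typically follows an atomic ratchet; a standalone sketch of that pattern (illustrative only, not the Paddle implementation):

#include <atomic>
#include <cstdint>
#include <cstdio>

// Current value moves up and down with alloc/free; the peak only ratchets up.
struct MemStat {
  std::atomic<int64_t> current{0};
  std::atomic<int64_t> peak{0};

  void Update(int64_t delta) {
    int64_t cur = current.fetch_add(delta) + delta;
    int64_t old_peak = peak.load();
    // CAS loop: publish `cur` as the new peak unless another thread already
    // recorded a larger value.
    while (cur > old_peak && !peak.compare_exchange_weak(old_peak, cur)) {
    }
  }
};

int main() {
  MemStat reserved;
  reserved.Update(+256);  // raw device malloc succeeded -> ReservedAllocate
  reserved.Update(-256);  // matching free               -> ReservedFree
  std::printf("current=%ld peak=%ld\n",
              static_cast<long>(reserved.current.load()),
              static_cast<long>(reserved.peak.load()));
}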
@@ -275,6 +290,10 @@ class RecordedGpuMallocHelper {
      cur_size_.fetch_sub(size);
      STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
      DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size);
      platform::RecordMemEvent(ptr,
                               GPUPlace(dev_id_),
                               size,
                               platform::TracerMemEventType::ReservedFree);
    } else {
      platform::GpuGetLastError();  // clear the error flag when
                                    // cudaErrorCudartUnloading /
@@ -300,7 +319,9 @@ class RecordedGpuMallocHelper {
#endif
  }

  bool GetMemInfo(size_t *avail,
                  size_t *total,
                  size_t *actual_avail,
                  size_t *actual_total) {
    {
      CUDADeviceGuard guard(dev_id_);
@@ -335,7 +356,8 @@ class RecordedGpuMallocHelper {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10020

  CUresult MemCreate(CUmemGenericAllocationHandle *handle,
                     size_t size,
                     const CUmemAllocationProp *prop,
                     unsigned long long flags) {  // NOLINT
    auto result =
@@ -371,7 +393,9 @@ class RecordedGpuMallocHelper {

std::once_flag RecordedGpuMallocHelper::once_flag_;

gpuError_t RecordedGpuMalloc(void **ptr,
                             size_t size,
                             int dev_id,
                             bool malloc_managed_memory) {
  return RecordedGpuMallocHelper::Instance(dev_id)->Malloc(
      ptr, size, malloc_managed_memory);
@@ -383,22 +407,28 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {

#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10020
CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
                              size_t size,
                              const CUmemAllocationProp *prop,
                              unsigned long long flags,
                              int dev_id) {  // NOLINT
  return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
      handle, size, prop, flags);
}

CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
                               size_t size,
                               int dev_id) {
  return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size);
}
#endif
#endif

bool RecordedGpuMemGetInfo(size_t *avail,
                           size_t *total,
                           size_t *actual_avail,
                           size_t *actual_total,
                           int dev_id) {
  return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo(
      avail, total, actual_avail, actual_total);
}
@@ -493,26 +523,35 @@ void GpuDestroyStream(gpuStream_t stream) {

void GpuDeviceSync() { phi::backends::gpu::GpuDeviceSync(); }

void GpuMemcpyAsync(void *dst,
                    const void *src,
                    size_t count,
                    gpuMemcpyKind kind,
                    gpuStream_t stream) {
  phi::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
}

void GpuMemcpySync(void *dst,
                   const void *src,
                   size_t count,
                   gpuMemcpyKind kind) {
  phi::backends::gpu::GpuMemcpySync(dst, src, count, kind);
}

void GpuMemcpyPeerAsync(void *dst,
                        int dst_device,
                        const void *src,
                        int src_device,
                        size_t count,
                        gpuStream_t stream) {
  phi::backends::gpu::GpuMemcpyPeerAsync(
      dst, dst_device, src, src_device, count, stream);
}

void GpuMemcpyPeerSync(
    void *dst, int dst_device, const void *src, int src_device, size_t count) {
  phi::backends::gpu::GpuMemcpyPeerSync(
      dst, dst_device, src, src_device, count);
}

void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
...
@@ -30,12 +30,16 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nvtx.h"
#endif
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"

PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler,
                            false,
                            "Enable rpc profiler or not.");
DEFINE_bool(enable_host_event_recorder_hook,
            false,
            "enable HostEventRecorder, hook Profiler");

namespace paddle {
@@ -43,8 +47,11 @@ namespace platform {

MemEvenRecorder MemEvenRecorder::recorder;

Event::Event(EventType type,
             std::string name,
             uint32_t thread_id,
             EventRole role,
             std::string attr)
    : type_(type),
      name_(name),
      thread_id_(thread_id),
@@ -68,8 +75,10 @@ double Event::CudaElapsedMs(const Event &e) const {
#endif
}

RecordEvent::RecordEvent(const char *name,
                         const TracerEventType type,
                         uint32_t level,
                         const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
  if (g_enable_nvprof_hook) {
@@ -100,8 +109,10 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type,
  start_ns_ = PosixInNsec();
}

RecordEvent::RecordEvent(const std::string &name,
                         const TracerEventType type,
                         uint32_t level,
                         const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
  if (g_enable_nvprof_hook) {
@@ -130,8 +141,10 @@ RecordEvent::RecordEvent(const std::string &name, const TracerEventType type,
  start_ns_ = PosixInNsec();
}

RecordEvent::RecordEvent(const std::string &name,
                         const std::string &attr,
                         const TracerEventType type,
                         uint32_t level,
                         const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
@@ -215,8 +228,8 @@ void RecordEvent::End() {
  DeviceTracer *tracer = GetDeviceTracer();
  if (tracer) {
    uint64_t end_ns = PosixInNsec();
    tracer->AddCPURecords(
        CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id);
  }
  ClearCurAnnotation();
  PopEvent(*name_, role_);
@@ -226,7 +239,8 @@ void RecordEvent::End() {
  is_enabled_ = false;
}

RecordInstantEvent::RecordInstantEvent(const char *name,
                                       TracerEventType type,
                                       uint32_t level) {
  if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
    return;
@@ -236,21 +250,242 @@ RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type,
      name, start_end_ns, start_end_ns, EventRole::kOrdinary, type);
}
RecordOpInfoSupplement::RecordOpInfoSupplement(
    const std::string &type,
    const framework::AttributeMap &attrs,
    const framework::InferShapeContext &shape_ctx,
    const framework::RuntimeContext &ctx) {
  if (FLAGS_enable_host_event_recorder_hook == false) {
    return;
  }
  std::map<std::string, std::vector<framework::DDim>> input_shapes;
  std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
  for (auto it = ctx.inputs.begin(); it != ctx.inputs.end(); it++) {
    input_shapes[it->first] = shape_ctx.GetInputsDim(it->first);
    dtypes[it->first] = shape_ctx.GetInputsVarType(it->first);
  }

  const std::vector<std::string> *callstack_ptr = nullptr;
  std::vector<std::string> callstack;
  auto iter = attrs.find(
      framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
  if (iter != attrs.end()) {
    callstack_ptr = &BOOST_GET_CONST(std::vector<std::string>, iter->second);
    callstack = *callstack_ptr;
  }
  HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance().RecordEvent(
      PosixInNsec(), type, input_shapes, dtypes, callstack);
}
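
The constructor above collects two things per op: the input shapes and var types from the InferShapeContext, and the Python creation callstack, which rides along as an ordinary string-vector attribute (BOOST_GET_CONST does the variant access). A standalone sketch of that attribute lookup using std::variant; the key "op_callstack" is invented for the example, while the real key comes from OpProtoAndCheckerMaker::OpCreationCallstackAttrName():

#include <cstdio>
#include <map>
#include <string>
#include <variant>
#include <vector>

// Illustrative stand-ins for framework::Attribute / AttributeMap.
using Attribute = std::variant<int, float, std::vector<std::string>>;
using AttributeMap = std::map<std::string, Attribute>;

// Mirrors the lookup above: fetch the creation callstack if the attribute
// is present, otherwise return an empty vector.
std::vector<std::string> GetCallstack(const AttributeMap& attrs) {
  std::vector<std::string> callstack;
  auto iter = attrs.find("op_callstack");  // attribute name is hypothetical
  if (iter != attrs.end()) {
    callstack = std::get<std::vector<std::string>>(iter->second);
  }
  return callstack;
}

int main() {
  AttributeMap attrs;
  attrs["op_callstack"] = std::vector<std::string>{"train.py:42"};
  for (const auto& frame : GetCallstack(attrs)) {
    std::printf("%s\n", frame.c_str());
  }
}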
RecordMemEvent::RecordMemEvent(const void *ptr,
                               const phi::Place &place,
                               size_t size,
                               const TracerMemEventType type) {
  if (g_state == ProfilerState::kDisabled &&
      FLAGS_enable_host_event_recorder_hook == false) {
    return;
  }
  if (type == TracerMemEventType::Allocate) {
    uint64_t current_allocated;
    uint64_t peak_allocated;
    uint64_t current_reserved = 0;  // 0 means keep the same as before
    uint64_t peak_reserved = 0;     // 0 means keep the same as before
    if (platform::is_cpu_place(place) ||
        platform::is_cuda_pinned_place(place)) {
      current_allocated =
          HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
      peak_allocated =
          HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
    } else {
      current_allocated =
          DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
      peak_allocated =
          DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
    }
    platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
                                                        place,
                                                        size,
                                                        type,
                                                        current_allocated,
                                                        current_reserved,
                                                        peak_allocated,
                                                        peak_reserved);
  } else if (type == TracerMemEventType::ReservedAllocate) {
    uint64_t current_reserved;
    uint64_t peak_reserved;
    uint64_t current_allocated = 0;  // 0 means keep the same as before
    uint64_t peak_allocated = 0;     // 0 means keep the same as before
    if (platform::is_cpu_place(place) ||
        platform::is_cuda_pinned_place(place)) {
      current_reserved =
          HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
      peak_reserved =
          HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
    } else {
      current_reserved =
          DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
      peak_reserved =
          DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
    }
    platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
                                                        place,
                                                        size,
                                                        type,
                                                        current_allocated,
                                                        current_reserved,
                                                        peak_allocated,
                                                        peak_reserved);
  } else if (type == TracerMemEventType::Free) {
    uint64_t current_allocated;
    uint64_t peak_allocated;
    uint64_t current_reserved = 0;  // 0 means keep the same as before
    uint64_t peak_reserved = 0;     // 0 means keep the same as before
    if (platform::is_cpu_place(place) ||
        platform::is_cuda_pinned_place(place)) {
      current_allocated =
          HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
      peak_allocated =
          HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
    } else {
      current_allocated =
          DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
      peak_allocated =
          DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
    }
    platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
                                                       place,
                                                       size,
                                                       type,
                                                       current_allocated,
                                                       current_reserved,
                                                       peak_allocated,
                                                       peak_reserved);
  } else if (type == TracerMemEventType::ReservedFree) {
    uint64_t current_reserved;
    uint64_t peak_reserved;
    uint64_t current_allocated = 0;  // 0 means keep the same as before
    uint64_t peak_allocated = 0;     // 0 means keep the same as before
    if (platform::is_cpu_place(place) ||
        platform::is_cuda_pinned_place(place)) {
      current_reserved =
          HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
      peak_reserved =
          HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
    } else {
      current_reserved =
          DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
      peak_reserved =
          DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
    }
    platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
                                                       place,
                                                       size,
                                                       type,
                                                       current_allocated,
                                                       current_reserved,
                                                       peak_allocated,
                                                       peak_reserved);
  }
}
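
Note the asymmetry the four branches share: each event type snapshots only the stat pair it can move (allocated for Allocate/Free, reserved for ReservedAllocate/ReservedFree) and passes 0 for the other pair, which downstream treats as "unchanged". A hedged caller-side sketch, assuming a Paddle source build; `TrackGpuAllocation` is a name invented for this example:

#include <cstddef>

#include "paddle/fluid/platform/profiler/mem_tracing.h"
#include "paddle/phi/common/place.h"

// Emits an Allocate event with fresh current/peak allocated stats; it pairs
// with a later RecordMemEvent(..., TracerMemEventType::Free) for the same
// pointer when the allocation is released.
void TrackGpuAllocation(void* ptr, size_t size) {
  phi::GPUPlace place(0);
  paddle::platform::RecordMemEvent(
      ptr, place, size, paddle::platform::TracerMemEventType::Allocate);
}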
void MemEvenRecorder::PushMemRecord(const void *ptr,
                                    const Place &place,
                                    size_t size) {
  if (g_state == ProfilerState::kDisabled) {
    return;
  }
  std::lock_guard<std::mutex> guard(mtx_);
  auto &events = address_memevent_[place];
  PADDLE_ENFORCE_EQ(events.count(ptr),
                    0,
                    platform::errors::InvalidArgument(
                        "The Place can't exist in the stage of PushMemRecord"));
  events.emplace(ptr,
                 std::unique_ptr<RecordMemEvent>(
                     new MemEvenRecorder::RecordMemEvent(place, size)));
}
void MemEvenRecorder::PushMemRecord(const void *ptr,
                                    const Place &place,
                                    size_t size,
                                    TracerMemEventType type,
                                    uint64_t current_allocated,
                                    uint64_t current_reserved,
                                    uint64_t peak_allocated,
                                    uint64_t peak_reserved) {
  std::lock_guard<std::mutex> guard(mtx_);
  if (FLAGS_enable_host_event_recorder_hook) {  // new MemRecord
    HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
        PosixInNsec(),
        reinterpret_cast<uint64_t>(ptr),
        type,
        size,
        place,
        current_allocated,
        current_reserved,
        peak_allocated,
        peak_reserved);
    return;
  }
  if (type == TracerMemEventType::ReservedAllocate) {
    // The old profiler only analyzes memory managed by paddle.
    return;
  }
  if (g_state == ProfilerState::kDisabled) return;
  auto &events = address_memevent_[place];
  PADDLE_ENFORCE_EQ(events.count(ptr),
                    0,
                    platform::errors::InvalidArgument(
                        "The Place can't exist in the stage of PushMemRecord"));
  events.emplace(ptr,
                 std::unique_ptr<RecordMemEvent>(
                     new MemEvenRecorder::RecordMemEvent(place, size)));
}
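
Both Push/PopMemRecord overloads route to the new HostEventRecorder when FLAGS_enable_host_event_recorder_hook is set, and only fall back to the legacy per-place event map otherwise, so an event is never counted by both profilers. A standalone sketch of that flag-gated dual sink (illustrative only, not Paddle code):

#include <cstdio>

bool enable_new_recorder = true;  // stand-in for the gflag

struct NewRecorder {
  void Record(const char* what) { std::printf("new: %s\n", what); }
};
struct LegacyRecorder {
  void Record(const char* what) { std::printf("legacy: %s\n", what); }
};

// Mirrors PushMemRecord above: try the new sink first and return early,
// otherwise fall through to the legacy recorder.
void PushMemRecord(const char* what) {
  static NewRecorder new_rec;
  static LegacyRecorder legacy_rec;
  if (enable_new_recorder) {
    new_rec.Record(what);
    return;
  }
  legacy_rec.Record(what);
}

int main() { PushMemRecord("alloc 256B on GPU:0"); }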
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
  if (g_state == ProfilerState::kDisabled) {
    return;
  }
  std::lock_guard<std::mutex> guard(mtx_);
  auto &events = address_memevent_[place];
  auto iter = events.find(ptr);
  // The ptr may not be in address_memevent.
  if (iter != events.end()) {
    events.erase(iter);
  }
}
void MemEvenRecorder::PopMemRecord(const void *ptr,
                                   const Place &place,
                                   size_t size,
                                   TracerMemEventType type,
                                   uint64_t current_allocated,
                                   uint64_t current_reserved,
                                   uint64_t peak_allocated,
                                   uint64_t peak_reserved) {
  std::lock_guard<std::mutex> guard(mtx_);
  if (FLAGS_enable_host_event_recorder_hook) {  // new MemRecord
    HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
        PosixInNsec(),
        reinterpret_cast<uint64_t>(ptr),
        type,
        -size,
        place,
        current_allocated,
        current_reserved,
        peak_allocated,
        peak_reserved);
    return;
  }
  if (type == TracerMemEventType::ReservedFree) {
    // The old profiler only analyzes memory managed by paddle.
    return;
  }
  if (g_state == ProfilerState::kDisabled) return;
  auto &events = address_memevent_[place];
  auto iter = events.find(ptr);
  // The ptr may not be in address_memevent.
@@ -279,8 +514,13 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
  auto annotation_free = CurAnnotationName();
  if (tracer) {
    tracer->AddMemInfoRecord(start_ns_,
                             end_ns_,
                             bytes_,
                             place_,
                             alloc_in_,
                             annotation_free,
                             g_mem_thread_id);
  }
  PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
}
...@@ -307,22 +547,38 @@ RecordBlock::~RecordBlock() { ...@@ -307,22 +547,38 @@ RecordBlock::~RecordBlock() {
if (tracer) { if (tracer) {
// We try to put all blocks at the same nested depth in the // We try to put all blocks at the same nested depth in the
// same timeline lane. and distinguish the using thread_id. // same timeline lane. and distinguish the using thread_id.
tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(), tracer->AddCPURecords(
g_thread_id); name_, start_ns_, PosixInNsec(), BlockDepth(), g_thread_id);
} }
ClearCurBlock(); ClearCurBlock();
} }
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, void PushMemEvent(uint64_t start_ns,
const Place &place, const std::string &annotation) { uint64_t end_ns,
GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes, size_t bytes,
place, g_mem_thread_id, annotation); const Place &place,
} const std::string &annotation) {
GetMemEventList().Record(EventType::kPushRange,
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, start_ns,
const Place &place, const std::string &annotation) { end_ns,
GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place, bytes,
g_mem_thread_id, annotation); place,
g_mem_thread_id,
annotation);
}
void PopMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place &place,
const std::string &annotation) {
GetMemEventList().Record(EventType::kPopRange,
start_ns,
end_ns,
bytes,
place,
g_mem_thread_id,
annotation);
} }
void Mark(const std::string &name) {
@@ -334,17 +590,19 @@ void Mark(const std::string &name) {
  GetEventList().Record(EventType::kMark, name, g_thread_id);
}

Event *PushEvent(const std::string &name,
                 const EventRole role,
                 std::string attr) {
  return GetEventList().Record(
      EventType::kPushRange, name, g_thread_id, role, attr);
}

void PopEvent(const std::string &name, const EventRole role, std::string attr) {
  GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr);
}
void EnableProfiler(ProfilerState state) {
  PADDLE_ENFORCE_NE(state,
                    ProfilerState::kDisabled,
                    platform::errors::InvalidArgument(
                        "Can't enable profiling, since the input state is "
                        "ProfilerState::kDisabled"));
@@ -380,7 +638,8 @@ void ResetProfiler() {
    (*it)->Clear();
  }
  for (auto it = g_all_mem_event_lists.begin();
       it != g_all_mem_event_lists.end();
       ++it) {
    (*it)->Clear();
  }
}
@@ -576,8 +835,8 @@ static void EmulateEventPushAndPop(
    std::string name =
        prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name;
    const char *attr = (evt.attr == nullptr ? "none" : evt.attr);
    Event *orig_evt = cur_thr_list->Record(
        EventType::kPushRange, name, tid, evt.role, attr);
    (*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns);
    cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr);
  }
@@ -593,8 +852,8 @@ static void EmulateCPURecordsAdd(
  for (const auto &thr_sec : host_sec.thr_sections) {
    uint64_t tid = thr_sec.thread_id;
    for (const auto &evt : thr_sec.events) {
      tracer->AddCPURecords(
          evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tid);
    }
  }
}
......
@@ -30,6 +30,8 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.pb.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
@@ -102,6 +104,22 @@ struct MemEvenRecorder {
 public:
  void PushMemRecord(const void* ptr, const Place& place, size_t size);
  void PopMemRecord(const void* ptr, const Place& place);
void PushMemRecord(const void* ptr,
const Place& place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved);
void PopMemRecord(const void* ptr,
const Place& place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved);
  void Flush();
  static MemEvenRecorder& Instance() { return recorder; }
@@ -160,7 +178,8 @@ struct EventList {
  std::vector<T> Reduce() {
    std::vector<T> result;
    for (auto& block : event_blocks) {
      result.insert(result.begin(),
                    std::make_move_iterator(block.begin()),
                    std::make_move_iterator(block.end()));
    }
    event_blocks.clear();
@@ -173,13 +192,21 @@ struct EventList {
};

void Mark(const std::string& name);

void PushMemEvent(uint64_t start_ns,
                  uint64_t end_ns,
                  size_t bytes,
                  const Place& place,
                  const std::string& annotation);
void PopMemEvent(uint64_t start_ns,
                 uint64_t end_ns,
                 size_t bytes,
                 const Place& place,
                 const std::string& annotation);
Event* PushEvent(const std::string& name,
                 const EventRole role,
                 const std::string attr = "none");
void PopEvent(const std::string& name,
              const EventRole role,
              const std::string attr = "none");
// Return the event list of all threads. Assuming the returned value is
// called event_lists, event_lists[i][j] represents the j-th Event of the
// i-th thread.
......
cc_library(
  host_tracer
  SRCS host_tracer.cc
  DEPS enforce ddim var_type_traits)
cc_library(
  cuda_tracer
  SRCS cuda_tracer.cc cupti_data_process.cc
@@ -10,7 +10,7 @@ add_subdirectory(mlu)
cc_library(
  event_node
  SRCS event_node.cc
  DEPS enforce place)
cc_library(
  profiler_utils
  SRCS utils.cc
......
@@ -18,16 +18,21 @@
#include <functional>
#include <string>

#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/event.h"  // import EventRole, TODO(TIEXING): remove later
#include "paddle/fluid/platform/profiler/trace_event.h"
#include "paddle/phi/core/ddim.h"

namespace paddle {
namespace platform {

struct CommonEvent {
 public:
  CommonEvent(const char *name,
              uint64_t start_ns,
              uint64_t end_ns,
              EventRole role,
              TracerEventType type)
      : name(name),
        start_ns(start_ns),
        end_ns(end_ns),
@@ -35,8 +40,12 @@ struct CommonEvent {
        type(type) {}

  CommonEvent(std::function<void *(size_t)> arena_allocator,
              const std::string &name_str,
              uint64_t start_ns,
              uint64_t end_ns,
              EventRole role,
              TracerEventType type,
              const std::string &attr_str)
      : start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
    auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
    strncpy(buf, name_str.c_str(), name_str.length() + 1);
@@ -47,8 +56,11 @@ struct CommonEvent {
  }

  CommonEvent(std::function<void *(size_t)> arena_allocator,
              const std::string &name_str,
              uint64_t start_ns,
              uint64_t end_ns,
              EventRole role,
              TracerEventType type)
      : start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
    auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
    strncpy(buf, name_str.c_str(), name_str.length() + 1);
@@ -63,5 +75,61 @@ struct CommonEvent {
  const char *attr = nullptr;  // not owned, designed for performance
};
struct CommonMemEvent {
public:
CommonMemEvent(uint64_t timestamp_ns,
uint64_t addr,
TracerMemEventType type,
int64_t increase_bytes,
const Place &place,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved)
      : timestamp_ns(timestamp_ns),
        addr(addr),
        type(type),
        increase_bytes(increase_bytes),
        place(place),
        current_allocated(current_allocated),
        current_reserved(current_reserved),
        peak_allocated(peak_allocated),
        peak_reserved(peak_reserved) {}
uint64_t timestamp_ns;
uint64_t addr;
TracerMemEventType type;
int64_t increase_bytes;
Place place;
uint64_t current_allocated;
uint64_t current_reserved;
uint64_t peak_allocated;
uint64_t peak_reserved;
};
struct OperatorSupplementOriginEvent {
public:
OperatorSupplementOriginEvent(
std::function<void *(size_t)> arena_allocator,
uint64_t timestamp_ns,
const std::string &type_name,
const std::map<std::string, std::vector<framework::DDim>> &input_shapes,
const std::map<std::string, std::vector<framework::proto::VarType::Type>>
&dtypes,
const std::vector<std::string> callstack)
: timestamp_ns(timestamp_ns),
input_shapes(input_shapes),
dtypes(dtypes),
callstack(callstack) {
auto buf = static_cast<char *>(arena_allocator(type_name.length() + 1));
strncpy(buf, type_name.c_str(), type_name.length() + 1);
op_type = buf;
}
uint64_t timestamp_ns;
const char *op_type = nullptr; // not owned, designed for performance
// input shapes
std::map<std::string, std::vector<framework::DDim>> input_shapes;
std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
// call stack
const std::vector<std::string> callstack;
};
}  // namespace platform
}  // namespace paddle
@@ -11,9 +11,10 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/platform/profiler/host_tracer.h"

#include <sstream>

#include "glog/logging.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/platform/profiler/common_event.h"
@@ -21,7 +22,8 @@
// Used to filter events, works like glog VLOG(level).
// RecordEvent will work if host_trace_level >= level.
PADDLE_DEFINE_EXPORTED_int64(host_trace_level,
                             1,
                             "RecordEvent will work "
                             "if host_trace_level >= level.");
@@ -50,6 +52,79 @@ void ProcessHostEvents(const HostEventSection<CommonEvent>& host_events,
  }
}
void ProcessHostMemEvents(
const HostEventSection<CommonMemEvent>& host_mem_events,
TraceEventCollector* collector) {
for (const auto& thr_sec : host_mem_events.thr_sections) {
uint64_t tid = thr_sec.thread_id;
if (thr_sec.thread_name != kDefaultThreadName) {
collector->AddThreadName(tid, thr_sec.thread_name);
}
for (const auto& evt : thr_sec.events) {
MemTraceEvent event;
event.timestamp_ns = evt.timestamp_ns;
event.addr = evt.addr;
event.type = evt.type;
event.increase_bytes = evt.increase_bytes;
event.place = evt.place.DebugString();
event.current_allocated = evt.current_allocated;
event.current_reserved = evt.current_reserved;
event.peak_allocated = evt.peak_allocated;
event.peak_reserved = evt.peak_reserved;
event.process_id = host_mem_events.process_id;
event.thread_id = tid;
collector->AddMemEvent(std::move(event));
}
}
}
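
// Illustrative sketch (not part of this commit): the producer side that
// feeds ProcessHostMemEvents above. An allocation hook records one
// CommonMemEvent per manipulation via the HostEventRecorder singleton,
// mirroring MemEvenRecorder::PushMemRecord/PopMemRecord; the wrapper
// function name and its call site are assumptions.
//
// void OnHostAlloc(const void* ptr, const Place& place, size_t size,
//                  uint64_t cur_alloc, uint64_t cur_resv,
//                  uint64_t peak_alloc, uint64_t peak_resv) {
//   HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
//       PosixInNsec(),                    // timestamp_ns
//       reinterpret_cast<uint64_t>(ptr),  // addr
//       TracerMemEventType::Allocate,
//       static_cast<int64_t>(size),       // increase_bytes; negative on free
//       place, cur_alloc, cur_resv, peak_alloc, peak_resv);
// }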
void ProcessOperatorSupplementEvents(
const HostEventSection<OperatorSupplementOriginEvent>& op_supplement_events,
TraceEventCollector* collector) {
for (const auto& thr_sec : op_supplement_events.thr_sections) {
uint64_t tid = thr_sec.thread_id;
if (thr_sec.thread_name != kDefaultThreadName) {
collector->AddThreadName(tid, thr_sec.thread_name);
}
for (const auto& evt : thr_sec.events) {
OperatorSupplementEvent event;
event.timestamp_ns = evt.timestamp_ns;
event.op_type = evt.op_type;
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> dtypes;
std::string callstack;
for (auto it = evt.input_shapes.begin(); it != evt.input_shapes.end();
it++) {
for (auto idx = 0lu; idx < it->second.size(); idx++) {
input_shapes[it->first].push_back(std::vector<int64_t>());
for (auto dim_idx = 0; dim_idx < it->second.at(idx).size();
dim_idx++) {
input_shapes[it->first][idx].push_back(
it->second.at(idx).at(dim_idx));
}
}
}
for (auto it = evt.dtypes.begin(); it != evt.dtypes.end(); it++) {
for (auto idx = 0lu; idx < it->second.size(); idx++) {
dtypes[it->first].push_back(
framework::proto::VarType::Type_Name(it->second.at(idx)));
}
}
std::ostringstream result_string;
for (auto it = evt.callstack.begin(); it != evt.callstack.end(); it++) {
result_string << (*it) << std::endl;
}
event.input_shapes = input_shapes;
event.dtypes = dtypes;
event.callstack = result_string.str();
event.process_id = op_supplement_events.process_id;
event.thread_id = tid;
collector->AddOperatorSupplementEvent(std::move(event));
}
}
}
}  // namespace

void HostTracer::PrepareTracing() {
@@ -60,16 +135,21 @@ void HostTracer::PrepareTracing() {

void HostTracer::StartTracing() {
  PADDLE_ENFORCE_EQ(
      state_ == TracerState::READY || state_ == TracerState::STOPED,
      true,
      platform::errors::PreconditionNotMet("TracerState must be READY"));
  HostEventRecorder<CommonEvent>::GetInstance().GatherEvents();
  HostEventRecorder<CommonMemEvent>::GetInstance().GatherEvents();
  HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance()
      .GatherEvents();
  HostTraceLevel::GetInstance().SetLevel(options_.trace_level);
  state_ = TracerState::STARTED;
}

void HostTracer::StopTracing() {
  PADDLE_ENFORCE_EQ(
      state_,
      TracerState::STARTED,
      platform::errors::PreconditionNotMet("TracerState must be STARTED"));
  HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled);
  state_ = TracerState::STOPED;
@@ -77,11 +157,19 @@ void HostTracer::StopTracing() {

void HostTracer::CollectTraceData(TraceEventCollector* collector) {
  PADDLE_ENFORCE_EQ(
      state_,
      TracerState::STOPED,
      platform::errors::PreconditionNotMet("TracerState must be STOPED"));
  HostEventSection<CommonEvent> host_events =
      HostEventRecorder<CommonEvent>::GetInstance().GatherEvents();
  ProcessHostEvents(host_events, collector);
HostEventSection<CommonMemEvent> host_mem_events =
HostEventRecorder<CommonMemEvent>::GetInstance().GatherEvents();
ProcessHostMemEvents(host_mem_events, collector);
HostEventSection<OperatorSupplementOriginEvent> op_supplement_events =
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance()
.GatherEvents();
ProcessOperatorSupplementEvents(op_supplement_events, collector);
}

}  // namespace platform
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace platform {
// Memory event tracing. A trace marks memory manipulation such as allocation
// and free.
// The events can be used to draw a memory variation curve.
class RecordMemEvent {
public:
/**
   * @param ptr: Pointer address allocated or freed.
   * @param place: Device for this memory event.
   * @param size: Memory size allocated or freed.
   * @param type: Denotes the manipulation type for this memory event.
*/
explicit RecordMemEvent(
const void* ptr,
const Place& place,
size_t size,
const TracerMemEventType type = TracerMemEventType::Allocate);
};
} // namespace platform
} // namespace paddle
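
// Usage sketch (illustrative, not part of this commit): a matched
// allocate/free pair recorded through RecordMemEvent for a CPU buffer,
// mirroring what the TestHostTracerForMem test later in this diff
// exercises. The helper function is an assumption; only RecordMemEvent
// and its arguments come from this header.
//
// void TrackScratchBuffer() {
//   void* buf = std::malloc(1024);
//   paddle::platform::RecordMemEvent(
//       buf, paddle::platform::CPUPlace(), 1024,
//       paddle::platform::TracerMemEventType::Allocate);
//   std::free(buf);
//   paddle::platform::RecordMemEvent(
//       buf, paddle::platform::CPUPlace(), 1024,
//       paddle::platform::TracerMemEventType::Free);
// }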
@@ -23,6 +23,8 @@
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/profiler.h"
@@ -41,10 +43,10 @@ TEST(ProfilerTest, TestHostTracer) {
  profiler->Prepare();
  profiler->Start();
  {
    RecordInstantEvent(
        "TestTraceLevel_record1", TracerEventType::UserDefined, 2);
    RecordInstantEvent(
        "TestTraceLevel_record2", TracerEventType::UserDefined, 3);
  }
  auto profiler_result = profiler->Stop();
  auto nodetree = profiler_result->GetNodeTrees();
@@ -93,3 +95,49 @@ TEST(ProfilerTest, TestCudaTracer) {
  EXPECT_GT(runtime_events.size(), 0u);
#endif
}
TEST(ProfilerTest, TestHostTracerForMem) {
using paddle::platform::CPUPlace;
using paddle::platform::EnableHostEventRecorder;
using paddle::platform::MemTraceEventNode;
using paddle::platform::Profiler;
using paddle::platform::ProfilerOptions;
using paddle::platform::ProfilerResult;
using paddle::platform::RecordEvent;
using paddle::platform::RecordInstantEvent;
using paddle::platform::RecordMemEvent;
using paddle::platform::TracerEventType;
using paddle::platform::TracerMemEventType;
ProfilerOptions options;
options.trace_level = 1;
options.trace_switch = 3;
auto profiler = Profiler::Create(options);
EXPECT_TRUE(profiler);
EnableHostEventRecorder();
profiler->Prepare();
profiler->Start();
{
RecordEvent event1(
"TestTracerForMem_phase1", TracerEventType::UserDefined, 1);
RecordMemEvent(reinterpret_cast<void*>(0),
CPUPlace(),
1024,
TracerMemEventType::Allocate);
RecordMemEvent(
reinterpret_cast<void*>(0), CPUPlace(), 1024, TracerMemEventType::Free);
}
{
RecordEvent event2(
"TestTracerForMem_phase2", TracerEventType::UserDefined, 1);
RecordMemEvent(reinterpret_cast<void*>(1024),
CPUPlace(),
1024,
TracerMemEventType::Allocate);
RecordMemEvent(reinterpret_cast<void*>(1024),
CPUPlace(),
1024,
TracerMemEventType::Free);
}
auto profiler_result = profiler->Stop();
auto nodetree = profiler_result->GetNodeTrees();
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace framework {
class RuntimeContext;
}
namespace platform {
class RecordOpInfoSupplement {
public:
/**
* @param type: Operator type name.
   * @param attrs: Attribute map of the op.
   * @param shape_ctx: InferShape context object.
* @param ctx: Runtime context object.
*/
explicit RecordOpInfoSupplement(const std::string& type,
const framework::AttributeMap& attrs,
const framework::InferShapeContext& shape_ctx,
const framework::RuntimeContext& ctx);
};
} // namespace platform
} // namespace paddle
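
// Usage sketch (illustrative, not part of this commit): emitting a
// supplement record immediately after an operator's InferShape pass so the
// profiler can attach input shapes and dtypes to the op event. The
// surrounding function is an assumption; the RecordOpInfoSupplement call
// matches the constructor declared above.
//
// void AfterInferShape(const framework::OperatorBase& op,
//                      const framework::InferShapeContext& shape_ctx,
//                      const framework::RuntimeContext& runtime_ctx) {
//   platform::RecordOpInfoSupplement(
//       op.Type(), op.Attrs(), shape_ctx, runtime_ctx);
// }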
@@ -382,7 +382,8 @@ static T PyObjectCast(PyObject *obj) {
  } catch (py::cast_error &) {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "Python object is not type of %s, the real type is %s",
        typeid(T).name(),
        obj->ob_type->tp_name));
  }
}
@@ -441,7 +442,8 @@ static std::vector<std::string> inline GetNameList(
}

static void inline CreateVariableIfNotExit(
    const py::handle &py_handle,
    const framework::Scope &scope,
    const framework::Executor *exe = nullptr) {
  std::vector<std::string> vec_res;
@@ -479,7 +481,8 @@ static void inline CreateVariableIfNotExit(
      PyObject *py_var_desc =
          PyObject_GetAttrString(PyList_GET_ITEM(py_obj, i), kVarDescField);
      PADDLE_ENFORCE_NOT_NULL(
          py_var_desc,
          platform::errors::InvalidArgument(
              "The var_desc of parameter to set is None"));
      auto var_desc = PyObjectCast<framework::VarDesc>(py_var_desc);
      Py_DECREF(py_var_desc);
@@ -515,7 +518,8 @@ static void AssertStaticGraphAndDygraphGradMakerNoDiff() {
      }
    }
  }
  PADDLE_ENFORCE_EQ(ops.empty(),
                    true,
                    platform::errors::Unimplemented(
                        "OperatorWithKernel [%s] have only static graph grad "
                        "maker or have only dygraph grad maker, which is not "
@@ -537,8 +541,10 @@ static int GetNCCLVersion() {
#endif

template <typename PlaceType>
static void TensorCopyFrom(framework::Tensor *dst,
                           const framework::Tensor &src,
                           const PlaceType &place,
                           int64_t batch_size) {
  if (batch_size < 0) {
    framework::TensorCopy(src, place, dst);
  } else {
@@ -624,7 +630,8 @@ PYBIND11_MODULE(core_noavx, m) {
            PyCapsule_GetPointer(dltensor->ptr(), "dltensor"));
        PADDLE_ENFORCE_NOT_NULL(
            dmt,
            platform::errors::InvalidArgument(
                "from_dlpack received an invalid capsule. "
                "Note that a DLPack tensor can be consumed only once."));
@@ -644,7 +651,8 @@ PYBIND11_MODULE(core_noavx, m) {
  });

  m.def("_create_loaded_parameter",
        [](const py::handle &vec_var_list,
           const Scope &scope,
           const Executor *executor) {
          CreateVariableIfNotExit(vec_var_list, scope, executor);
        });
@@ -682,8 +690,9 @@ PYBIND11_MODULE(core_noavx, m) {
            << ", sci_mode=" << print_opt.sci_mode;
      });

  m.def(
      "broadcast_shape",
      [](const std::vector<int64_t> &x_dim, const std::vector<int64_t> &y_dim) {
        return phi::vectorize(operators::details::BroadcastTwoDims(
            phi::make_ddim(x_dim), phi::make_ddim(y_dim), -1));
      });
@@ -808,14 +817,22 @@ PYBIND11_MODULE(core_noavx, m) {
             self.EmplaceBackOutput(std::move(CastPyArg2Tensor(obj, 1)));
           }
         })
      .def("add_attr",
           [](paddle::CustomOpKernelContext &self, bool attr) {
             self.EmplaceBackAttr(attr);
           })
      .def("add_attr",
           [](paddle::CustomOpKernelContext &self, int attr) {
             self.EmplaceBackAttr(attr);
           })
      .def("add_attr",
           [](paddle::CustomOpKernelContext &self, float attr) {
             self.EmplaceBackAttr(attr);
           })
      .def("add_attr",
           [](paddle::CustomOpKernelContext &self, int64_t attr) {
             self.EmplaceBackAttr(attr);
           })
      .def("add_attr",
           [](paddle::CustomOpKernelContext &self, const std::string &attr) {
             self.EmplaceBackAttr(attr);
@@ -829,13 +846,14 @@ PYBIND11_MODULE(core_noavx, m) {
      .def("add_attr",
           [](paddle::CustomOpKernelContext &self,
              const std::vector<int64_t> &attr) { self.EmplaceBackAttr(attr); })
      .def("add_attr",
           [](paddle::CustomOpKernelContext &self,
              const std::vector<std::string> &attr) {
             self.EmplaceBackAttr(attr);
           });

  py::class_<framework::Tensor> framework_tensor(
      m, "Tensor", py::buffer_protocol());
  g_framework_tensor_pytype =
      reinterpret_cast<PyTypeObject *>(framework_tensor.ptr());
  framework_tensor
@@ -918,80 +936,135 @@ PYBIND11_MODULE(core_noavx, m) {
             self.mutable_data<float>(place);
           })
      .def("_mutable_data",
           [](framework::Tensor &self,
              paddle::platform::CPUPlace &place,
              paddle::framework::proto::VarType::Type type) {
             return reinterpret_cast<uintptr_t>(
                 self.mutable_data(place, framework::TransToPhiDataType(type)));
           })
      .def("_mutable_data",
           [](framework::Tensor &self,
              paddle::platform::CustomPlace &place,
              paddle::framework::proto::VarType::Type type) {
             return reinterpret_cast<uintptr_t>(
                 self.mutable_data(place, framework::TransToPhiDataType(type)));
           })
      .def("_mutable_data",
           [](framework::Tensor &self,
              paddle::platform::XPUPlace &place,
              paddle::framework::proto::VarType::Type type) {
             return reinterpret_cast<uintptr_t>(
                 self.mutable_data(place, framework::TransToPhiDataType(type)));
           })
      .def("_mutable_data",
           [](framework::Tensor &self,
              paddle::platform::CUDAPlace &place,
              paddle::framework::proto::VarType::Type type) {
             return reinterpret_cast<uintptr_t>(
                 self.mutable_data(place, framework::TransToPhiDataType(type)));
           })
      .def("_mutable_data",
           [](framework::Tensor &self,
              paddle::platform::CUDAPinnedPlace &place,
              paddle::framework::proto::VarType::Type type) {
             return reinterpret_cast<uintptr_t>(
                 self.mutable_data(place, framework::TransToPhiDataType(type)));
           })
      .def("_mutable_data",
           [](framework::Tensor &self,
              paddle::platform::MLUPlace &place,
              paddle::framework::proto::VarType::Type type) {
             return reinterpret_cast<uintptr_t>(
                 self.mutable_data(place, framework::TransToPhiDataType(type)));
           })
      .def("_clear", &framework::Tensor::clear)
      .def("_mutable_data",
           [](framework::Tensor &self,
              paddle::platform::NPUPlace &place,
              paddle::framework::proto::VarType::Type type) {
             return reinterpret_cast<uintptr_t>(
                 self.mutable_data(place, framework::TransToPhiDataType(type)));
           })
.def("_copy_from", &TensorCopyFrom<paddle::platform::CPUPlace>, .def("_copy_from",
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) &TensorCopyFrom<paddle::platform::CPUPlace>,
.def("_copy_from", &TensorCopyFrom<paddle::platform::CustomPlace>, py::arg("tensor"),
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) py::arg("place"),
.def("_copy_from", &TensorCopyFrom<paddle::platform::XPUPlace>, py::arg("batch_size") = -1)
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("_copy_from",
.def("_copy_from", &TensorCopyFrom<paddle::platform::CUDAPlace>, &TensorCopyFrom<paddle::platform::CustomPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) py::arg("tensor"),
.def("_copy_from", &TensorCopyFrom<paddle::platform::NPUPlace>, py::arg("place"),
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::CUDAPinnedPlace>, .def("_copy_from",
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) &TensorCopyFrom<paddle::platform::XPUPlace>,
.def("_copy_from", &TensorCopyFrom<paddle::platform::MLUPlace>, py::arg("tensor"),
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) py::arg("place"),
.def("_copy_from", &TensorCopyFrom<paddle::platform::Place>, py::arg("batch_size") = -1)
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("_copy_from",
.def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>, &TensorCopyFrom<paddle::platform::CUDAPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) py::arg("tensor"),
.def("set", SetTensorFromPyArray<paddle::platform::CustomPlace>, py::arg("place"),
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) py::arg("batch_size") = -1)
.def("set", SetTensorFromPyArray<paddle::platform::XPUPlace>, .def("_copy_from",
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) &TensorCopyFrom<paddle::platform::NPUPlace>,
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPlace>, py::arg("tensor"),
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) py::arg("place"),
.def("set", SetTensorFromPyArray<paddle::platform::NPUPlace>, py::arg("batch_size") = -1)
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("_copy_from",
.def("set", SetTensorFromPyArray<paddle::platform::IPUPlace>, &TensorCopyFrom<paddle::platform::CUDAPinnedPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) py::arg("tensor"),
.def("set", SetTensorFromPyArray<paddle::platform::MLUPlace>, py::arg("place"),
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) py::arg("batch_size") = -1)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>, .def("_copy_from",
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false, &TensorCopyFrom<paddle::platform::MLUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::Place>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("set",
SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CustomPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::XPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::IPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::MLUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false,
R"DOC( R"DOC(
Set the data of Tensor on place with given numpy array. Set the data of Tensor on place with given numpy array.
@@ -1077,9 +1150,9 @@ PYBIND11_MODULE(core_noavx, m) {
           ostr << self;
           return ostr.str();
         }) /* ------ End of original Tensor ------ */
      .def("__init__",
           [](framework::Tensor &instance,
              const std::vector<std::vector<size_t>>
                  &recursive_sequence_lengths) {
             LoD new_lod;
             new_lod.reserve(recursive_sequence_lengths.size());
@@ -1088,7 +1161,8 @@ PYBIND11_MODULE(core_noavx, m) {
                       std::back_inserter(new_lod));
             LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
             PADDLE_ENFORCE_EQ(
                 CheckLoD(new_offset_lod, -1),
                 true,
                 platform::errors::InvalidArgument(
                     "The provided recursive_sequence_lengths info is "
                     "invalid, "
@@ -1115,12 +1189,14 @@ PYBIND11_MODULE(core_noavx, m) {
             new_lod.reserve(lod.size());
             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
             PADDLE_ENFORCE_EQ(
                 CheckLoD(new_lod, vectorize(self.dims()).front()),
                 true,
                 platform::errors::InvalidArgument(
                     "The provided LoD is invalid, the LoD is %s", new_lod));
             self.set_lod(new_lod);
           },
           py::arg("lod"),
           R"DOC(
           Set LoD of the Tensor.

           Args:
@@ -1142,7 +1218,8 @@ PYBIND11_MODULE(core_noavx, m) {
           )DOC")
      .def(
          "set_recursive_sequence_lengths",
          [](framework::Tensor &self,
             const std::vector<std::vector<size_t>>
                 &recursive_sequence_lengths) {
            // the input recursive_sequence_lengths is length-based
            // level-of-detail info
@@ -1153,7 +1230,8 @@ PYBIND11_MODULE(core_noavx, m) {
                      std::back_inserter(new_lod));
            LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
            PADDLE_ENFORCE_EQ(
                CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
                true,
                platform::errors::InvalidArgument(
                    "The provided recursive_sequence_lengths info is "
                    "invalid, "
@@ -1162,7 +1240,8 @@ PYBIND11_MODULE(core_noavx, m) {
                    new_lod));
            self.set_lod(new_offset_lod);
          },
          py::arg("recursive_sequence_lengths"),
          R"DOC(
           Set LoD of the Tensor according to recursive sequence lengths.

           For example, if recursive_sequence_lengths=[[2, 3]], which means
@@ -1630,7 +1709,8 @@ PYBIND11_MODULE(core_noavx, m) {
            new (&instance) phi::SelectedRows();
          })
      .def("__init__",
           [](phi::SelectedRows &instance,
              const std::vector<int64_t> rows,
              const int64_t &height) {
             new (&instance) phi::SelectedRows(rows, height);
           })
@@ -1693,8 +1773,10 @@ All parameter, weight, gradient are variables in Paddle.
           [](Variable &self, Strings str_list) {
             *self.GetMutable<Strings>() = str_list;
           })
      .def("set_vocab",
           [](Variable &self, Vocab vocab) {
             *self.GetMutable<Vocab>() = vocab;
           })
      .def(
          "get_string_tensor",
          [](Variable &self) { return self.GetMutable<Strings>(); },
@@ -1732,7 +1814,8 @@ All parameter, weight, gradient are variables in Paddle.
      .def(
          "get_reader",
          [](Variable &self) -> framework::ReaderHolder * {
            PADDLE_ENFORCE_EQ(self.IsType<framework::ReaderHolder>(),
                              true,
                              platform::errors::InvalidArgument(
                                  "The variable is not type of ReaderHolder."));
            return self.GetMutable<framework::ReaderHolder>();
@@ -1743,7 +1826,8 @@ All parameter, weight, gradient are variables in Paddle.
          [](Variable &self) -> Scope * {
            auto scope_vec = self.GetMutable<std::vector<framework::Scope *>>();
            PADDLE_ENFORCE_GT(
                scope_vec->size(),
                0,
                platform::errors::InvalidArgument(
                    "The size of scope_vec should be greater than 0"));
            return scope_vec->front();
@@ -1801,7 +1885,9 @@ All parameter, weight, gradient are variables in Paddle.
          out (core.Variable): the found or created variable.
        )DOC",
        py::return_value_policy::reference)
      .def("find_var",
           &Scope::FindVar,
           py::arg("name"),
           R"DOC(
           Find variable named :code:`name` in the current scope or
           its parent scope. Return None if not found.
@@ -1814,7 +1900,9 @@ All parameter, weight, gradient are variables in Paddle.
           )DOC",
           py::return_value_policy::reference)
      .def("size", &Scope::Size)
      .def("erase",
           &Scope::EraseVars,
           py::arg("names"),
           R"DOC(
           Find variable named :code:`name` in the current scope or
           its parent scope. Return None if not found.
@@ -1827,7 +1915,8 @@ All parameter, weight, gradient are variables in Paddle.
           )DOC",
           py::return_value_policy::reference)
      .def(
          "new_scope",
          [](Scope &self) -> Scope * { return &self.NewScope(); },
          R"DOC(
          Create a new sub-scope of the current scope.
@@ -1835,7 +1924,8 @@ All parameter, weight, gradient are variables in Paddle.
          out (core._Scope): the created sub-scope.
        )DOC",
          py::return_value_policy::reference)
      .def("drop_kids",
           &Scope::DropKids,
           R"DOC(
           Delete all sub-scopes of the current scope.
           )DOC")
@@ -1865,7 +1955,8 @@ All parameter, weight, gradient are variables in Paddle.
        if (info.HasOpProtoAndChecker()) {
          std::string str;
          PADDLE_ENFORCE_EQ(
              info.Proto().SerializeToString(&str),
              true,
              platform::errors::Fatal(
                  "Serialize OpProto Error. This could be a bug of Paddle."));
          ret_values.emplace_back(str);
@@ -1886,18 +1977,20 @@ All parameter, weight, gradient are variables in Paddle.
    }
    return res;
  });
  m.def("get_grad_op_desc",
        [](const OpDesc &op_desc,
           const std::unordered_set<std::string> &no_grad_set,
           const std::vector<BlockDesc *> &grad_sub_block) {
          std::unordered_map<std::string, std::string> grad_to_var;
          std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
              framework::OpInfoMap::Instance()
                  .Get(op_desc.Type())
                  .GradOpMaker()(
                      op_desc, no_grad_set, &grad_to_var, grad_sub_block);
          std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
          std::transform(
              grad_op_descs.begin(),
              grad_op_descs.end(),
              grad_op_desc_ptrs.begin(),
              [](std::unique_ptr<OpDesc> &p) { return p.release(); });
          return std::make_pair(grad_op_desc_ptrs, grad_to_var);
@@ -1914,7 +2007,8 @@ All parameter, weight, gradient are variables in Paddle.
    return framework::OpInfoMap::Instance().Get(op_type).HasInferInplace();
  });
m.def("infer_no_need_buffer_slots", m.def("infer_no_need_buffer_slots",
[](const std::string op_type, const framework::VariableNameMap &inputs, [](const std::string op_type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs, const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs) { const framework::AttributeMap &attrs) {
auto infer_func = framework::OpInfoMap::Instance() auto infer_func = framework::OpInfoMap::Instance()
...@@ -1927,7 +2021,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1927,7 +2021,8 @@ All parameter, weight, gradient are variables in Paddle.
return empty; return empty;
} }
}); });
m.def("prune", [](const ProgramDesc &origin, m.def("prune",
[](const ProgramDesc &origin,
const std::set<std::string> &feeded_var_names, const std::set<std::string> &feeded_var_names,
const std::vector<std::array<size_t, 2>> &targets) { const std::vector<std::array<size_t, 2>> &targets) {
ProgramDesc prog_with_targets(origin); ProgramDesc prog_with_targets(origin);
@@ -2168,7 +2263,8 @@ All parameter, weight, gradient are variables in Paddle.
#endif
    return devices;
  });

  py::class_<platform::CustomPlace> customplace(m,
                                                "CustomPlace",
                                                R"DOC(
    CustomPlace is a descriptor of a device.
    It represents a custom device on which a tensor will be allocated and a model will run.
@@ -2182,7 +2278,8 @@ All parameter, weight, gradient are variables in Paddle.
  g_customplace_pytype = reinterpret_cast<PyTypeObject *>(customplace.ptr());
  customplace
      .def("__init__",
           [](platform::CustomPlace &self,
              const std::string &device_type,
              int dev_id) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
             if (UNLIKELY(dev_id < 0)) {
@@ -2190,7 +2287,8 @@ All parameter, weight, gradient are variables in Paddle.
                   "Invalid CustomPlace(%s, %d), device id must be 0 "
                   "or "
                   "positive integer",
                   device_type,
                   dev_id);
               std::exit(-1);
             }
@@ -2211,7 +2309,11 @@ All parameter, weight, gradient are variables in Paddle.
                   "inside "
                   "[0, %d), because %s "
                   "number on your machine is %d",
                   device_type,
                   dev_id,
                   dev_count,
                   device_type,
                   dev_count);
               std::exit(-1);
             }
           }
@@ -2221,7 +2323,8 @@ All parameter, weight, gradient are variables in Paddle.
             LOG(ERROR) << string::Sprintf(
                 "Invalid CustomPlace(%s, %d), the device type is "
                 "not registered "
                 "as a custom device.",
                 device_type,
                 dev_id);
             std::exit(-1);
           }
#else
@@ -2293,7 +2396,8 @@ All parameter, weight, gradient are variables in Paddle.
        LOG(ERROR) << string::Sprintf(
            "Invalid CUDAPlace(%d), must inside [0, %d), because GPU "
            "number on your machine is %d",
            dev_id,
            platform::GetGPUDeviceCount(),
            platform::GetGPUDeviceCount());
        std::exit(-1);
      }
@@ -2359,7 +2463,8 @@ All parameter, weight, gradient are variables in Paddle.
        LOG(ERROR) << string::Sprintf(
            "Invalid XPUPlace(%d), must inside [0, %d), because XPU "
            "number on your machine is %d",
            dev_id,
            platform::GetXPUDeviceCount(),
            platform::GetXPUDeviceCount());
        std::exit(-1);
      }
@@ -2524,7 +2629,8 @@ All parameter, weight, gradient are variables in Paddle.
        LOG(ERROR) << string::Sprintf(
            "Invalid NPUPlace(%d), must inside [0, %d), because NPU "
            "number on your machine is %d",
            dev_id,
            platform::GetNPUDeviceCount(),
            platform::GetNPUDeviceCount());
        std::exit(-1);
      }
@@ -2640,7 +2746,8 @@ All parameter, weight, gradient are variables in Paddle.
        LOG(ERROR) << string::Sprintf(
            "Invalid MLUPlace(%d), must inside [0, %d), because MLU "
            "number on your machine is %d",
            dev_id,
            platform::GetMLUDeviceCount(),
            platform::GetMLUDeviceCount());
        std::exit(-1);
      }
@@ -2713,8 +2820,10 @@ All parameter, weight, gradient are variables in Paddle.
      .def("mlu_device_id", [](platform::Place &self) { return self.device; })
      .def("custom_device_id",
           [](platform::Place &self) { return self.device; })
      .def("set_place",
           [](platform::Place &self, const platform::Place &other) {
             self = other;
           })
      .def("set_place",
           [](platform::Place &self, const platform::CPUPlace &cpu_place) {
             self = cpu_place;
@@ -2759,7 +2868,8 @@ All parameter, weight, gradient are variables in Paddle.
                          true,
                          platform::errors::InvalidArgument(
                              "Cannot parse user input to OpDesc"));
        PADDLE_ENFORCE_EQ(desc.IsInitialized(),
                          true,
                          platform::errors::InvalidArgument(
                              "The provided OpDesc is not "
                              "initialized, the reason is: %s",
@@ -2767,43 +2877,50 @@ All parameter, weight, gradient are variables in Paddle.
        return OpRegistry::CreateOp(desc);
      })
      .def("run",
           [](OperatorBase &self,
              const Scope &scope,
              const platform::CPUPlace &place) {
             pybind11::gil_scoped_release release;
             self.Run(scope, place);
           })
      .def("run",
           [](OperatorBase &self,
              const Scope &scope,
              const platform::XPUPlace &place) {
             pybind11::gil_scoped_release release;
             self.Run(scope, place);
           })
      .def("run",
           [](OperatorBase &self,
              const Scope &scope,
              const platform::NPUPlace &place) {
             pybind11::gil_scoped_release release;
             self.Run(scope, place);
           })
      .def("run",
           [](OperatorBase &self,
              const Scope &scope,
              const platform::CUDAPlace &place) {
             pybind11::gil_scoped_release release;
             self.Run(scope, place);
           })
      .def("run",
           [](OperatorBase &self,
              const Scope &scope,
              const platform::CUDAPinnedPlace &place) {
             pybind11::gil_scoped_release release;
             self.Run(scope, place);
           })
      .def("run",
           [](OperatorBase &self,
              const Scope &scope,
              const platform::MLUPlace &place) {
             pybind11::gil_scoped_release release;
             self.Run(scope, place);
           })
      .def("run",
           [](OperatorBase &self,
              const Scope &scope,
              const platform::CustomPlace &place) {
             pybind11::gil_scoped_release release;
             self.Run(scope, place);
...@@ -2843,13 +2960,17 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2843,13 +2960,17 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<framework::Executor>(m, "Executor") py::class_<framework::Executor>(m, "Executor")
.def(py::init<const platform::Place &>()) .def(py::init<const platform::Place &>())
.def("close", &Executor::Close) .def("close", &Executor::Close)
.def("run_from_dataset", &Executor::RunFromDataset, .def("run_from_dataset",
&Executor::RunFromDataset,
py::call_guard<py::gil_scoped_release>()) py::call_guard<py::gil_scoped_release>())
.def("release_trainer", &Executor::ReleaseTrainer, .def("release_trainer",
&Executor::ReleaseTrainer,
py::call_guard<py::gil_scoped_release>()) py::call_guard<py::gil_scoped_release>())
.def("init_for_dataset", .def("init_for_dataset",
[](Executor &self, const ProgramDesc &prog, [](Executor &self,
const std::string &trainer_desc, Scope *scope, const ProgramDesc &prog,
const std::string &trainer_desc,
Scope *scope,
Dataset *dataset) -> std::shared_ptr<TrainerBase> { Dataset *dataset) -> std::shared_ptr<TrainerBase> {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
return self.InitForDataset(prog, trainer_desc, scope, dataset); return self.InitForDataset(prog, trainer_desc, scope, dataset);
...@@ -2860,40 +2981,62 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2860,40 +2981,62 @@ All parameter, weight, gradient are variables in Paddle.
self.RunFromDataset(trainer); self.RunFromDataset(trainer);
}) })
.def("run_prepared_ctx", .def("run_prepared_ctx",
[](Executor &self, ExecutorPrepareContext *ctx, Scope *scope, [](Executor &self,
ExecutorPrepareContext *ctx,
Scope *scope,
std::map<std::string, const LoDTensor *> *feed_targets, std::map<std::string, const LoDTensor *> *feed_targets,
std::map<std::string, FetchType *> *fetch_targets, std::map<std::string, FetchType *> *fetch_targets,
bool create_local_scope = true, bool create_vars = true, bool create_local_scope = true,
bool create_vars = true,
const std::string &feed_holder_name = "feed", const std::string &feed_holder_name = "feed",
const std::string &fetch_holder_name = "fetch") { const std::string &fetch_holder_name = "fetch") {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
self.RunPreparedContext(ctx, scope, feed_targets, fetch_targets, self.RunPreparedContext(ctx,
create_local_scope, create_vars, scope,
feed_holder_name, fetch_holder_name); feed_targets,
fetch_targets,
create_local_scope,
create_vars,
feed_holder_name,
fetch_holder_name);
}) })
.def("run_prepared_ctx", .def("run_prepared_ctx",
[](Executor &self, ExecutorPrepareContext *ctx, Scope *scope, [](Executor &self,
bool create_local_scope = true, bool create_vars = true, ExecutorPrepareContext *ctx,
Scope *scope,
bool create_local_scope = true,
bool create_vars = true,
bool keep_kids = false) { bool keep_kids = false) {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
self.RunPreparedContext(ctx, scope, create_local_scope, self.RunPreparedContext(
create_vars, keep_kids); ctx, scope, create_local_scope, create_vars, keep_kids);
}) })
.def("prepare", .def("prepare",
[](Executor &self, const ProgramDesc &program, int block_id, [](Executor &self,
const ProgramDesc &program,
int block_id,
const std::vector<std::string> &skip_ref_cnt_vars = const std::vector<std::string> &skip_ref_cnt_vars =
std::vector<std::string>(), std::vector<std::string>(),
bool force_disable_gc = false) { bool force_disable_gc = false) {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
return self.Prepare(program, block_id, skip_ref_cnt_vars, return self.Prepare(
force_disable_gc); program, block_id, skip_ref_cnt_vars, force_disable_gc);
}) })
.def("create_variables", &Executor::CreateVariables) .def("create_variables", &Executor::CreateVariables)
.def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope, .def("run",
int block_id, bool create_local_scope, bool create_vars, [](Executor &self,
const ProgramDesc &prog,
Scope *scope,
int block_id,
bool create_local_scope,
bool create_vars,
const std::vector<std::string> &fetch_vars) { const std::vector<std::string> &fetch_vars) {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
self.Run(prog, scope, block_id, create_local_scope, create_vars, self.Run(prog,
scope,
block_id,
create_local_scope,
create_vars,
fetch_vars); fetch_vars);
}); });
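Reviewer note: every run/prepare binding above wraps the C++ call in a pybind11::gil_scoped_release so other Python threads can run while the executor works. A minimal standalone sketch of that pattern, assuming a hypothetical module demo_gil (not part of this PR):

#include <pybind11/pybind11.h>
#include <chrono>
#include <thread>

namespace py = pybind11;

PYBIND11_MODULE(demo_gil, m) {
  m.def("long_task", []() {
    // Drop the GIL for the duration of this scope, as the bindings above do
    // around self.Run(...); it is re-acquired when `release` is destroyed.
    py::gil_scoped_release release;
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
  });
}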
...@@ -2906,8 +3049,10 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2906,8 +3049,10 @@ All parameter, weight, gradient are variables in Paddle.
}); });
py::class_<framework::StandaloneExecutor>(m, "StandaloneExecutor") py::class_<framework::StandaloneExecutor>(m, "StandaloneExecutor")
.def(py::init<const platform::Place &, const ProgramDesc &, .def(py::init<const platform::Place &,
const ProgramDesc &, Scope *>()) const ProgramDesc &,
const ProgramDesc &,
Scope *>())
.def("run", .def("run",
[](StandaloneExecutor &self, [](StandaloneExecutor &self,
const std::unordered_map<std::string, py::array> &input_dict, const std::unordered_map<std::string, py::array> &input_dict,
...@@ -2951,7 +3096,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2951,7 +3096,8 @@ All parameter, weight, gradient are variables in Paddle.
return py::cast(std::move(ret)); return py::cast(std::move(ret));
}) })
.def("run", .def("run",
[](StandaloneExecutor &self, std::vector<std::string> feed_names, [](StandaloneExecutor &self,
std::vector<std::string> feed_names,
std::vector<std::string> fetch_names) { std::vector<std::string> fetch_names) {
paddle::framework::FetchList ret; paddle::framework::FetchList ret;
{ {
...@@ -3036,20 +3182,27 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3036,20 +3182,27 @@ All parameter, weight, gradient are variables in Paddle.
m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue); m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue);
m.def( m.def(
"run_cmd", "run_cmd",
[](const std::string &cmd, int time_out = -1, [](const std::string &cmd,
int time_out = -1,
int sleep_inter = -1) -> const std::string { int sleep_inter = -1) -> const std::string {
return paddle::framework::shell_get_command_output(cmd, time_out, return paddle::framework::shell_get_command_output(
sleep_inter); cmd, time_out, sleep_inter);
}, },
py::arg("cmd"), py::arg("time_out") = -1, py::arg("sleep_inter") = -1); py::arg("cmd"),
py::arg("time_out") = -1,
py::arg("sleep_inter") = -1);
m.def( m.def(
"shell_execute_cmd", "shell_execute_cmd",
[](const std::string &cmd, int time_out = 0, int sleep_inter = 0, [](const std::string &cmd,
int time_out = 0,
int sleep_inter = 0,
bool redirect_stderr = false) -> std::vector<std::string> { bool redirect_stderr = false) -> std::vector<std::string> {
return paddle::framework::shell_execute_cmd(cmd, time_out, sleep_inter, return paddle::framework::shell_execute_cmd(
redirect_stderr); cmd, time_out, sleep_inter, redirect_stderr);
}, },
py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0, py::arg("cmd"),
py::arg("time_out") = 0,
py::arg("sleep_inter") = 0,
py::arg("redirect_stderr") = false); py::arg("redirect_stderr") = false);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
...@@ -3064,13 +3217,16 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3064,13 +3217,16 @@ All parameter, weight, gradient are variables in Paddle.
#endif #endif
m.def("set_feed_variable", m.def("set_feed_variable",
static_cast<void (*)(Scope *, const LoDTensor &, const std::string &, static_cast<void (*)(
size_t)>(&framework::SetFeedVariable)); Scope *, const LoDTensor &, const std::string &, size_t)>(
&framework::SetFeedVariable));
m.def("set_feed_variable", m.def("set_feed_variable",
static_cast<void (*)(Scope *, const Strings &, const std::string &, static_cast<void (*)(
size_t)>(&framework::SetFeedVariable)); Scope *, const Strings &, const std::string &, size_t)>(
&framework::SetFeedVariable));
m.def("get_fetch_variable", m.def("get_fetch_variable",
[](const Scope &scope, const std::string &var_name, [](const Scope &scope,
const std::string &var_name,
size_t index) -> py::object { size_t index) -> py::object {
auto &var = framework::GetFetchVariable(scope, var_name, index); auto &var = framework::GetFetchVariable(scope, var_name, index);
if (data_is_lod_tensor(var)) { if (data_is_lod_tensor(var)) {
...@@ -3125,7 +3281,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3125,7 +3281,8 @@ All parameter, weight, gradient are variables in Paddle.
.def("__len__", [](LoDTensorArray &self) { return self.size(); }) .def("__len__", [](LoDTensorArray &self) { return self.size(); })
.def("__setitem__", .def("__setitem__",
[](LoDTensorArray &self, size_t i, const LoDTensor &t) { [](LoDTensorArray &self, size_t i, const LoDTensor &t) {
PADDLE_ENFORCE_LT(i, self.size(), PADDLE_ENFORCE_LT(i,
self.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The index to set is larger than the size " "The index to set is larger than the size "
"of LoDTensorArray.")); "of LoDTensorArray."));
...@@ -3139,7 +3296,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3139,7 +3296,8 @@ All parameter, weight, gradient are variables in Paddle.
self.back().ShareDataWith(t); self.back().ShareDataWith(t);
self.back().set_lod(t.lod()); self.back().set_lod(t.lod());
}, },
py::arg("tensor"), R"DOC( py::arg("tensor"),
R"DOC(
Append a LoDTensor to LoDTensorArray. Append a LoDTensor to LoDTensorArray.
Args: Args:
...@@ -3376,16 +3534,18 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3376,16 +3534,18 @@ All parameter, weight, gradient are variables in Paddle.
m.def("reset_profiler", platform::ResetProfiler); m.def("reset_profiler", platform::ResetProfiler);
m.def("register_pass", [](const std::string &pass_type, py::object callable) { m.def("register_pass", [](const std::string &pass_type, py::object callable) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
framework::ir::PassRegistry::Instance().Has(pass_type), false, framework::ir::PassRegistry::Instance().Has(pass_type),
false,
platform::errors::AlreadyExists("Pass '%s' is registered more than " platform::errors::AlreadyExists("Pass '%s' is registered more than "
"once. Please use another name.", "once. Please use another name.",
pass_type)); pass_type));
callable.inc_ref(); callable.inc_ref();
framework::ir::PassRegistry::Instance().Insert(pass_type, [pass_type, framework::ir::PassRegistry::Instance().Insert(
callable]() { pass_type, [pass_type, callable]() {
py::gil_scoped_acquire guard; py::gil_scoped_acquire guard;
std::unique_ptr<framework::ir::Pass> pass( std::unique_ptr<framework::ir::Pass> pass(
new framework::ir::GeneratePass(py::cast<std::string>(callable()))); new framework::ir::GeneratePass(
py::cast<std::string>(callable())));
return pass; return pass;
}); });
}); });
...@@ -3397,11 +3557,32 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3397,11 +3557,32 @@ All parameter, weight, gradient are variables in Paddle.
m.def("size_of_dtype", framework::SizeOfType); m.def("size_of_dtype", framework::SizeOfType);
py::class_<paddle::platform::ProfilerResult>(m, "_ProfilerResult") py::class_<paddle::platform::ProfilerResult>(m, "_ProfilerResult")
.def(py::init<>()) .def(py::init<>())
.def("get_data", &paddle::platform::ProfilerResult::GetData, .def("get_data",
&paddle::platform::ProfilerResult::GetData,
py::return_value_policy::automatic_reference) py::return_value_policy::automatic_reference)
.def("save", &paddle::platform::ProfilerResult::Save) .def("save", &paddle::platform::ProfilerResult::Save)
.def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo); .def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo);
py::class_<paddle::platform::MemPythonNode>(m, "MemPythonNode")
.def(py::init<>())
.def_readwrite("timestamp_ns",
&paddle::platform::MemPythonNode::timestamp_ns)
.def_readwrite("addr", &paddle::platform::MemPythonNode::addr)
.def_readwrite("type", &paddle::platform::MemPythonNode::type)
.def_readwrite("process_id", &paddle::platform::MemPythonNode::process_id)
.def_readwrite("thread_id", &paddle::platform::MemPythonNode::thread_id)
.def_readwrite("increase_bytes",
&paddle::platform::MemPythonNode::increase_bytes)
.def_readwrite("place", &paddle::platform::MemPythonNode::place)
.def_readwrite("current_allocated",
&paddle::platform::MemPythonNode::current_allocated)
.def_readwrite("current_reserved",
&paddle::platform::MemPythonNode::current_reserved)
.def_readwrite("peak_allocated",
&paddle::platform::MemPythonNode::peak_allocated)
.def_readwrite("peak_reserved",
&paddle::platform::MemPythonNode::peak_reserved);
py::class_<paddle::platform::DevicePythonNode>(m, "DevicePythonNode") py::class_<paddle::platform::DevicePythonNode>(m, "DevicePythonNode")
.def(py::init<>()) .def(py::init<>())
.def_readwrite("name", &paddle::platform::DevicePythonNode::name) .def_readwrite("name", &paddle::platform::DevicePythonNode::name)
...@@ -3424,15 +3605,22 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3424,15 +3605,22 @@ All parameter, weight, gradient are variables in Paddle.
.def_readwrite("process_id", .def_readwrite("process_id",
&paddle::platform::HostPythonNode::process_id) &paddle::platform::HostPythonNode::process_id)
.def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id) .def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id)
.def_readwrite("input_shapes",
&paddle::platform::HostPythonNode::input_shapes)
.def_readwrite("dtypes", &paddle::platform::HostPythonNode::dtypes)
.def_readwrite("callstack", &paddle::platform::HostPythonNode::callstack)
.def_readwrite("children_node", .def_readwrite("children_node",
&paddle::platform::HostPythonNode::children_node_ptrs) &paddle::platform::HostPythonNode::children_node_ptrs)
.def_readwrite("runtime_node", .def_readwrite("runtime_node",
&paddle::platform::HostPythonNode::runtime_node_ptrs) &paddle::platform::HostPythonNode::runtime_node_ptrs)
.def_readwrite("device_node", .def_readwrite("device_node",
&paddle::platform::HostPythonNode::device_node_ptrs); &paddle::platform::HostPythonNode::device_node_ptrs)
.def_readwrite("mem_node",
&paddle::platform::HostPythonNode::mem_node_ptrs);
py::class_<paddle::platform::Profiler>(m, "_Profiler") py::class_<paddle::platform::Profiler>(m, "_Profiler")
.def("create", &paddle::platform::Profiler::Create, .def("create",
&paddle::platform::Profiler::Create,
py::return_value_policy::take_ownership) py::return_value_policy::take_ownership)
.def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported) .def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported)
.def("is_cnpapi_supported", .def("is_cnpapi_supported",
...@@ -3466,6 +3654,14 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3466,6 +3654,14 @@ All parameter, weight, gradient are variables in Paddle.
})) }))
.def("end", [](platform::RecordEvent *event) { event->End(); }); .def("end", [](platform::RecordEvent *event) { event->End(); });
py::enum_<paddle::platform::TracerMemEventType>(m, "TracerMemEventType")
.value("Allocate", paddle::platform::TracerMemEventType::Allocate)
.value("Free", paddle::platform::TracerMemEventType::Free)
.value("ReservedAllocate",
paddle::platform::TracerMemEventType::ReservedAllocate)
.value("ReservedFree",
paddle::platform::TracerMemEventType::ReservedFree);
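Reviewer note: TracerMemEventType is bound with py::enum_, so the four memory-event kinds become attributes of the enum type on the Python side. A minimal sketch with a hypothetical MemEventType enum mirroring the values above:

#include <pybind11/pybind11.h>

namespace py = pybind11;

// Hypothetical mirror of paddle::platform::TracerMemEventType.
enum class MemEventType { Allocate, Free, ReservedAllocate, ReservedFree };

PYBIND11_MODULE(demo_enum, m) {
  py::enum_<MemEventType>(m, "MemEventType")
      .value("Allocate", MemEventType::Allocate)
      .value("Free", MemEventType::Free)
      .value("ReservedAllocate", MemEventType::ReservedAllocate)
      .value("ReservedFree", MemEventType::ReservedFree);
  // Python side: demo_enum.MemEventType.Allocate
}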
py::enum_<paddle::platform::TracerEventType>(m, "TracerEventType") py::enum_<paddle::platform::TracerEventType>(m, "TracerEventType")
.value("Operator", paddle::platform::TracerEventType::Operator) .value("Operator", paddle::platform::TracerEventType::Operator)
.value("Dataloader", paddle::platform::TracerEventType::Dataloader) .value("Dataloader", paddle::platform::TracerEventType::Dataloader)
...@@ -3509,22 +3705,29 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3509,22 +3705,29 @@ All parameter, weight, gradient are variables in Paddle.
[](ir::Pass &self, const std::string &name, const std::string &attr) { [](ir::Pass &self, const std::string &name, const std::string &attr) {
self.Set<std::string>(name, new std::string(attr)); self.Set<std::string>(name, new std::string(attr));
}) })
.def("set", [](ir::Pass &self, const std::string &name,
bool val) { self.Set<bool>(name, new bool(val)); })
.def("set", [](ir::Pass &self, const std::string &name,
int val) { self.Set<const int>(name, new int(val)); })
.def("set", .def("set",
[](ir::Pass &self, const std::string &name, [](ir::Pass &self, const std::string &name, bool val) {
self.Set<bool>(name, new bool(val));
})
.def("set",
[](ir::Pass &self, const std::string &name, int val) {
self.Set<const int>(name, new int(val));
})
.def("set",
[](ir::Pass &self,
const std::string &name,
std::vector<std::string> set) { std::vector<std::string> set) {
self.Set(name, new std::vector<std::string>(set)); self.Set(name, new std::vector<std::string>(set));
}) })
.def("set", .def("set",
[](ir::Pass &self, const std::string &name, [](ir::Pass &self,
const std::string &name,
std::unordered_set<std::string> set) { std::unordered_set<std::string> set) {
self.Set(name, new std::unordered_set<std::string>(set)); self.Set(name, new std::unordered_set<std::string>(set));
}) })
.def("set", .def("set",
[](ir::Pass &self, const std::string &name, [](ir::Pass &self,
const std::string &name,
std::unordered_set<int> set) { std::unordered_set<int> set) {
self.Set(name, new std::unordered_set<int>(set)); self.Set(name, new std::unordered_set<int>(set));
}) })
...@@ -3769,7 +3972,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3769,7 +3972,8 @@ All parameter, weight, gradient are variables in Paddle.
"reduce_strategy", "reduce_strategy",
[](const BuildStrategy &self) { return self.reduce_; }, [](const BuildStrategy &self) { return self.reduce_; },
[](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) { [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be " "BuildStrategy has been finlaized, cannot be "
"configured again.")); "configured again."));
...@@ -3799,7 +4003,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3799,7 +4003,8 @@ All parameter, weight, gradient are variables in Paddle.
[](const BuildStrategy &self) { return self.gradient_scale_; }, [](const BuildStrategy &self) { return self.gradient_scale_; },
[](BuildStrategy &self, [](BuildStrategy &self,
BuildStrategy::GradientScaleStrategy strategy) { BuildStrategy::GradientScaleStrategy strategy) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be " "BuildStrategy has been finlaized, cannot be "
"configured again.")); "configured again."));
...@@ -3864,7 +4069,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3864,7 +4069,8 @@ All parameter, weight, gradient are variables in Paddle.
"debug_graphviz_path", "debug_graphviz_path",
[](const BuildStrategy &self) { return self.debug_graphviz_path_; }, [](const BuildStrategy &self) { return self.debug_graphviz_path_; },
[](BuildStrategy &self, const std::string &path) { [](BuildStrategy &self, const std::string &path) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be " "BuildStrategy has been finlaized, cannot be "
"configured again.")); "configured again."));
...@@ -3891,7 +4097,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3891,7 +4097,8 @@ All parameter, weight, gradient are variables in Paddle.
return self.enable_sequential_execution_; return self.enable_sequential_execution_;
}, },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be " "BuildStrategy has been finlaized, cannot be "
"configured again.")); "configured again."));
...@@ -3917,7 +4124,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3917,7 +4124,8 @@ All parameter, weight, gradient are variables in Paddle.
return self.remove_unnecessary_lock_; return self.remove_unnecessary_lock_;
}, },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be " "BuildStrategy has been finlaized, cannot be "
"configured again.")); "configured again."));
...@@ -3995,7 +4203,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -3995,7 +4203,8 @@ All parameter, weight, gradient are variables in Paddle.
return self.fuse_elewise_add_act_ops_; return self.fuse_elewise_add_act_ops_;
}, },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be " "BuildStrategy has been finlaized, cannot be "
"configured again.")); "configured again."));
...@@ -4020,7 +4229,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4020,7 +4229,8 @@ All parameter, weight, gradient are variables in Paddle.
"fuse_gemm_epilogue", "fuse_gemm_epilogue",
[](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; }, [](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be " "BuildStrategy has been finlaized, cannot be "
"configured again.")); "configured again."));
...@@ -4045,7 +4255,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4045,7 +4255,8 @@ All parameter, weight, gradient are variables in Paddle.
"fuse_bn_act_ops", "fuse_bn_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_act_ops_; }, [](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be " "BuildStrategy has been finlaized, cannot be "
"configured again.")); "configured again."));
...@@ -4070,7 +4281,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4070,7 +4281,8 @@ All parameter, weight, gradient are variables in Paddle.
"fuse_bn_add_act_ops", "fuse_bn_add_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; }, [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be " "BuildStrategy has been finlaized, cannot be "
"configured again.")); "configured again."));
...@@ -4095,7 +4307,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4095,7 +4307,8 @@ All parameter, weight, gradient are variables in Paddle.
"enable_auto_fusion", "enable_auto_fusion",
[](const BuildStrategy &self) { return self.enable_auto_fusion_; }, [](const BuildStrategy &self) { return self.enable_auto_fusion_; },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be " "BuildStrategy has been finlaized, cannot be "
"configured again.")); "configured again."));
...@@ -4123,7 +4336,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4123,7 +4336,8 @@ All parameter, weight, gradient are variables in Paddle.
return self.fuse_relu_depthwise_conv_; return self.fuse_relu_depthwise_conv_;
}, },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be " "BuildStrategy has been finlaized, cannot be "
"configured again.")); "configured again."));
...@@ -4153,7 +4367,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4153,7 +4367,8 @@ All parameter, weight, gradient are variables in Paddle.
self.fuse_broadcast_ops_ == paddle::none; self.fuse_broadcast_ops_ == paddle::none;
}, },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, " "BuildStrategy has been finlaized, "
"cannot be configured again.")); "cannot be configured again."));
...@@ -4184,7 +4399,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4184,7 +4399,8 @@ All parameter, weight, gradient are variables in Paddle.
self.fuse_all_optimizer_ops_ == paddle::none; self.fuse_all_optimizer_ops_ == paddle::none;
}, },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, " "BuildStrategy has been finlaized, "
"cannot be configured again.")); "cannot be configured again."));
...@@ -4194,7 +4410,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4194,7 +4410,8 @@ All parameter, weight, gradient are variables in Paddle.
"sync_batch_norm", "sync_batch_norm",
[](const BuildStrategy &self) { return self.sync_batch_norm_; }, [](const BuildStrategy &self) { return self.sync_batch_norm_; },
[](BuildStrategy &self, bool b) { [](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true, PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet( platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be " "BuildStrategy has been finlaized, cannot be "
"configured again.")); "configured again."));
...@@ -4348,9 +4565,13 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4348,9 +4565,13 @@ All parameter, weight, gradient are variables in Paddle.
}); });
pe.def(py::init<const std::vector<platform::Place> &, pe.def(py::init<const std::vector<platform::Place> &,
const std::vector<std::string> &, const std::string &, const std::vector<std::string> &,
Scope *, std::vector<Scope *> &, const ExecutionStrategy &, const std::string &,
const BuildStrategy &, ir::Graph *>()) Scope *,
std::vector<Scope *> &,
const ExecutionStrategy &,
const BuildStrategy &,
ir::Graph *>())
// NOTE: even though we return a vec<Scope*>* to Python with reference policy. // NOTE: even though we return a vec<Scope*>* to Python with reference policy.
// We still cannot get local_scope from this vector, since the element // We still cannot get local_scope from this vector, since the element
// of vec<Scope*> will be freed by Python GC. We can only return Scope* // of vec<Scope*> will be freed by Python GC. We can only return Scope*
...@@ -4439,7 +4660,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4439,7 +4660,8 @@ All parameter, weight, gradient are variables in Paddle.
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"Failed to convert type: %s when set IpuStrategy " "Failed to convert type: %s when set IpuStrategy "
"option: %s", "option: %s",
option.get_type(), option_name)); option.get_type(),
option_name));
} }
self.InsertStringOption(option_name, option_val); self.InsertStringOption(option_name, option_val);
} }
...@@ -4447,7 +4669,8 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4447,7 +4669,8 @@ All parameter, weight, gradient are variables in Paddle.
if (option_name.rfind("location_", 0) == 0) { if (option_name.rfind("location_", 0) == 0) {
for (auto option : element.second.cast<py::dict>()) { for (auto option : element.second.cast<py::dict>()) {
self.SetTensorLocation( self.SetTensorLocation(
option_name, option.first.cast<std::string>(), option_name,
option.first.cast<std::string>(),
option.second.cast<std::uint64_t>()); option.second.cast<std::uint64_t>());
} }
} else if (option_name == "replicated_collectives_settings") { } else if (option_name == "replicated_collectives_settings") {
...@@ -4501,17 +4724,19 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -4501,17 +4724,19 @@ All parameter, weight, gradient are variables in Paddle.
PADDLE_THROW(platform::errors::Unimplemented( PADDLE_THROW(platform::errors::Unimplemented(
"Failed to convert value type: %s when set " "Failed to convert value type: %s when set "
"IpuStrategy option: %s", "IpuStrategy option: %s",
option.second.get_type(), option_key)); option.second.get_type(),
option_key));
} }
self.InsertStringPairOption(option_name, option_key, self.InsertStringPairOption(
option_val); option_name, option_key, option_val);
} }
} }
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid IpuStrategy option value type: %s, please check " "Invalid IpuStrategy option value type: %s, please check "
"input value for option: %s", "input value for option: %s",
element.second.get_type(), option_name)); element.second.get_type(),
option_name));
} }
} }
}) })
......