Unverified commit 8dd0a3b9, authored by chenjian, committed by GitHub

record memory and op supplement info (#43550)

* record memory and op supplement info

* update

* update

* fix a bug

* fix memory recording

* fix a bug

* update

* update

* fix a bug

* update

* fix a bug

* fix a bug

* fix a bug

* Revert "fix a bug"

This reverts commit c1d4df52762ba9ae7c7e27cd2ba4fc3a7ed9c7a5.

* fix a bug

* fix format

* fix
Parent e64823c1
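
Apart from the mechanical clang-format reflow (one argument per line in PADDLE_ENFORCE_* and similar calls), the substantive change wires two profiler hooks through the framework: platform::RecordOpInfoSupplement(...) is emitted right after an operator's InferShape runs, and the allocators emit platform::RecordMemEvent(...). Below is a minimal sketch of the op-side pattern, assembled only from calls visible in the hunks that follow; the wrapper function and its parameters are placeholders, and it compiles only inside a Paddle source tree.

#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"

// Sketch, not part of the commit: the recording pattern this diff adds
// around InferShape in OperatorWithKernel::RunImpl and InterpreterCore.
void RunInferShapeTraced(OperatorWithKernel* op, RuntimeContext* runtime_ctx) {
  platform::RecordEvent record_event("infer_shape",
                                     platform::TracerEventType::OperatorInner,
                                     1,
                                     platform::EventRole::kInnerOp);
  RuntimeInferShapeContext infer_shape_ctx(*op, *runtime_ctx);
  op->Info().infer_shape_(&infer_shape_ctx);
  record_event.End();  // close the timing event before attaching supplements
  // New hook: record the op's type, attributes, and input shapes/dtypes
  // so the profiler timeline can display them per op.
  platform::RecordOpInfoSupplement(
      op->Type(), op->Attrs(), infer_shape_ctx, *runtime_ctx);
}
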
......@@ -24,6 +24,7 @@
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/core/kernel_context.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
......@@ -558,6 +559,11 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
op_with_kernel->Info().infer_shape_(
instr_node.InnerInferShapeContext().get());
}
infershape_event.End();
platform::RecordOpInfoSupplement(op->Type(),
op->Attrs(),
*(instr_node.InnerInferShapeContext()),
*(instr_node.InnerRuntimeContext()));
}
}
......
......@@ -31,6 +31,7 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/kernel_context.h"
......@@ -70,7 +71,8 @@ std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
};
static DDim GetDimsDebug(const ScopeBase& scope, const std::string& name,
static DDim GetDimsDebug(const ScopeBase& scope,
const std::string& name,
bool get_actual_dim = false) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
......@@ -264,7 +266,8 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
Type(), platform::TracerEventType::Operator, 1);
auto op_name = platform::OpName(outputs_, Type());
platform::RecordEvent op_name_record_event(
op_name, platform::TracerEventType::Operator,
op_name,
platform::TracerEventType::Operator,
FLAGS_enable_host_event_recorder_hook ? 20 : 1,
platform::EventRole::kUniqueOp);
RunImpl(scope, place);
......@@ -293,9 +296,11 @@ bool OperatorBase::HasInputs(const std::string& name) const {
std::string OperatorBase::Input(const std::string& name) const {
auto& ins = Inputs(name);
PADDLE_ENFORCE_LE(
ins.size(), 1UL,
ins.size(),
1UL,
platform::errors::InvalidArgument(
"Operator %s's input %s should contain only one variable.", type_,
"Operator %s's input %s should contain only one variable.",
type_,
name));
return ins.empty() ? kEmptyVarName : ins[0];
}
......@@ -304,9 +309,10 @@ const std::vector<std::string>& OperatorBase::Inputs(
const std::string& name) const {
auto it = inputs_.find(name);
PADDLE_ENFORCE_NE(
it, inputs_.end(),
platform::errors::NotFound("Operator %s does not have the input %s.",
type_, name));
it,
inputs_.end(),
platform::errors::NotFound(
"Operator %s does not have the input %s.", type_, name));
return it->second;
}
......@@ -321,9 +327,11 @@ bool OperatorBase::HasOutputs(const std::string& name) const {
std::string OperatorBase::Output(const std::string& name) const {
auto& outs = Outputs(name);
PADDLE_ENFORCE_LE(
outs.size(), 1UL,
outs.size(),
1UL,
platform::errors::InvalidArgument(
"Operator %s's output %s should contain only one variable.", type_,
"Operator %s's output %s should contain only one variable.",
type_,
name));
return outs.empty() ? kEmptyVarName : outs[0];
}
......@@ -332,7 +340,8 @@ const std::vector<std::string>& OperatorBase::Outputs(
const std::string& name) const {
auto it = outputs_.find(name);
PADDLE_ENFORCE_NE(
it, outputs_.end(),
it,
outputs_.end(),
platform::errors::NotFound(
"Operator %s does not have an output called %s.", type_, name));
return it->second;
......@@ -480,18 +489,20 @@ void OperatorBase::CheckAllInputOutputSet() const {
for (auto& in : info_->Proto().inputs()) {
if (!in.dispensable() && !in.extra()) {
PADDLE_ENFORCE_NE(
inputs_.find(in.name()), inputs_.end(),
platform::errors::NotFound("Operator %s's input (%s) is not set.",
Type(), in.name()));
inputs_.find(in.name()),
inputs_.end(),
platform::errors::NotFound(
"Operator %s's input (%s) is not set.", Type(), in.name()));
}
}
for (auto& out : info_->Proto().outputs()) {
if (!out.dispensable() && !out.extra()) {
PADDLE_ENFORCE_NE(
outputs_.find(out.name()), outputs_.end(),
platform::errors::NotFound("Operator %s's output (%s) is not set.",
Type(), out.name()));
outputs_.find(out.name()),
outputs_.end(),
platform::errors::NotFound(
"Operator %s's output (%s) is not set.", Type(), out.name()));
}
}
}
......@@ -564,10 +575,12 @@ const Variable* ExecutionContext::InputVar(const std::string& name) const {
if (it == ctx_.inputs.end()) return nullptr;
PADDLE_ENFORCE_LE(
it->second.size(), 1UL,
it->second.size(),
1UL,
platform::errors::InvalidArgument(
"Operator %s's input %s should contain only one variable.",
op_.Type(), name));
op_.Type(),
name));
return it->second.empty() ? nullptr : it->second[0];
}
......@@ -576,10 +589,12 @@ Variable* ExecutionContext::OutputVar(const std::string& name) const {
if (it == ctx_.outputs.end()) return nullptr;
PADDLE_ENFORCE_LE(
it->second.size(), 1UL,
it->second.size(),
1UL,
platform::errors::InvalidArgument(
"Operator %s's output %s should contain only one variable.",
op_.Type(), name));
op_.Type(),
name));
return it->second.empty() ? nullptr : it->second[0];
}
......@@ -594,10 +609,13 @@ const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
}
std::vector<const Tensor*> res;
res.reserve(vars.size());
std::transform(vars.begin(), vars.end(), std::back_inserter(res),
std::transform(vars.begin(),
vars.end(),
std::back_inserter(res),
[&](const Variable* var) -> const Tensor* {
if (var == nullptr) return nullptr;
PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(), true,
PADDLE_ENFORCE_EQ(var->IsType<LoDTensor>(),
true,
platform::errors::InvalidArgument(
"Input variable should be LoDTensor, "
"but the received type is %s.",
......@@ -617,7 +635,9 @@ std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
}
std::vector<Tensor*> res;
res.reserve(vars.size());
std::transform(vars.begin(), vars.end(), std::back_inserter(res),
std::transform(vars.begin(),
vars.end(),
std::back_inserter(res),
[&](Variable* var) -> Tensor* {
return var == nullptr ? nullptr
: var->GetMutable<LoDTensor>();
......@@ -675,7 +695,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
const auto& in = it->second;
if (in.size() == 0) return false;
PADDLE_ENFORCE_EQ(
in.size(), 1UL,
in.size(),
1UL,
platform::errors::InvalidArgument(
"Input %s should not contain more than one inputs.", name));
return in[0] != nullptr;
......@@ -693,7 +714,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
return false;
}
PADDLE_ENFORCE_EQ(
out.size(), 1UL,
out.size(),
1UL,
platform::errors::InvalidArgument(
"Output %s should not contain more than one outputs.", name));
return out[0] != nullptr;
......@@ -750,11 +772,14 @@ class RuntimeInferShapeContext : public InferShapeContext {
std::string GetInputNameByIdx(size_t idx) const override {
auto& op_proto =
paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(),
PADDLE_ENFORCE_LT(idx,
op_proto->inputs().size(),
platform::errors::OutOfRange(
"The index should be less than the size of inputs of "
"operator %s, but got index is %d and size is %d",
op_.Type(), idx, op_proto->inputs().size()));
op_.Type(),
idx,
op_proto->inputs().size()));
return op_proto->inputs()[idx].name();
}
......@@ -762,42 +787,55 @@ class RuntimeInferShapeContext : public InferShapeContext {
auto& op_proto =
paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_;
PADDLE_ENFORCE_LT(
idx, op_proto->outputs().size(),
idx,
op_proto->outputs().size(),
platform::errors::OutOfRange(
"The index should be less than the size of outputs of "
"operator %s, but got index is %d and size is %d",
op_.Type(), idx, op_proto->outputs().size()));
op_.Type(),
idx,
op_proto->outputs().size()));
return op_proto->outputs()[idx].name();
}
void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
void ShareDim(const std::string& in,
const std::string& out,
size_t i = 0,
size_t j = 0) override {
auto in_it = ctx_.inputs.find(in);
auto out_it = ctx_.outputs.find(out);
PADDLE_ENFORCE_NE(
in_it, ctx_.inputs.end(),
in_it,
ctx_.inputs.end(),
platform::errors::NotFound("Input %s does not exist.", in));
PADDLE_ENFORCE_NE(
out_it, ctx_.outputs.end(),
out_it,
ctx_.outputs.end(),
platform::errors::NotFound("Output %s does not exist.", out));
PADDLE_ENFORCE_LT(i, in_it->second.size(),
PADDLE_ENFORCE_LT(i,
in_it->second.size(),
platform::errors::InvalidArgument(
"The index of input dimension is out of range, "
"excepted index less than %zu, but received %zu.",
in_it->second.size(), i));
PADDLE_ENFORCE_LT(j, out_it->second.size(),
in_it->second.size(),
i));
PADDLE_ENFORCE_LT(j,
out_it->second.size(),
platform::errors::InvalidArgument(
"The index of output dimension is out of range, "
"excepted index less than %zu, but received %zu.",
out_it->second.size(), j));
out_it->second.size(),
j));
Variable* in_var = in_it->second[i];
Variable* out_var = out_it->second[j];
PADDLE_ENFORCE_EQ(
in_var->Type(), out_var->Type(),
in_var->Type(),
out_var->Type(),
platform::errors::InvalidArgument(
"The type of input (%s) and output (%s) are inconsistent.", in,
"The type of input (%s) and output (%s) are inconsistent.",
in,
out));
if (in_var->IsType<phi::SelectedRows>()) {
......@@ -821,19 +859,22 @@ class RuntimeInferShapeContext : public InferShapeContext {
const std::string& out) const override {
auto in_it = ctx_.inputs.find(in);
auto out_it = ctx_.outputs.find(out);
PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(),
PADDLE_ENFORCE_NE(in_it,
ctx_.inputs.end(),
platform::errors::NotFound(
"Input [%s] found error in Op [%s]", in, op_.Type()));
PADDLE_ENFORCE_NE(
out_it, ctx_.outputs.end(),
platform::errors::NotFound("Output [%s] found error in Op [%s]", out,
op_.Type()));
out_it,
ctx_.outputs.end(),
platform::errors::NotFound(
"Output [%s] found error in Op [%s]", out, op_.Type()));
auto& in_var_list = in_it->second;
auto& out_var_list = out_it->second;
PADDLE_ENFORCE_EQ(
in_var_list.size(), out_var_list.size(),
in_var_list.size(),
out_var_list.size(),
platform::errors::PreconditionNotMet(
"Op [%s]: Input var size should be equal with output var size",
op_.Type()));
......@@ -848,10 +889,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
Variable* in_var = in_var_list[i];
if (!in_var->IsType<LoDTensor>()) return;
Variable* out_var = out_var_list[i];
PADDLE_ENFORCE_EQ(out_var->IsType<LoDTensor>(), true,
PADDLE_ENFORCE_EQ(out_var->IsType<LoDTensor>(),
true,
platform::errors::PreconditionNotMet(
"The %d-th output of Output(%s) must be LoDTensor.",
i, out_var_names[i]));
i,
out_var_names[i]));
auto& in_tensor = in_var->Get<LoDTensor>();
auto* out_tensor = out_var->GetMutable<LoDTensor>();
out_tensor->set_lod(in_tensor.lod());
......@@ -862,32 +905,41 @@ class RuntimeInferShapeContext : public InferShapeContext {
}
}
void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
void ShareLoD(const std::string& in,
const std::string& out,
size_t i = 0,
size_t j = 0) const override {
auto in_it = ctx_.inputs.find(in);
auto out_it = ctx_.outputs.find(out);
PADDLE_ENFORCE_NE(
in_it, ctx_.inputs.end(),
in_it,
ctx_.inputs.end(),
platform::errors::NotFound("Input %s does not exist.", in));
PADDLE_ENFORCE_NE(
out_it, ctx_.outputs.end(),
out_it,
ctx_.outputs.end(),
platform::errors::NotFound("Output %s does not exist.", out));
PADDLE_ENFORCE_LT(i, in_it->second.size(),
PADDLE_ENFORCE_LT(i,
in_it->second.size(),
platform::errors::InvalidArgument(
"The index of input dimension is out of range, "
"excepted index less than %zu, but received %zu.",
in_it->second.size(), i));
PADDLE_ENFORCE_LT(j, out_it->second.size(),
in_it->second.size(),
i));
PADDLE_ENFORCE_LT(j,
out_it->second.size(),
platform::errors::InvalidArgument(
"The index of output dimension is out of range, "
"excepted index less than %zu, but received %zu.",
out_it->second.size(), j));
out_it->second.size(),
j));
Variable* in_var = in_it->second.at(i);
if (!in_var->IsType<LoDTensor>()) return;
Variable* out_var = out_it->second.at(j);
PADDLE_ENFORCE_EQ(
out_var->IsType<LoDTensor>(), true,
out_var->IsType<LoDTensor>(),
true,
platform::errors::InvalidArgument(
"The %zu-th output of Output(%s) must be LoDTensor.", j, out));
auto& in_tensor = in_var->Get<LoDTensor>();
......@@ -922,7 +974,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
"set in the runtime kernel."));
}
void SetLoDLevel(const std::string& out, int32_t lod_level,
void SetLoDLevel(const std::string& out,
int32_t lod_level,
size_t j = 0) const override {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"SetLoDLevel is only used in compile time. The calculation of "
......@@ -965,10 +1018,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
DDim GetInputDim(const std::string& name) const override {
const std::vector<Variable*>& vars = InputVars(name);
PADDLE_ENFORCE_EQ(
vars.size(), 1UL,
vars.size(),
1UL,
platform::errors::InvalidArgument(
"Input(%s) should hold one element, but now it holds %zu elements.",
name, vars.size()));
name,
vars.size()));
return this->GetDim(vars[0]);
}
......@@ -994,10 +1049,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
void SetOutputDim(const std::string& name, const DDim& dim) override {
auto& vars = OutputVars(name);
PADDLE_ENFORCE_EQ(
vars.size(), 1UL,
vars.size(),
1UL,
platform::errors::InvalidArgument("Output(%s) should hold one element, "
"but now it holds %zu elements.",
name, vars.size()));
name,
vars.size()));
SetDim(vars[0], dim);
}
......@@ -1034,7 +1091,9 @@ class RuntimeInferShapeContext : public InferShapeContext {
std::vector<DDim> GetDims(const std::vector<Variable*>& vars) const {
std::vector<DDim> ret;
ret.reserve(vars.size());
std::transform(vars.begin(), vars.end(), std::back_inserter(ret),
std::transform(vars.begin(),
vars.end(),
std::back_inserter(ret),
[this](Variable* var) { return this->GetDim(var); });
return ret;
}
......@@ -1060,12 +1119,14 @@ class RuntimeInferShapeContext : public InferShapeContext {
void SetDims(const std::vector<Variable*>& vars,
const std::vector<DDim>& dims) {
size_t length = vars.size();
PADDLE_ENFORCE_EQ(length, dims.size(),
PADDLE_ENFORCE_EQ(length,
dims.size(),
platform::errors::InvalidArgument(
"The number of input variables do not match the "
"number of input dimensions, the number of variables "
"is %zu, the number of dimensions is %zu.",
length, dims.size()));
length,
dims.size()));
for (size_t i = 0; i < length; ++i) {
if (vars[i] == nullptr) {
continue;
......@@ -1084,9 +1145,12 @@ class RuntimeInferShapeContext : public InferShapeContext {
const std::vector<Variable*>& vars) const {
std::vector<proto::VarType::Type> retv;
retv.resize(vars.size());
std::transform(vars.begin(), vars.end(), retv.begin(),
std::transform(vars.begin(),
vars.end(),
retv.begin(),
std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType),
this, std::placeholders::_1));
this,
std::placeholders::_1));
return retv;
}
......@@ -1098,7 +1162,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
const std::vector<Variable*>& InputVars(const std::string& name) const {
auto it = ctx_.inputs.find(name);
PADDLE_ENFORCE_NE(
it, ctx_.inputs.end(),
it,
ctx_.inputs.end(),
platform::errors::NotFound(
"Operator (%s) does not have the input (%s).", op_.Type(), name));
return it->second;
......@@ -1107,7 +1172,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
const std::vector<Variable*>& OutputVars(const std::string& name) const {
auto it = ctx_.outputs.find(name);
PADDLE_ENFORCE_NE(
it, ctx_.outputs.end(),
it,
ctx_.outputs.end(),
platform::errors::NotFound(
"Operator (%s) does not have the outputs (%s).", op_.Type(), name));
return it->second;
......@@ -1143,20 +1209,23 @@ static void CheckTensorNANOrInf(const std::string& op_type,
return;
}
PADDLE_ENFORCE_NE(
framework::TensorContainsInf(tensor), true,
platform::errors::Fatal("Operator %s output Tensor %s contains Inf.",
op_type, name));
framework::TensorContainsInf(tensor),
true,
platform::errors::Fatal(
"Operator %s output Tensor %s contains Inf.", op_type, name));
PADDLE_ENFORCE_NE(
framework::TensorContainsNAN(tensor), true,
platform::errors::Fatal("Operator %s output Tensor %s contains NAN.",
op_type, name));
framework::TensorContainsNAN(tensor),
true,
platform::errors::Fatal(
"Operator %s output Tensor %s contains NAN.", op_type, name));
}
bool OperatorWithKernel::SupportGPU() const {
auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap(
phi::TransToPhiKernelName(type_));
auto has_phi_kernel =
std::any_of(phi_kernels.begin(), phi_kernels.end(),
std::any_of(phi_kernels.begin(),
phi_kernels.end(),
[](phi::KernelKeyMap::const_reference kern_pair) {
return kern_pair.first.backend() == phi::Backend::GPU;
});
......@@ -1169,7 +1238,8 @@ bool OperatorWithKernel::SupportGPU() const {
} else {
auto& op_kernels = kernel_iter->second;
return std::any_of(
op_kernels.begin(), op_kernels.end(),
op_kernels.begin(),
op_kernels.end(),
[](OpKernelMap::const_reference kern_pair) {
return platform::is_gpu_place(kern_pair.first.place_);
});
......@@ -1181,7 +1251,8 @@ bool OperatorWithKernel::SupportNPU() const {
auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap(
phi::TransToPhiKernelName(type_));
auto has_phi_kernel =
std::any_of(phi_kernels.begin(), phi_kernels.end(),
std::any_of(phi_kernels.begin(),
phi_kernels.end(),
[](phi::KernelKeyMap::const_reference kern_pair) {
return kern_pair.first.backend() == phi::Backend::NPU;
});
......@@ -1194,7 +1265,8 @@ bool OperatorWithKernel::SupportNPU() const {
} else {
auto& op_kernels = kernel_iter->second;
return std::any_of(
op_kernels.begin(), op_kernels.end(),
op_kernels.begin(),
op_kernels.end(),
[](OpKernelMap::const_reference kern_pair) {
return platform::is_npu_place(kern_pair.first.place_);
});
......@@ -1214,7 +1286,8 @@ bool OperatorWithKernel::SupportsMKLDNN(
return false;
}
auto& op_kernels = op_kernel_iter->second;
return std::any_of(op_kernels.begin(), op_kernels.end(),
return std::any_of(op_kernels.begin(),
op_kernels.end(),
[data_type](OpKernelMap::const_reference kern_pair) {
return platform::is_cpu_place(kern_pair.first.place_) &&
kern_pair.first.library_type_ ==
......@@ -1496,10 +1569,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
{
platform::RecordEvent record_event("prepare_data",
platform::TracerEventType::OperatorInner,
1, platform::EventRole::kInnerOp);
1,
platform::EventRole::kInnerOp);
if (need_prepare_data_) {
transfer_scope = PrepareData(scope, *kernel_type_,
&transfered_inplace_vars, runtime_ctx);
transfer_scope = PrepareData(
scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);
}
}
// exec scope is the scope that kernel actually executed on.
......@@ -1509,9 +1583,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
if (!all_kernels_must_compute_runtime_shape_) {
platform::RecordEvent record_event("infer_shape",
platform::TracerEventType::OperatorInner,
1, platform::EventRole::kInnerOp);
1,
platform::EventRole::kInnerOp);
RuntimeInferShapeContext infer_shape_ctx(*this, *runtime_ctx);
this->Info().infer_shape_(&infer_shape_ctx);
record_event.End();
platform::RecordOpInfoSupplement(
Type(), Attrs(), infer_shape_ctx, *runtime_ctx);
}
if (FLAGS_enable_unused_var_check) {
......@@ -1523,7 +1601,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
{
platform::RecordEvent record_event("compute",
platform::TracerEventType::OperatorInner,
1, platform::EventRole::kInnerOp);
1,
platform::EventRole::kInnerOp);
if (run_phi_kernel_) {
phi::KernelContext pt_kernel_context;
// Do data transform before building KernelContext
......@@ -1663,7 +1742,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
auto& all_op_kernels = AllOpKernels();
auto kernels_iter = all_op_kernels.find(type_);
PADDLE_ENFORCE_NE(
kernels_iter, all_op_kernels.end(),
kernels_iter,
all_op_kernels.end(),
platform::errors::Unavailable(
"There are no kernels which are registered in the %s operator.",
type_));
......@@ -1785,9 +1865,11 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound(
"Operator (%s) does not have kernel for %s.", type_,
PADDLE_ENFORCE_NE(
kernel_iter,
kernels.end(),
platform::errors::NotFound("Operator (%s) does not have kernel for %s.",
type_,
KernelTypeToString(expected_kernel_key)));
std::lock_guard<std::mutex> lock(cache_update_mutex_);
......@@ -1798,7 +1880,8 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
}
void OperatorWithKernel::TransferInplaceVarsBack(
const Scope& scope, const std::vector<std::string>& inplace_vars,
const Scope& scope,
const std::vector<std::string>& inplace_vars,
const Scope& transfer_scope) const {
for (auto& var_name : inplace_vars) {
VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
......@@ -1809,7 +1892,8 @@ void OperatorWithKernel::TransferInplaceVarsBack(
auto* original_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(origin_var);
auto* var = transfer_scope.FindVar(var_name);
PADDLE_ENFORCE_NOT_NULL(var, platform::errors::InvalidArgument(
PADDLE_ENFORCE_NOT_NULL(var,
platform::errors::InvalidArgument(
"The variable[%s] is nullptr.", var_name));
auto* transformed_tensor = GetLoDTensorOrSelectedRowsValueFromVar(*var);
auto original_dims = original_tensor->dims();
......@@ -1890,7 +1974,8 @@ void OperatorWithKernel::HandleComplexGradToRealGrad(
}
Scope* OperatorWithKernel::PrepareData(
const Scope& scope, const OpKernelType& expected_kernel_key,
const Scope& scope,
const OpKernelType& expected_kernel_key,
std::vector<std::string>* transfered_inplace_vars,
RuntimeContext* ctx) const {
Scope* new_scope = nullptr;
......@@ -1947,8 +2032,8 @@ Scope* OperatorWithKernel::PrepareData(
input_vars[i] = trans_var;
auto out = trans_var->GetMutable<LoDTensor>();
out->Resize(tensor_in->dims());
platform::MatchShapeToLayout(out, tensor_in->layout(),
DataLayout::kNHWC);
platform::MatchShapeToLayout(
out, tensor_in->layout(), DataLayout::kNHWC);
VLOG(7) << "Created reshaped dummy input based on MKL-DNN Tensor , "
"but kNHWC layout"
<< var_name_item.first << " in Operator " << type_;
......@@ -1995,8 +2080,8 @@ Scope* OperatorWithKernel::PrepareData(
if (!run_by_executor_ &&
(platform::is_gpu_place(kernel_type_for_var.place_) ||
platform::is_gpu_place(expected_kernel_key.place_))) {
new_scope = TryCreateTransferScope(kernel_type_for_var,
expected_kernel_key, &scope);
new_scope = TryCreateTransferScope(
kernel_type_for_var, expected_kernel_key, &scope);
enable_cache_transfer_scope_ = true;
}
if (!new_scope) {
......@@ -2058,7 +2143,8 @@ Scope* OperatorWithKernel::PrepareData(
}
void OperatorWithKernel::ParseInputDataType(
const Variable* var, const std::string& name,
const Variable* var,
const std::string& name,
proto::VarType::Type* data_type) const {
if (var != nullptr) {
const Tensor* t = nullptr;
......@@ -2078,17 +2164,20 @@ void OperatorWithKernel::ParseInputDataType(
}
if (t != nullptr) {
PADDLE_ENFORCE_EQ(
t->IsInitialized(), true,
t->IsInitialized(),
true,
platform::errors::InvalidArgument("The %s Op's Input Variable `%s` "
"contains uninitialized Tensor.",
Type(), name));
Type(),
name));
*data_type = paddle::framework::TransToProtoVarType(t->dtype());
}
}
}
void OperatorWithKernel::ParseMultiInputDataType(
const std::vector<Variable*>& vars, const std::string& name,
const std::vector<Variable*>& vars,
const std::string& name,
proto::VarType::Type* data_type) const {
proto::VarType::Type default_data_type =
static_cast<proto::VarType::Type>(-1);
......@@ -2112,10 +2201,12 @@ void OperatorWithKernel::ParseMultiInputDataType(
}
if (t != nullptr) {
PADDLE_ENFORCE_EQ(
t->IsInitialized(), true,
t->IsInitialized(),
true,
platform::errors::InvalidArgument("The %s Op's Input Variable `%s` "
"contains uninitialized Tensor.",
Type(), name));
Type(),
name));
proto::VarType::Type tmp =
paddle::framework::TransToProtoVarType(t->dtype());
PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type,
......@@ -2125,7 +2216,9 @@ void OperatorWithKernel::ParseMultiInputDataType(
"consistent or reigster GetExpectedKernelType. The "
"current variable type is (%s), but the "
"previous variable type is (%s).",
Type(), name, DataTypeToString(tmp),
Type(),
name,
DataTypeToString(tmp),
DataTypeToString(*data_type)));
*data_type = tmp;
}
......@@ -2146,7 +2239,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType(
}
}
PADDLE_ENFORCE_NE(
data_type, dafault_data_type,
data_type,
dafault_data_type,
platform::errors::NotFound(
"DataType should be indicated by input Variable at %s.", Type()));
return data_type;
......@@ -2163,12 +2257,14 @@ proto::VarType::Type OperatorWithKernel::IndicateVarDataType(
ParseMultiInputDataType(ctx.MultiInputVar(name), name, &data_type);
}
PADDLE_ENFORCE_NE(
data_type, dafault_data_type,
data_type,
dafault_data_type,
platform::errors::InvalidArgument(
"The Input Variable(%s) of (%s) Operator used to determine kernel "
"data type is empty or not LoDTensor or SelectedRows or "
"LoDTensorArray.",
name, Type()));
name,
Type()));
return data_type;
}
......@@ -2200,11 +2296,14 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely(
t,
platform::errors::InvalidArgument(
"The Tensor of variable %s is nullptr when promote complex types."));
PADDLE_ENFORCE_EQ(t->IsInitialized(), true,
PADDLE_ENFORCE_EQ(t->IsInitialized(),
true,
platform::errors::InvalidArgument(
"The Tensor in the %s Op's Input Variable %s(%s) is "
"not initialized.",
Type(), name, ctx.InputName(name)));
Type(),
name,
ctx.InputName(name)));
return t;
}
......@@ -2216,7 +2315,8 @@ Tensor* OperatorWithKernel::GetTensorFormInputSafely(
* the kernel data type.
*/
proto::VarType::Type OperatorWithKernel::IndicateOrPromoteVarDataTypes(
const ExecutionContext& ctx, const std::string& name1,
const ExecutionContext& ctx,
const std::string& name1,
const std::string& name2) const {
// 1. Get tensor
auto* tensor_a = GetTensorFormInputSafely(ctx, name1);
......@@ -2238,10 +2338,11 @@ OpKernelType OperatorWithKernel::GetExpectedKernelType(
}
OpKernelType OperatorWithKernel::GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const std::string& var_name,
const Tensor& tensor,
const OpKernelType& expected_kernel_type) const {
return OpKernelType(expected_kernel_type.data_type_, tensor.place(),
tensor.layout());
return OpKernelType(
expected_kernel_type.data_type_, tensor.place(), tensor.layout());
}
phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs(
......@@ -2264,16 +2365,19 @@ phi::KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs(
}
Scope* OperatorWithKernel::PreparePhiData(
const Scope& scope, const phi::Kernel& pt_kernel,
const Scope& scope,
const phi::Kernel& pt_kernel,
const phi::KernelSignature& pt_kernel_signature,
RuntimeContext* ctx) const {
const auto& input_names = pt_kernel_signature.input_names;
auto input_defs = pt_kernel.args_def().input_defs();
PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
PADDLE_ENFORCE_EQ(input_names.size(),
input_defs.size(),
platform::errors::InvalidArgument(
"The size of inputs_args names (%d) must be equal to "
"the size of kernel input_defs (%d).",
input_names.size(), input_defs.size()));
input_names.size(),
input_defs.size()));
Scope* new_scope = nullptr;
auto& name_map = Inputs();
const std::unordered_set<std::string>* no_buffer_ins = nullptr;
......@@ -2362,7 +2466,8 @@ Scope* OperatorWithKernel::PreparePhiData(
}
void OperatorWithKernel::BuildPhiKernelContext(
const RuntimeContext& ctx, platform::DeviceContext* dev_ctx,
const RuntimeContext& ctx,
platform::DeviceContext* dev_ctx,
phi::KernelContext* pt_kernel_context) const {
pt_kernel_context->SetDeviceContext(dev_ctx);
......@@ -2374,23 +2479,29 @@ void OperatorWithKernel::BuildPhiKernelContext(
auto attr_defs = pt_kernel_->args_def().attribute_defs();
auto output_defs = pt_kernel_->args_def().output_defs();
PADDLE_ENFORCE_EQ(input_names.size(), input_defs.size(),
PADDLE_ENFORCE_EQ(input_names.size(),
input_defs.size(),
platform::errors::InvalidArgument(
"The size of inputs_args names (%d) must be equal to "
"the size of kernel input_defs (%d).",
input_names.size(), input_defs.size()));
input_names.size(),
input_defs.size()));
PADDLE_ENFORCE_EQ(output_names.size(), output_defs.size(),
PADDLE_ENFORCE_EQ(output_names.size(),
output_defs.size(),
platform::errors::InvalidArgument(
"The size of outputs_args names (%d) must be equal to "
"the size of kernel output_defs (%d).",
output_names.size(), output_defs.size()));
output_names.size(),
output_defs.size()));
PADDLE_ENFORCE_EQ(attr_names.size(), attr_defs.size(),
PADDLE_ENFORCE_EQ(attr_names.size(),
attr_defs.size(),
platform::errors::InvalidArgument(
"The size of attribute_args names (%d) must be equal "
"to the size of kernel attribute_defs (%d).",
attr_names.size(), attr_defs.size()));
attr_names.size(),
attr_defs.size()));
for (size_t i = 0; i < input_names.size(); ++i) {
auto it = ctx.inputs.find(input_names[i]);
......@@ -2572,7 +2683,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
break;
case phi::AttributeType::SCALARS: {
PADDLE_ENFORCE_NE(
attr_iter, Attrs().end(),
attr_iter,
Attrs().end(),
platform::errors::NotFound("(%s) is not found in AttributeMap when "
"buildind static KernelContext.",
attr_names[i]));
......@@ -2636,7 +2748,8 @@ void OperatorWithKernel::BuildPhiKernelContext(
} break;
default: {
PADDLE_ENFORCE_NE(
attr_iter, Attrs().end(),
attr_iter,
Attrs().end(),
platform::errors::NotFound("(%s) is not found in AttributeMap when "
"buildind static KernelContext.",
attr_names[i]));
......
cc_library(
allocator
SRCS allocator.cc
DEPS place stats)
DEPS place stats profiler)
cc_library(
cpu_allocator
SRCS cpu_allocator.cc
......@@ -21,7 +21,7 @@ cc_library(
cc_library(
naive_best_fit_allocator
SRCS naive_best_fit_allocator.cc
DEPS allocator buddy_allocator profiler)
DEPS allocator buddy_allocator)
cc_test(
naive_best_fit_allocator_test
SRCS naive_best_fit_allocator_test.cc
......
......@@ -32,7 +32,8 @@
#endif
PADDLE_DEFINE_EXPORTED_bool(
init_allocated_mem, false,
init_allocated_mem,
false,
"It is a mistake that the values of the memory allocated by "
"BuddyAllocator are always zeroed in some op's implementation. "
"To find this error in time, we use init_allocated_mem to indicate "
......@@ -77,7 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
std::call_once(init_flag, []() {
a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(), platform::CpuMaxChunkSize());
platform::CpuMinChunkSize(),
platform::CpuMaxChunkSize());
});
return a;
......@@ -95,7 +97,8 @@ void *Alloc<platform::CPUPlace>(const platform::CPUPlace &place, size_t size) {
}
template <>
void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
void Free<platform::CPUPlace>(const platform::CPUPlace &place,
void *p,
size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
......@@ -125,7 +128,8 @@ void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
return p;
}
template <>
void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p,
void Free<platform::IPUPlace>(const platform::IPUPlace &place,
void *p,
size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
......@@ -154,7 +158,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
}
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
ret,
XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], no enough memory", ret));
if (FLAGS_init_allocated_mem) {
......@@ -171,7 +176,8 @@ void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
}
template <>
void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
void Free<platform::XPUPlace>(const platform::XPUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
......@@ -234,11 +240,13 @@ class NPUBuddyAllocatorList {
BuddyAllocator *Get(int npu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), npu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetNPUDeviceId(devices_[pos]);
......@@ -246,7 +254,8 @@ class NPUBuddyAllocatorList {
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUAllocator(devices_[pos])),
platform::NPUMinChunkSize(),
platform::NPUMaxChunkSize(), EXTRA_PADDING_SIZE));
platform::NPUMaxChunkSize(),
EXTRA_PADDING_SIZE));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
......@@ -312,8 +321,10 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in NPU %d, avaliable %s, total %s, NpuMinChunkSize "
"%s, NpuMaxChunkSize %s, NPU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::NPUPlace>(place))));
......@@ -331,7 +342,8 @@ void *Alloc<platform::NPUPlace>(const platform::NPUPlace &place, size_t size) {
}
template <>
void Free<platform::NPUPlace>(const platform::NPUPlace &place, void *p,
void Free<platform::NPUPlace>(const platform::NPUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
......@@ -384,7 +396,8 @@ void *Alloc<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
template <>
void Free<platform::NPUPinnedPlace>(const platform::NPUPinnedPlace &place,
void *p, size_t size) {
void *p,
size_t size) {
#ifdef PADDLE_WITH_ASCEND_CL
GetNPUPinnedBuddyAllocator()->Free(p);
#else
......@@ -430,18 +443,21 @@ class GPUBuddyAllocatorList {
BuddyAllocator *Get(int gpu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), gpu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::GPUAllocator(devices_[pos])),
platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()));
platform::GpuMinChunkSize(),
platform::GpuMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "'FLAGS_fraction_of_gpu_memory_to_use' "
......@@ -493,8 +509,10 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in GPU %d, avaliable %s, total %s, GpuMinChunkSize "
"%s, GpuMaxChunkSize %s, GPU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::CUDAPlace>(place))));
......@@ -515,7 +533,8 @@ void *Alloc<platform::CUDAPlace>(const platform::CUDAPlace &place,
}
template <>
void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
void Free<platform::CUDAPlace>(const platform::CUDAPlace &place,
void *p,
size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetGPUBuddyAllocator(place.device)->Free(p);
......@@ -584,7 +603,8 @@ void *Alloc<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
template <>
void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
void *p, size_t size) {
void *p,
size_t size) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
GetCUDAPinnedBuddyAllocator()->Free(p);
#else
......@@ -630,18 +650,21 @@ class MLUBuddyAllocatorList {
BuddyAllocator *Get(int mlu_id) {
auto pos = std::distance(
devices_.begin(), std::find(devices_.begin(), devices_.end(), mlu_id));
PADDLE_ENFORCE_LT(pos, devices_.size(),
PADDLE_ENFORCE_LT(pos,
devices_.size(),
platform::errors::OutOfRange(
"The index exceeds the size of devices, the size of "
"devices is %d, the index is %d",
devices_.size(), pos));
devices_.size(),
pos));
std::call_once(*init_flags_[pos], [this, pos] {
platform::SetMLUDeviceId(devices_[pos]);
allocators_[pos].reset(new BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(
allocators_[pos].reset(
new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::MLUAllocator(devices_[pos])),
platform::MLUMinChunkSize(), platform::MLUMaxChunkSize()));
platform::MLUMinChunkSize(),
platform::MLUMaxChunkSize()));
VLOG(10) << "\n\nNOTE:\n"
<< "You can set GFlags environment variable "
<< "(mlu reuse gpu GFlags) "
......@@ -693,8 +716,10 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in MLU %d, avaliable %s, total %s, MLUMinChunkSize "
"%s, MLUMinChunkSize %s, MLU memory used: %s.",
string::HumanReadableSize(size), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(size),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(buddy_allocator->GetMinChunkSize()),
string::HumanReadableSize(buddy_allocator->GetMaxChunkSize()),
string::HumanReadableSize(Used<platform::MLUPlace>(place))));
......@@ -711,7 +736,8 @@ void *Alloc<platform::MLUPlace>(const platform::MLUPlace &place, size_t size) {
}
template <>
void Free<platform::MLUPlace>(const platform::MLUPlace &place, void *p,
void Free<platform::MLUPlace>(const platform::MLUPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_MLU
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
......@@ -759,10 +785,12 @@ class BuddyAllocatorList {
}
BuddyAllocator *Get(int dev_id) {
PADDLE_ENFORCE_NE(init_flags_.find(dev_id), init_flags_.end(),
PADDLE_ENFORCE_NE(init_flags_.find(dev_id),
init_flags_.end(),
platform::errors::OutOfRange(
"Cannot find %s %d, please check visible devices.",
device_type_, dev_id));
device_type_,
dev_id));
std::call_once(*init_flags_[dev_id], [this, dev_id] {
phi::DeviceManager::SetDevice(device_type_, dev_id);
......@@ -773,7 +801,8 @@ class BuddyAllocatorList {
new detail::CustomAllocator(device_type_, dev_id)),
phi::DeviceManager::GetMinChunkSize(place),
phi::DeviceManager::GetMaxChunkSize(place),
phi::DeviceManager::GetExtraPaddingSize(place), device_type_));
phi::DeviceManager::GetExtraPaddingSize(place),
device_type_));
});
return allocators_[dev_id].get();
......@@ -813,8 +842,11 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
PADDLE_THROW(platform::errors::ResourceExhausted(
"Cannot allocate %s in %s:%d, avaliable %s, total %s, used "
"%s. ",
string::HumanReadableSize(size), place.GetDeviceType(), place.device,
string::HumanReadableSize(avail), string::HumanReadableSize(total),
string::HumanReadableSize(size),
place.GetDeviceType(),
place.device,
string::HumanReadableSize(avail),
string::HumanReadableSize(total),
string::HumanReadableSize(total - avail)));
} else {
if (FLAGS_init_allocated_mem) {
......@@ -830,7 +862,8 @@ void *Alloc<platform::CustomPlace>(const platform::CustomPlace &place,
}
template <>
void Free<platform::CustomPlace>(const platform::CustomPlace &place, void *p,
void Free<platform::CustomPlace>(const platform::CustomPlace &place,
void *p,
size_t size) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
......@@ -922,8 +955,6 @@ namespace allocation {
phi::Allocation *NaiveBestFitAllocator::AllocateImpl(size_t size) {
void *ptr = paddle::platform::VisitPlace(place_, legacy::AllocVisitor(size));
auto *tmp_alloc = new Allocation(ptr, size, place_);
platform::MemEvenRecorder::Instance().PushMemRecord(
static_cast<void *>(tmp_alloc), place_, size);
return tmp_alloc;
}
......@@ -931,8 +962,6 @@ void NaiveBestFitAllocator::FreeImpl(phi::Allocation *allocation) {
paddle::platform::VisitPlace(
allocation->place(),
legacy::FreeVisitor(allocation->ptr(), allocation->size()));
platform::MemEvenRecorder::Instance().PopMemRecord(
static_cast<void *>(allocation), place_);
delete allocation;
}
......
......@@ -15,6 +15,7 @@
#include "paddle/fluid/memory/allocation/pinned_allocator.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
namespace paddle {
namespace memory {
namespace allocation {
......@@ -26,6 +27,10 @@ void CPUPinnedAllocator::FreeImpl(phi::Allocation *allocation) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaFreeHost(allocation->ptr()));
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -allocation->size());
platform::RecordMemEvent(allocation->ptr(),
allocation->place(),
allocation->size(),
platform::TracerMemEventType::ReservedFree);
delete allocation;
}
phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
......@@ -36,6 +41,10 @@ phi::Allocation *CPUPinnedAllocator::AllocateImpl(size_t size) {
PADDLE_ENFORCE_GPU_SUCCESS(cudaHostAlloc(&ptr, size, cudaHostAllocPortable));
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
platform::RecordMemEvent(ptr,
platform::CUDAPinnedPlace(),
size,
platform::TracerMemEventType::ReservedAllocate);
return new Allocation(ptr, size, platform::CUDAPinnedPlace());
}
} // namespace allocation
......
......@@ -16,6 +16,7 @@
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/memory/stats.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
namespace paddle {
namespace memory {
......@@ -30,14 +31,18 @@ class StatAllocator : public Allocator {
protected:
void FreeImpl(phi::Allocation* allocation) override {
if (platform::is_cpu_place(allocation->place())) {
HOST_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
if (platform::is_cpu_place(allocation->place()) ||
platform::is_cuda_pinned_place(allocation->place())) {
HOST_MEMORY_STAT_UPDATE(
Allocated, allocation->place().GetDeviceId(), -allocation->size());
} else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, allocation->place().GetDeviceId(),
-allocation->size());
DEVICE_MEMORY_STAT_UPDATE(
Allocated, allocation->place().GetDeviceId(), -allocation->size());
}
platform::RecordMemEvent(allocation->ptr(),
allocation->place(),
allocation->size(),
platform::TracerMemEventType::Free);
underlying_allocator_->Free(allocation);
}
......@@ -48,12 +53,16 @@ class StatAllocator : public Allocator {
const platform::Place& place = allocation->place();
if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
HOST_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(),
allocation->size());
HOST_MEMORY_STAT_UPDATE(
Allocated, place.GetDeviceId(), allocation->size());
} else {
DEVICE_MEMORY_STAT_UPDATE(Allocated, place.GetDeviceId(),
allocation->size());
DEVICE_MEMORY_STAT_UPDATE(
Allocated, place.GetDeviceId(), allocation->size());
}
platform::RecordMemEvent(allocation->ptr(),
allocation->place(),
allocation->size(),
platform::TracerMemEventType::Allocate);
return allocation.release();
}
......
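
On the memory side, the commit deletes the old MemEvenRecorder::Instance().PushMemRecord/PopMemRecord calls from NaiveBestFitAllocator (two file sections above) and standardizes on platform::RecordMemEvent. As the surrounding hunks show, StatAllocator reports Allocate/Free for memory handed to kernels, while the pinned and system allocators report ReservedAllocate/ReservedFree alongside their HOST_MEMORY_STAT_UPDATE(Reserved, ...) updates. A hedged sketch of the two layers follows; the function names and arguments are placeholders, and it compiles only inside a Paddle source tree.

#include "paddle/fluid/platform/profiler/mem_tracing.h"

// Sketch only; the real call sites are in stat_allocator.h,
// pinned_allocator.cc, and system_allocator.cc.
void OnKernelAlloc(void* ptr, const platform::Place& place, size_t size) {
  // StatAllocator layer: memory actually handed out to ops/kernels.
  platform::RecordMemEvent(
      ptr, place, size, platform::TracerMemEventType::Allocate);
}

void OnSystemReserve(void* ptr, size_t size) {
  // System-allocator layer: raw pages reserved from the OS or driver.
  platform::RecordMemEvent(ptr,
                           platform::CPUPlace(),
                           size,
                           platform::TracerMemEventType::ReservedAllocate);
}
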
......@@ -41,6 +41,7 @@ limitations under the License. */
#endif
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
DECLARE_bool(use_pinned_memory);
DECLARE_double(fraction_of_gpu_memory_to_use);
......@@ -64,11 +65,13 @@ void* AlignedMalloc(size_t size) {
#else
int error = posix_memalign(&p, alignment, size);
PADDLE_ENFORCE_EQ(
error, 0,
error,
0,
platform::errors::ResourceExhausted(
"Fail to alloc memory of %ld size, error code is %d.", size, error));
#endif
PADDLE_ENFORCE_NOT_NULL(p, platform::errors::ResourceExhausted(
PADDLE_ENFORCE_NOT_NULL(p,
platform::errors::ResourceExhausted(
"Fail to alloc memory of %ld size.", size));
return p;
}
......@@ -95,7 +98,8 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
}
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
return p;
}
......@@ -114,6 +118,8 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
}
bool CPUAllocator::UseGpu() const { return false; }
......@@ -146,7 +152,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
......@@ -161,21 +168,29 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) {
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
gpu_id_, string::HumanReadableSize(size), gpu_id_,
string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg));
gpu_id_,
string::HumanReadableSize(size),
gpu_id_,
string::HumanReadableSize(allocated),
string::HumanReadableSize(avail),
gpu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void GPUAllocator::Free(void* p, size_t size, size_t index) {
PADDLE_ENFORCE_EQ(index, 0,
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(gpu_alloc_size_, size,
PADDLE_ENFORCE_GE(gpu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, gpu_alloc_size_));
size,
gpu_alloc_size_));
gpu_alloc_size_ -= size;
platform::RecordedGpuFree(p, size, gpu_id_);
......@@ -213,6 +228,8 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
*index = 1; // PINNED memory
cuda_pinnd_alloc_size_ += size;
HOST_MEMORY_STAT_UPDATE(Reserved, 0, size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedAllocate);
return p;
} else {
LOG(WARNING) << "cudaHostAlloc failed.";
......@@ -224,21 +241,25 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
gpuError_t err;
PADDLE_ENFORCE_EQ(index, 1,
PADDLE_ENFORCE_EQ(index,
1,
platform::errors::InvalidArgument(
"The index should be 1, but got %d", index));
PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_, size,
PADDLE_ENFORCE_GE(cuda_pinnd_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated cuda pinned memory (%d)",
size, cuda_pinnd_alloc_size_));
size,
cuda_pinnd_alloc_size_));
cuda_pinnd_alloc_size_ -= size;
#ifdef PADDLE_WITH_HIP
err = hipHostFree(p);
if (err != hipErrorDeinitialized) {
PADDLE_ENFORCE_EQ(
err, hipSuccess,
err,
hipSuccess,
platform::errors::Fatal(
"hipFreeHost failed in GPUPinnedAllocator, error code is %d", err));
}
......@@ -252,13 +273,16 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
// cudaFreeHost succeeds.
if (err != cudaErrorCudartUnloading) {
PADDLE_ENFORCE_EQ(
err, 0,
err,
0,
platform::errors::Fatal(
"cudaFreeHost failed in GPUPinnedAllocator, error code is %d",
err));
}
#endif
HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size);
platform::RecordMemEvent(
p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree);
}
bool CUDAPinnedAllocator::UseGpu() const { return false; }
......@@ -289,7 +313,8 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum GPU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
......@@ -304,22 +329,29 @@ void* NPUAllocator::Alloc(size_t* index, size_t size) {
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
npu_id_, string::HumanReadableSize(size), npu_id_,
string::HumanReadableSize(avail), npu_id_,
FLAGS_fraction_of_gpu_memory_to_use, err_msg));
npu_id_,
string::HumanReadableSize(size),
npu_id_,
string::HumanReadableSize(avail),
npu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void NPUAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0,
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(npu_alloc_size_, size,
PADDLE_ENFORCE_GE(npu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, npu_alloc_size_));
size,
npu_alloc_size_));
npu_alloc_size_ -= size;
platform::RecordedNPUFree(p, size, npu_id_);
......@@ -358,21 +390,25 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
void NPUPinnedAllocator::Free(void* p, size_t size, size_t index) {
aclError err;
PADDLE_ENFORCE_EQ(index, 1,
PADDLE_ENFORCE_EQ(index,
1,
platform::errors::InvalidArgument(
"The index should be 1, but got %d", index));
PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_, size,
PADDLE_ENFORCE_GE(npu_pinnd_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated npu pinned memory (%d)",
size, npu_pinnd_alloc_size_));
size,
npu_pinnd_alloc_size_));
npu_pinnd_alloc_size_ -= size;
err = platform::NPUHostFree(p);
if (err != ACL_ERROR_NONE) {
PADDLE_ENFORCE_EQ(
err, 0,
err,
0,
platform::errors::Fatal(
"NPUHostFree failed in NPUPinnedAllocator, error code is %d", err));
}
......@@ -407,7 +443,8 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
"larger value. Currently `FLAGS_gpu_memory_limit_mb` is %d, so the "
"maximum MLU memory usage is limited to %d MB.\n"
" The command is `export FLAGS_gpu_memory_limit_mb=xxx`.",
limit_size, limit_size);
limit_size,
limit_size);
}
PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
......@@ -422,21 +459,29 @@ void* MLUAllocator::Alloc(size_t* index, size_t size) {
"please set it to a higher value but less than 1.0.\n"
" The command is "
"`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n",
mlu_id_, string::HumanReadableSize(size), mlu_id_,
string::HumanReadableSize(allocated), string::HumanReadableSize(avail),
mlu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg));
mlu_id_,
string::HumanReadableSize(size),
mlu_id_,
string::HumanReadableSize(allocated),
string::HumanReadableSize(avail),
mlu_id_,
FLAGS_fraction_of_gpu_memory_to_use,
err_msg));
}
}
void MLUAllocator::Free(void* p, size_t size, size_t index) {
PADDLE_ENFORCE_EQ(index, 0,
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(mlu_alloc_size_, size,
PADDLE_ENFORCE_GE(mlu_alloc_size_,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, mlu_alloc_size_));
size,
mlu_alloc_size_));
mlu_alloc_size_ -= size;
platform::RecordedMLUFree(p, size, mlu_id_);
......@@ -465,7 +510,9 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
"\n\nOut of memory error on %s %d. "
"total memory is %s, used memory is %s, "
"available memory is only %s.\n\n",
dev_type_, dev_id_, string::HumanReadableSize(total),
dev_type_,
dev_id_,
string::HumanReadableSize(total),
string::HumanReadableSize(total - avail),
string::HumanReadableSize(avail)));
}
......@@ -474,14 +521,17 @@ void* CustomAllocator::Alloc(size_t* index, size_t size) {
void CustomAllocator::Free(void* p, size_t size, size_t index) {
VLOG(4) << "CustomAllocator::Free " << p << " size " << size;
PADDLE_ENFORCE_EQ(index, 0,
PADDLE_ENFORCE_EQ(index,
0,
platform::errors::InvalidArgument(
"The index should be 0, index is %d", index));
PADDLE_ENFORCE_GE(plug_alloc_size, size,
PADDLE_ENFORCE_GE(plug_alloc_size,
size,
platform::errors::InvalidArgument(
"The size of memory (%d) to free exceeds the size of "
"allocated gpu memory (%d)",
size, plug_alloc_size));
size,
plug_alloc_size));
plug_alloc_size -= size;
auto place = platform::CustomPlace(dev_type_, dev_id_);
auto device = phi::DeviceManager::GetDeviceWithPlace(place);
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/common/place.h"
#ifdef PADDLE_WITH_XPU
......@@ -33,8 +33,12 @@ namespace memory {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
template <>
void Copy<platform::CPUPlace, platform::CustomPlace>(
platform::CPUPlace dst_place, void* dst, platform::CustomPlace src_place,
const void* src, size_t num, void* stream) {
platform::CPUPlace dst_place,
void* dst,
platform::CustomPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
auto src_type = platform::PlaceHelper::GetDeviceType(src_place);
......@@ -52,8 +56,12 @@ void Copy<platform::CPUPlace, platform::CustomPlace>(
template <>
void Copy<platform::CustomPlace, platform::CPUPlace>(
platform::CustomPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num, void* stream) {
platform::CustomPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
auto src_type = platform::PlaceHelper::GetDeviceType(src_place);
auto dst_type = platform::PlaceHelper::GetDeviceType(dst_place);
......@@ -70,8 +78,12 @@ void Copy<platform::CustomPlace, platform::CPUPlace>(
template <>
void Copy<platform::CustomPlace, platform::CustomPlace>(
platform::CustomPlace dst_place, void* dst, platform::CustomPlace src_place,
const void* src, size_t num, void* stream) {
platform::CustomPlace dst_place,
void* dst,
platform::CustomPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
auto src_type = platform::PlaceHelper::GetDeviceType(src_place);
......@@ -102,9 +114,11 @@ void Copy<platform::CustomPlace, platform::CustomPlace>(
#endif // PADDLE_WITH_CUSTOM_DEVICE
template <>
void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace,
void* dst,
platform::CPUPlace,
const void* src, size_t num) {
const void* src,
size_t num) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num;
std::memcpy(dst, src, num);
......@@ -115,7 +129,8 @@ template <>
void Copy<platform::IPUPlace, platform::CPUPlace>(platform::IPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
......@@ -123,7 +138,8 @@ template <>
void Copy<platform::CPUPlace, platform::IPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::IPUPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
......@@ -131,15 +147,18 @@ template <>
void Copy<platform::IPUPlace, platform::IPUPlace>(platform::IPUPlace dst_place,
void* dst,
platform::IPUPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
if (UNLIKELY(num == 0)) return;
std::memcpy(dst, src, num);
}
// NOTE: only for (CPUPlace and IPUPlace) -> (IPUPlace).
template <>
void Copy<phi::IPUPlace, phi::Place>(phi::IPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
void Copy<phi::IPUPlace, phi::Place>(phi::IPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
if (src_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_src;
......@@ -152,8 +171,10 @@ void Copy<phi::IPUPlace, phi::Place>(phi::IPUPlace dst_place, void* dst,
// NOTE: only for (IPUPlace) -> (CPUPlace and IPUPlace).
template <>
void Copy<phi::Place, phi::IPUPlace>(phi::Place dst_place, void* dst,
phi::IPUPlace src_place, const void* src,
void Copy<phi::Place, phi::IPUPlace>(phi::Place dst_place,
void* dst,
phi::IPUPlace src_place,
const void* src,
size_t num) {
if (dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst;
......@@ -170,7 +191,8 @@ template <>
void Copy<platform::XPUPlace, platform::CPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
if (num <= 0) {
VLOG(1) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
return;
......@@ -182,7 +204,8 @@ template <>
void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
if (num <= 0) {
VLOG(1) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
return;
......@@ -194,7 +217,8 @@ template <>
void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
if (num <= 0) {
VLOG(1) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
return;
......@@ -204,8 +228,10 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
// NOTE: only for (CPUPlace and XPUPlace) -> (XPUPlace).
template <>
void Copy<phi::XPUPlace, phi::Place>(phi::XPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
void Copy<phi::XPUPlace, phi::Place>(phi::XPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
if (src_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_src;
......@@ -218,8 +244,10 @@ void Copy<phi::XPUPlace, phi::Place>(phi::XPUPlace dst_place, void* dst,
// NOTE: only for (XPUPlace) -> (CPUPlace and XPUPlace).
template <>
void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place, void* dst,
phi::XPUPlace src_place, const void* src,
void Copy<phi::Place, phi::XPUPlace>(phi::Place dst_place,
void* dst,
phi::XPUPlace src_place,
const void* src,
size_t num) {
if (dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst;
......@@ -236,7 +264,8 @@ template <>
void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -248,7 +277,10 @@ void Copy<platform::NPUPlace, platform::CPUPlace>(platform::NPUPlace dst_place,
if (stream) {
platform::RecordEvent record_event(
"NpuMemcpyAsync:CPU->NPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE,
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_HOST_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
// On NPU, async operation after sync operation is ok, while sync operation
......@@ -267,7 +299,8 @@ template <>
void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -279,7 +312,10 @@ void Copy<platform::CPUPlace, platform::NPUPlace>(platform::CPUPlace dst_place,
if (stream) {
platform::RecordEvent record_event(
"NpuMemcpyAsync:NPU->CPU", platform::TracerEventType::UserDefined, 1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST,
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_HOST,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
......@@ -295,7 +331,8 @@ template <>
void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -307,7 +344,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::RecordEvent record_event("NpuMemcpyAsync(same_npu):NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool =
......@@ -329,7 +369,10 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
platform::RecordEvent record_event("NpuMemcpyPeerAsync:NPU->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE,
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool =
......@@ -346,8 +389,11 @@ void Copy<platform::NPUPlace, platform::NPUPlace>(platform::NPUPlace dst_place,
template <>
void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
platform::CPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place,
const void* src, size_t num) {
platform::CPUPlace dst_place,
void* dst,
platform::NPUPinnedPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
......@@ -356,8 +402,11 @@ void Copy<platform::CPUPlace, platform::NPUPinnedPlace>(
template <>
void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
platform::NPUPinnedPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num) {
platform::NPUPinnedPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
......@@ -366,8 +415,11 @@ void Copy<platform::NPUPinnedPlace, platform::CPUPlace>(
template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
platform::NPUPinnedPlace dst_place, void* dst,
platform::NPUPinnedPlace src_place, const void* src, size_t num) {
platform::NPUPinnedPlace dst_place,
void* dst,
platform::NPUPinnedPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
......@@ -376,8 +428,12 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPinnedPlace>(
template <>
void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
platform::NPUPinnedPlace dst_place, void* dst, platform::NPUPlace src_place,
const void* src, size_t num, void* stream) {
platform::NPUPinnedPlace dst_place,
void* dst,
platform::NPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(src_place.device);
......@@ -389,7 +445,10 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
platform::RecordEvent record_event("NpuMemcpyAsync:NPU->NPUPinned",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST,
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_DEVICE_TO_HOST,
reinterpret_cast<aclrtStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
......@@ -404,8 +463,12 @@ void Copy<platform::NPUPinnedPlace, platform::NPUPlace>(
template <>
void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
platform::NPUPlace dst_place, void* dst, platform::NPUPinnedPlace src_place,
const void* src, size_t num, void* stream) {
platform::NPUPlace dst_place,
void* dst,
platform::NPUPinnedPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetNPUDeviceId(dst_place.device);
......@@ -417,7 +480,10 @@ void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
platform::RecordEvent record_event("NpuMemcpyAsync:NPUPinned->NPU",
platform::TracerEventType::UserDefined,
1);
platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE,
platform::NPUMemcpyAsync(dst,
src,
num,
ACL_MEMCPY_HOST_TO_DEVICE,
reinterpret_cast<aclrtStream>(stream));
} else {
// On NPU, async operation after sync operation is ok, while sync operation
......@@ -435,9 +501,12 @@ void Copy<platform::NPUPlace, platform::NPUPinnedPlace>(
// NOTE: only for CPUPlace, NPUPlace and NPUPinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, aclrtStream stream) {
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src;
......@@ -504,52 +573,76 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (CPUPlace).
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, aclrtStream stream) {
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
phi::CPUPlace src_place, const void* src,
size_t num, aclrtStream stream) {
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPlace)
template <>
void Copy<phi::NPUPlace, phi::Place>(phi::NPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, aclrtStream stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place,
src, num, stream);
void Copy<phi::NPUPlace, phi::Place>(phi::NPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
dst,
src_place,
src,
num,
stream);
}
// NOTE: only for (NPUPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace)
template <>
void Copy<phi::Place, phi::NPUPlace>(phi::Place dst_place, void* dst,
phi::NPUPlace src_place, const void* src,
size_t num, aclrtStream stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src, num, stream);
void Copy<phi::Place, phi::NPUPlace>(phi::Place dst_place,
void* dst,
phi::NPUPlace src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(dst_place,
dst,
phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src,
num,
stream);
}
// NOTE: only for (CPUPlace, NPUPlace and NPUPinnedPlace) -> (NPUPinnedPlace)
template <>
void Copy<phi::NPUPinnedPlace, phi::Place>(phi::NPUPinnedPlace dst_place,
void* dst, phi::Place src_place,
const void* src, size_t num,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
aclrtStream stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (NPUPinnedPlace) -> (CPUPlace, NPUPlace and NPUPinnedPlace)
template <>
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place, void* dst,
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place,
void* dst,
phi::NPUPinnedPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
aclrtStream stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
......@@ -557,16 +650,20 @@ void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace) -> (NPUPinnedPlace)
template <>
void Copy<phi::NPUPinnedPlace, phi::Place>(phi::NPUPinnedPlace dst_place,
void* dst, phi::Place src_place,
const void* src, size_t num) {
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr);
}
// NOTE: only for (NPUPinnedPlace) -> (CPUPlace)
template <>
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place, void* dst,
void Copy<phi::Place, phi::NPUPinnedPlace>(phi::Place dst_place,
void* dst,
phi::NPUPinnedPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr);
}
#endif
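A second recurring shape in these hunks is the nullable stream argument: when the caller passes a stream, the copy is enqueued asynchronously on it; when the stream is null, the device context is synchronized first and a blocking copy runs, and each branch is scoped by a RecordEvent with a direction label. A toy sketch of that control flow (the Stream type and function names here are invented scaffolding, not the device API):

// Sketch of the stream-nullable copy pattern: async on a caller-supplied
// stream, otherwise wait and copy synchronously (toy scaffolding only).
#include <cstdio>

struct Stream { int id; };

void AsyncCopy(Stream* s) { std::printf("async on stream %d\n", s->id); }
void SyncCopy() { std::printf("blocking copy after device wait\n"); }

void Copy(void* stream) {
  if (stream) {
    // A RecordEvent("MemcpyAsync:src->dst") would scope this branch.
    AsyncCopy(static_cast<Stream*>(stream));
  } else {
    // A RecordEvent("MemcpySync:src->dst") would scope this branch.
    SyncCopy();
  }
}

int main() {
  Stream s{7};
  Copy(&s);      // async path
  Copy(nullptr); // sync path
}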
......@@ -608,8 +705,12 @@ inline void SyncCUDAStream() {
template <>
void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, void* stream) {
platform::CPUPlace dst_place,
void* dst,
platform::CUDAPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device);
......@@ -619,10 +720,16 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
platform::RecordEvent record_event(
"GpuMemcpyAsync:GPU->CPU", platform::TracerEventType::UserDefined, 1);
#ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost,
platform::GpuMemcpyAsync(dst,
src,
num,
hipMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream));
#else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost,
platform::GpuMemcpyAsync(dst,
src,
num,
cudaMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream));
#endif
} else {
......@@ -642,8 +749,12 @@ void Copy<platform::CPUPlace, platform::CUDAPlace>(
template <>
void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
const void* src, size_t num, void* stream) {
platform::CUDAPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(dst_place.device);
......@@ -653,10 +764,16 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
platform::RecordEvent record_event(
"GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1);
#ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice,
platform::GpuMemcpyAsync(dst,
src,
num,
hipMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream));
#else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice,
platform::GpuMemcpyAsync(dst,
src,
num,
cudaMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream));
#endif
} else {
......@@ -676,8 +793,12 @@ void Copy<platform::CUDAPlace, platform::CPUPlace>(
template <>
void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
const void* src, size_t num, void* stream) {
platform::CUDAPlace dst_place,
void* dst,
platform::CUDAPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
......@@ -689,10 +810,16 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::TracerEventType::UserDefined,
1);
#ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToDevice,
platform::GpuMemcpyAsync(dst,
src,
num,
hipMemcpyDeviceToDevice,
reinterpret_cast<gpuStream_t>(stream));
#else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice,
platform::GpuMemcpyAsync(dst,
src,
num,
cudaMemcpyDeviceToDevice,
reinterpret_cast<gpuStream_t>(stream));
#endif
} else {
......@@ -710,22 +837,29 @@ void Copy<platform::CUDAPlace, platform::CUDAPlace>(
platform::RecordEvent record_event("GpuMemcpyPeerAsync:GPU->GPU",
platform::TracerEventType::UserDefined,
1);
platform::GpuMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, reinterpret_cast<gpuStream_t>(stream));
platform::GpuMemcpyPeerAsync(dst,
dst_place.device,
src,
src_place.device,
num,
reinterpret_cast<gpuStream_t>(stream));
} else {
platform::RecordEvent record_event("GpuMemcpyPeerSync:GPU->GPU",
platform::TracerEventType::UserDefined,
1);
platform::GpuMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num);
platform::GpuMemcpyPeerSync(
dst, dst_place.device, src, src_place.device, num);
}
}
}
template <>
void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
platform::CPUPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
platform::CPUPlace dst_place,
void* dst,
platform::CUDAPinnedPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
......@@ -734,8 +868,11 @@ void Copy<platform::CPUPlace, platform::CUDAPinnedPlace>(
template <>
void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CPUPlace src_place, const void* src, size_t num) {
platform::CUDAPinnedPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
......@@ -744,8 +881,11 @@ void Copy<platform::CUDAPinnedPlace, platform::CPUPlace>(
template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num) {
platform::CUDAPinnedPlace dst_place,
void* dst,
platform::CUDAPinnedPlace src_place,
const void* src,
size_t num) {
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
<< dst_place;
if (UNLIKELY(num == 0)) return;
......@@ -754,8 +894,12 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>(
template <>
void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
platform::CUDAPinnedPlace dst_place, void* dst,
platform::CUDAPlace src_place, const void* src, size_t num, void* stream) {
platform::CUDAPinnedPlace dst_place,
void* dst,
platform::CUDAPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
platform::SetDeviceId(src_place.device);
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
......@@ -765,10 +909,16 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
platform::TracerEventType::UserDefined,
1);
#ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyDeviceToHost,
platform::GpuMemcpyAsync(dst,
src,
num,
hipMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream));
#else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost,
platform::GpuMemcpyAsync(dst,
src,
num,
cudaMemcpyDeviceToHost,
reinterpret_cast<gpuStream_t>(stream));
#endif
} else {
......@@ -785,8 +935,11 @@ void Copy<platform::CUDAPinnedPlace, platform::CUDAPlace>(
template <>
void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
platform::CUDAPlace dst_place, void* dst,
platform::CUDAPinnedPlace src_place, const void* src, size_t num,
platform::CUDAPlace dst_place,
void* dst,
platform::CUDAPinnedPlace src_place,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -798,10 +951,16 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
platform::TracerEventType::UserDefined,
1);
#ifdef PADDLE_WITH_HIP
platform::GpuMemcpyAsync(dst, src, num, hipMemcpyHostToDevice,
platform::GpuMemcpyAsync(dst,
src,
num,
hipMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream));
#else
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice,
platform::GpuMemcpyAsync(dst,
src,
num,
cudaMemcpyHostToDevice,
reinterpret_cast<gpuStream_t>(stream));
#endif
} else {
......@@ -818,9 +977,12 @@ void Copy<platform::CUDAPlace, platform::CUDAPinnedPlace>(
// NOTE: only for CPUPlace, CUDAPlace and CUDAPinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src;
......@@ -887,52 +1049,76 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CPUPlace).
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
phi::CPUPlace src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPlace)
template <>
void Copy<phi::GPUPlace, phi::Place>(phi::GPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place,
src, num, stream);
void Copy<phi::GPUPlace, phi::Place>(phi::GPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
dst,
src_place,
src,
num,
stream);
}
// NOTE: only for (CUDAPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace)
template <>
void Copy<phi::Place, phi::GPUPlace>(phi::Place dst_place, void* dst,
phi::GPUPlace src_place, const void* src,
size_t num, void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src, num, stream);
void Copy<phi::Place, phi::GPUPlace>(phi::Place dst_place,
void* dst,
phi::GPUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place,
dst,
phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src,
num,
stream);
}
// NOTE: only for (CPUPlace, CUDAPlace and CUDAPinnedPlace) -> (CUDAPinnedPlace)
template <>
void Copy<phi::GPUPinnedPlace, phi::Place>(phi::GPUPinnedPlace dst_place,
void* dst, phi::Place src_place,
const void* src, size_t num,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CUDAPinnedPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace)
template <>
void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place, void* dst,
void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place,
void* dst,
phi::GPUPinnedPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
......@@ -940,16 +1126,20 @@ void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace) -> (CUDAPinnedPlace)
template <>
void Copy<phi::GPUPinnedPlace, phi::Place>(phi::GPUPinnedPlace dst_place,
void* dst, phi::Place src_place,
const void* src, size_t num) {
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, nullptr);
}
// NOTE: only for (CUDAPinnedPlace) -> (CPUPlace)
template <>
void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place, void* dst,
void Copy<phi::Place, phi::GPUPinnedPlace>(phi::Place dst_place,
void* dst,
phi::GPUPinnedPlace src_place,
const void* src, size_t num) {
const void* src,
size_t num) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, nullptr);
}
#endif
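After the pair-wise specializations come the type-erased overloads reformatted above: Copy<phi::Place, phi::Place> inspects AllocationType at runtime and forwards to the statically typed routine, and the convenience overloads rebuild a phi::Place from the typed argument before delegating. A compilable sketch of that dispatch follows; the Place struct and AllocationType enum are stand-ins, not phi's definitions.

// Sketch of runtime dispatch over type-erased places (stand-in types).
#include <cstddef>
#include <cstdio>
#include <cstring>
#include <stdexcept>

enum class AllocationType { CPU, GPU };

struct Place {
  AllocationType type;
  int device = 0;
  AllocationType GetType() const { return type; }
};

void CopyCpuToCpu(void* dst, const void* src, std::size_t n) {
  std::memcpy(dst, src, n);
}

// The erased overload branches once on both place types, then forwards
// to the statically typed routine.
void Copy(Place dst_place, void* dst, Place src_place, const void* src,
          std::size_t n) {
  if (dst_place.GetType() == AllocationType::CPU &&
      src_place.GetType() == AllocationType::CPU) {
    CopyCpuToCpu(dst, src, n);
  } else {
    throw std::runtime_error("place pair not wired up in this sketch");
  }
}

int main() {
  char src[4] = "abc";
  char dst[4] = {};
  Copy(Place{AllocationType::CPU}, dst, Place{AllocationType::CPU}, src, 4);
  std::printf("%s\n", dst);
}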
......@@ -959,7 +1149,8 @@ template <>
void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::MLUPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -970,8 +1161,8 @@ void Copy<platform::CPUPlace, platform::MLUPlace>(platform::CPUPlace dst_place,
platform::RecordEvent record_event("MLUMemcpyD2HAsync:MLU->CPU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2HAsync(dst, src, num,
reinterpret_cast<mluStream>(stream));
platform::MLUMemcpyD2HAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
......@@ -988,7 +1179,8 @@ template <>
void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -999,8 +1191,8 @@ void Copy<platform::MLUPlace, platform::CPUPlace>(platform::MLUPlace dst_place,
platform::RecordEvent record_event("MLUMemcpyH2DAsync:CPU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyH2DAsync(dst, src, num,
reinterpret_cast<mluStream>(stream));
platform::MLUMemcpyH2DAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
static_cast<platform::MLUDeviceContext*>(pool.Get(src_place))->Wait();
......@@ -1017,7 +1209,8 @@ template <>
void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
void* dst,
platform::MLUPlace src_place,
const void* src, size_t num,
const void* src,
size_t num,
void* stream) {
if (UNLIKELY(num == 0)) return;
......@@ -1029,8 +1222,8 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
platform::RecordEvent record_event("MLUMemcpyD2DAsync(same_mlu):MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyD2DAsync(dst, src, num,
reinterpret_cast<mluStream>(stream));
platform::MLUMemcpyD2DAsync(
dst, src, num, reinterpret_cast<mluStream>(stream));
} else {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
......@@ -1050,25 +1243,32 @@ void Copy<platform::MLUPlace, platform::MLUPlace>(platform::MLUPlace dst_place,
platform::RecordEvent record_event("MLUMemcpyPeerAsync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerAsync(dst, dst_place.device, src, src_place.device,
num, reinterpret_cast<mluStream>(stream));
platform::MLUMemcpyPeerAsync(dst,
dst_place.device,
src,
src_place.device,
num,
reinterpret_cast<mluStream>(stream));
} else {
VLOG(4) << "Sync memory::Copy " << num << " Bytes from " << src_place
<< " to " << dst_place;
platform::RecordEvent record_event("MLUMemcpyPeerSync:MLU->MLU",
platform::TracerEventType::UserDefined,
1);
platform::MLUMemcpyPeerSync(dst, dst_place.device, src, src_place.device,
num);
platform::MLUMemcpyPeerSync(
dst, dst_place.device, src, src_place.device, num);
}
}
}
// NOTE: only for CPUPlace and MLUPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
if (src_place.GetType() == phi::AllocationType::CPU &&
dst_place.GetType() == phi::AllocationType::CPU) {
platform::CPUPlace place_dst, place_src;
......@@ -1110,35 +1310,55 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
// NOTE: only for (CPUPlace and MLUPlace) -> (MLUPlace)
template <>
void Copy<phi::MLUPlace, phi::Place>(phi::MLUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()), dst, src_place,
src, num, stream);
void Copy<phi::MLUPlace, phi::Place>(phi::MLUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType(), dst_place.GetDeviceId()),
dst,
src_place,
src,
num,
stream);
}
// NOTE: only for (MLUPlace) -> (CPUPlace and MLUPlace)
template <>
void Copy<phi::Place, phi::MLUPlace>(phi::Place dst_place, void* dst,
phi::MLUPlace src_place, const void* src,
size_t num, void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src, num, stream);
void Copy<phi::Place, phi::MLUPlace>(phi::Place dst_place,
void* dst,
phi::MLUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place,
dst,
phi::Place(src_place.GetType(), src_place.GetDeviceId()),
src,
num,
stream);
}
// NOTE: only for (MLUPlace) -> (CPUPlace) with mluStream.
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (MLUPlace) with mluStream.
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
phi::CPUPlace src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
......@@ -1146,8 +1366,10 @@ void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
// NOTE: Only for CPUPlace, XPUPlace and PinnedPlace.
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
phi::Place src_place, const void* src,
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
if (UNLIKELY(num == 0)) return;
VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to "
......@@ -1224,16 +1446,20 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
// NOTE: Only for (CPUPlace) -> (CPUPlace and PinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
phi::CPUPlace src_place, const void* src,
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num);
}
// NOTE: Only for (CPUPlace and PinnedPlace) -> (CPUPlace).
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num);
}
......@@ -1243,9 +1469,12 @@ void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
!defined(PADDLE_WITH_MLU)
template <>
void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::Place, phi::Place>(phi::Place dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
if (src_place.GetType() == phi::AllocationType::CPU && // NOLINT
dst_place.GetType() == phi::AllocationType::CUSTOM) {
platform::CPUPlace place_src;
......@@ -1265,17 +1494,23 @@ void Copy<phi::Place, phi::Place>(phi::Place dst_place, void* dst,
}
template <>
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place, void* dst,
phi::Place src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::CPUPlace, phi::Place>(phi::CPUPlace dst_place,
void* dst,
phi::Place src_place,
const void* src,
size_t num,
void* stream) {
Copy(phi::Place(dst_place.GetType()), dst, src_place, src, num, stream);
}
// NOTE: only for (CPUPlace) -> (CPUPlace, CUDAPlace and CUDAPinnedPlace).
template <>
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place, void* dst,
phi::CPUPlace src_place, const void* src,
size_t num, void* stream) {
void Copy<phi::Place, phi::CPUPlace>(phi::Place dst_place,
void* dst,
phi::CPUPlace src_place,
const void* src,
size_t num,
void* stream) {
Copy(dst_place, dst, phi::Place(src_place.GetType()), src, num, stream);
}
#endif
......
......@@ -354,7 +354,9 @@ if(WITH_GPU)
enforce
dynload_cuda
new_profiler
stats)
stats
op_proto_maker
shape_inference)
nv_library(
device_memory_aligment
SRCS device_memory_aligment.cc
......@@ -363,7 +365,14 @@ elseif(WITH_ROCM)
hip_library(
profiler
SRCS profiler.cc profiler.cu
DEPS os_info device_tracer gpu_info enforce new_profiler stats)
DEPS os_info
device_tracer
gpu_info
enforce
new_profiler
stats
op_proto_maker
shape_inference)
hip_library(
device_memory_aligment
SRCS device_memory_aligment.cc
......@@ -372,7 +381,13 @@ else()
cc_library(
profiler
SRCS profiler.cc
DEPS os_info device_tracer enforce new_profiler stats)
DEPS os_info
device_tracer
enforce
new_profiler
stats
op_proto_maker
shape_inference)
cc_library(
device_memory_aligment
SRCS device_memory_aligment.cc
......
......@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/platform/monitor.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
#include "paddle/fluid/string/split.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
......@@ -51,10 +52,12 @@ DECLARE_uint64(reallocate_gpu_memory_in_mb);
DECLARE_bool(enable_cublas_tensor_op_math);
DECLARE_uint64(gpu_memory_limit_mb);
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log, false,
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log,
false,
"Whether to print the message of gpu memory usage "
"at exit, mainly used for UT and CI.");
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb, true,
PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb,
true,
"Whether to print the message of gpu memory usage "
"MB as a unit of measurement.");
......@@ -66,7 +69,10 @@ namespace platform {
void GpuMemoryUsage(size_t *available, size_t *total) {
size_t actual_available, actual_total;
RecordedGpuMemGetInfo(available, total, &actual_available, &actual_total,
RecordedGpuMemGetInfo(available,
total,
&actual_available,
&actual_total,
platform::GetCurrentDeviceId());
}
......@@ -94,7 +100,8 @@ size_t GpuMaxAllocSize() {
static size_t GpuAllocSize(bool realloc) {
size_t available_to_alloc = GpuAvailableMemToAlloc();
PADDLE_ENFORCE_GT(
available_to_alloc, 0,
available_to_alloc,
0,
platform::errors::ResourceExhausted("Not enough available GPU memory."));
// If FLAGS_initial_gpu_memory_in_mb is 0, then initial memory will be
// allocated by fraction
......@@ -105,7 +112,8 @@ static size_t GpuAllocSize(bool realloc) {
? flag_mb << 20
: available_to_alloc * FLAGS_fraction_of_gpu_memory_to_use);
PADDLE_ENFORCE_GE(
available_to_alloc, alloc_bytes,
available_to_alloc,
alloc_bytes,
platform::errors::ResourceExhausted("Not enough available GPU memory."));
VLOG(10) << "Alloc size is " << (alloc_bytes >> 20)
<< " MiB, is it Re-alloc: " << realloc;
......@@ -192,13 +200,16 @@ class RecordedGpuMallocHelper {
});
PADDLE_ENFORCE_GE(
dev_id, 0,
dev_id,
0,
platform::errors::OutOfRange(
"Device id must be not less than 0, but got %d.", dev_id));
PADDLE_ENFORCE_LT(
dev_id, instances_.size(),
dev_id,
instances_.size(),
platform::errors::OutOfRange("Device id %d exceeds gpu card number %d.",
dev_id, instances_.size()));
dev_id,
instances_.size()));
return instances_[dev_id].get();
}
......@@ -207,7 +218,8 @@ class RecordedGpuMallocHelper {
* or cudaSuccess would be returned, and the cudaGetLastError() flag
   * would be cleared.
*/
gpuError_t Malloc(void **ptr, size_t size,
gpuError_t Malloc(void **ptr,
size_t size,
bool malloc_managed_memory = false) {
LockGuardPtr<std::mutex> lock(mtx_);
if (UNLIKELY(NeedRecord() && cur_size_.load() + size > limit_size_)) {
......@@ -236,7 +248,10 @@ class RecordedGpuMallocHelper {
cur_size_.fetch_add(size);
STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, size);
platform::RecordMemEvent(ptr,
GPUPlace(dev_id_),
size,
platform::TracerMemEventType::ReservedAllocate);
#ifdef PADDLE_WITH_TESTING
gpu_ptrs.insert(*ptr);
#endif
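This hunk is where the new bookkeeping lands: right after a successful raw allocation the helper bumps the reserved-size stat and emits a RecordMemEvent with TracerMemEventType::ReservedAllocate, and the Free path in the next hunk mirrors it with ReservedFree. The sketch below shows the current/peak accounting such a recorder needs; it is a simplified stand-in, not the DEVICE_MEMORY_STAT implementation.

// Simplified current/peak accounting for reserved device memory
// (stand-in for the DEVICE_MEMORY_STAT_* macros; not Paddle code).
#include <atomic>
#include <cstddef>
#include <cstdio>

std::atomic<long long> cur_reserved{0};
std::atomic<long long> peak_reserved{0};

void OnReservedAllocate(std::size_t size) {
  long long now = cur_reserved.fetch_add(static_cast<long long>(size)) +
                  static_cast<long long>(size);
  long long peak = peak_reserved.load();
  // Lock-free peak update: retry while another thread raised the peak.
  while (now > peak && !peak_reserved.compare_exchange_weak(peak, now)) {
  }
}

void OnReservedFree(std::size_t size) {
  cur_reserved.fetch_sub(static_cast<long long>(size));
}

int main() {
  OnReservedAllocate(1 << 20);
  OnReservedAllocate(2 << 20);
  OnReservedFree(1 << 20);
  std::printf("cur=%lld peak=%lld\n", cur_reserved.load(),
              peak_reserved.load());  // cur=2097152 peak=3145728
}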
......@@ -275,6 +290,10 @@ class RecordedGpuMallocHelper {
cur_size_.fetch_sub(size);
STAT_INT_SUB("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size);
DEVICE_MEMORY_STAT_UPDATE(Reserved, dev_id_, -size);
platform::RecordMemEvent(ptr,
GPUPlace(dev_id_),
size,
platform::TracerMemEventType::ReservedFree);
} else {
platform::GpuGetLastError(); // clear the error flag when
// cudaErrorCudartUnloading /
......@@ -300,7 +319,9 @@ class RecordedGpuMallocHelper {
#endif
}
bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail,
bool GetMemInfo(size_t *avail,
size_t *total,
size_t *actual_avail,
size_t *actual_total) {
{
CUDADeviceGuard guard(dev_id_);
......@@ -335,7 +356,8 @@ class RecordedGpuMallocHelper {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10020
CUresult MemCreate(CUmemGenericAllocationHandle *handle, size_t size,
CUresult MemCreate(CUmemGenericAllocationHandle *handle,
size_t size,
const CUmemAllocationProp *prop,
unsigned long long flags) { // NOLINT
auto result =
......@@ -371,7 +393,9 @@ class RecordedGpuMallocHelper {
std::once_flag RecordedGpuMallocHelper::once_flag_;
gpuError_t RecordedGpuMalloc(void **ptr, size_t size, int dev_id,
gpuError_t RecordedGpuMalloc(void **ptr,
size_t size,
int dev_id,
bool malloc_managed_memory) {
return RecordedGpuMallocHelper::Instance(dev_id)->Malloc(
ptr, size, malloc_managed_memory);
......@@ -383,22 +407,28 @@ void RecordedGpuFree(void *p, size_t size, int dev_id) {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10020
CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle, size_t size,
CUresult RecordedGpuMemCreate(CUmemGenericAllocationHandle *handle,
size_t size,
const CUmemAllocationProp *prop,
unsigned long long flags, int dev_id) { // NOLINT
return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(handle, size,
prop, flags);
unsigned long long flags,
int dev_id) { // NOLINT
return RecordedGpuMallocHelper::Instance(dev_id)->MemCreate(
handle, size, prop, flags);
}
CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle, size_t size,
CUresult RecordedGpuMemRelease(CUmemGenericAllocationHandle handle,
size_t size,
int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->MemRelease(handle, size);
}
#endif
#endif
bool RecordedGpuMemGetInfo(size_t *avail, size_t *total, size_t *actual_avail,
size_t *actual_total, int dev_id) {
bool RecordedGpuMemGetInfo(size_t *avail,
size_t *total,
size_t *actual_avail,
size_t *actual_total,
int dev_id) {
return RecordedGpuMallocHelper::Instance(dev_id)->GetMemInfo(
avail, total, actual_avail, actual_total);
}
......@@ -493,26 +523,35 @@ void GpuDestroyStream(gpuStream_t stream) {
void GpuDeviceSync() { phi::backends::gpu::GpuDeviceSync(); }
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
gpuMemcpyKind kind, gpuStream_t stream) {
void GpuMemcpyAsync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind,
gpuStream_t stream) {
phi::backends::gpu::GpuMemcpyAsync(dst, src, count, kind, stream);
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
void GpuMemcpySync(void *dst,
const void *src,
size_t count,
gpuMemcpyKind kind) {
phi::backends::gpu::GpuMemcpySync(dst, src, count, kind);
}
void GpuMemcpyPeerAsync(void *dst, int dst_device, const void *src,
int src_device, size_t count, gpuStream_t stream) {
phi::backends::gpu::GpuMemcpyPeerAsync(dst, dst_device, src, src_device,
count, stream);
void GpuMemcpyPeerAsync(void *dst,
int dst_device,
const void *src,
int src_device,
size_t count,
gpuStream_t stream) {
phi::backends::gpu::GpuMemcpyPeerAsync(
dst, dst_device, src, src_device, count, stream);
}
void GpuMemcpyPeerSync(void *dst, int dst_device, const void *src,
int src_device, size_t count) {
phi::backends::gpu::GpuMemcpyPeerSync(dst, dst_device, src, src_device,
count);
void GpuMemcpyPeerSync(
void *dst, int dst_device, const void *src, int src_device, size_t count) {
phi::backends::gpu::GpuMemcpyPeerSync(
dst, dst_device, src, src_device, count);
}
void GpuMemsetAsync(void *dst, int value, size_t count, gpuStream_t stream) {
......
......@@ -30,12 +30,16 @@ limitations under the License. */
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/nvtx.h"
#endif
#include "paddle/fluid/framework/op_proto_maker.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/os_info.h"
PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler, false,
PADDLE_DEFINE_EXPORTED_bool(enable_rpc_profiler,
false,
"Enable rpc profiler or not.");
DEFINE_bool(enable_host_event_recorder_hook, false,
DEFINE_bool(enable_host_event_recorder_hook,
false,
"enable HostEventRecorder, hook Profiler");
namespace paddle {
......@@ -43,8 +47,11 @@ namespace platform {
MemEvenRecorder MemEvenRecorder::recorder;
Event::Event(EventType type, std::string name, uint32_t thread_id,
EventRole role, std::string attr)
Event::Event(EventType type,
std::string name,
uint32_t thread_id,
EventRole role,
std::string attr)
: type_(type),
name_(name),
thread_id_(thread_id),
......@@ -68,8 +75,10 @@ double Event::CudaElapsedMs(const Event &e) const {
#endif
}
RecordEvent::RecordEvent(const char *name, const TracerEventType type,
uint32_t level, const EventRole role) {
RecordEvent::RecordEvent(const char *name,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook) {
......@@ -100,8 +109,10 @@ RecordEvent::RecordEvent(const char *name, const TracerEventType type,
start_ns_ = PosixInNsec();
}
RecordEvent::RecordEvent(const std::string &name, const TracerEventType type,
uint32_t level, const EventRole role) {
RecordEvent::RecordEvent(const std::string &name,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
if (g_enable_nvprof_hook) {
......@@ -130,8 +141,10 @@ RecordEvent::RecordEvent(const std::string &name, const TracerEventType type,
start_ns_ = PosixInNsec();
}
RecordEvent::RecordEvent(const std::string &name, const std::string &attr,
const TracerEventType type, uint32_t level,
RecordEvent::RecordEvent(const std::string &name,
const std::string &attr,
const TracerEventType type,
uint32_t level,
const EventRole role) {
#ifndef _WIN32
#ifdef PADDLE_WITH_CUDA
......@@ -215,8 +228,8 @@ void RecordEvent::End() {
DeviceTracer *tracer = GetDeviceTracer();
if (tracer) {
uint64_t end_ns = PosixInNsec();
tracer->AddCPURecords(CurAnnotationName(), start_ns_, end_ns, BlockDepth(),
g_thread_id);
tracer->AddCPURecords(
CurAnnotationName(), start_ns_, end_ns, BlockDepth(), g_thread_id);
}
ClearCurAnnotation();
PopEvent(*name_, role_);
......@@ -226,7 +239,8 @@ void RecordEvent::End() {
is_enabled_ = false;
}
RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type,
RecordInstantEvent::RecordInstantEvent(const char *name,
TracerEventType type,
uint32_t level) {
if (UNLIKELY(HostTraceLevel::GetInstance().NeedTrace(level) == false)) {
return;
......@@ -236,21 +250,242 @@ RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type,
name, start_end_ns, start_end_ns, EventRole::kOrdinary, type);
}
void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place,
RecordOpInfoSupplement::RecordOpInfoSupplement(
const std::string &type,
const framework::AttributeMap &attrs,
const framework::InferShapeContext &shape_ctx,
const framework::RuntimeContext &ctx) {
if (FLAGS_enable_host_event_recorder_hook == false) {
return;
}
std::map<std::string, std::vector<framework::DDim>> input_shapes;
std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
for (auto it = ctx.inputs.begin(); it != ctx.inputs.end(); it++) {
input_shapes[it->first] = shape_ctx.GetInputsDim(it->first);
dtypes[it->first] = shape_ctx.GetInputsVarType(it->first);
}
const std::vector<std::string> *callstack_ptr = nullptr;
std::vector<std::string> callstack;
auto iter = attrs.find(
framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName());
if (iter != attrs.end()) {
callstack_ptr = &BOOST_GET_CONST(std::vector<std::string>, iter->second);
callstack = *callstack_ptr;
}
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance().RecordEvent(
PosixInNsec(), type, input_shapes, dtypes, callstack);
}
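RecordOpInfoSupplement, added above, snapshots each input slot's dimensions and variable types from the InferShape context, plus the Python creation callstack when the op attributes carry one, and hands the bundle to the host event recorder. The sketch below shows the slot-name-to-shapes map being assembled; std::vector<int64_t> stands in for framework::DDim, and the slot names and shapes are invented.

// Sketch of the per-slot shape snapshot (std::vector<int64_t> stands in
// for framework::DDim; slot names and shapes here are invented).
#include <cstdint>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

using DDim = std::vector<int64_t>;

int main() {
  std::map<std::string, std::vector<DDim>> input_shapes;
  // One entry per input slot; a slot may hold several tensors.
  input_shapes["X"] = {{32, 128}, {32, 128}};
  input_shapes["Y"] = {{128, 256}};
  for (const auto& kv : input_shapes) {
    std::printf("slot %s holds %zu tensor(s)\n", kv.first.c_str(),
                kv.second.size());
  }
}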
RecordMemEvent::RecordMemEvent(const void *ptr,
const phi::Place &place,
size_t size,
const TracerMemEventType type) {
if (g_state == ProfilerState::kDisabled &&
FLAGS_enable_host_event_recorder_hook == false) {
return;
}
if (type == TracerMemEventType::Allocate) {
uint64_t current_allocated;
uint64_t peak_allocated;
uint64_t current_reserved = 0; // 0 means keep the same as before
uint64_t peak_reserved = 0; // 0 means keep the same as before
if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
current_allocated =
HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
peak_allocated =
HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
} else {
current_allocated =
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
peak_allocated =
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
}
platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
place,
size,
type,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
} else if (type == TracerMemEventType::ReservedAllocate) {
uint64_t current_reserved;
uint64_t peak_reserved;
uint64_t current_allocated = 0; // 0 means keep the same as before
uint64_t peak_allocated = 0; // 0 means keep the same as before
if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
current_reserved =
HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
peak_reserved =
HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
} else {
current_reserved =
DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
peak_reserved =
DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
}
platform::MemEvenRecorder::Instance().PushMemRecord(ptr,
place,
size,
type,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
} else if (type == TracerMemEventType::Free) {
uint64_t current_allocated;
uint64_t peak_allocated;
uint64_t current_reserved = 0; // 0 means keep the same as before
uint64_t peak_reserved = 0; // 0 means keep the same as before
if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
current_allocated =
HOST_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
peak_allocated =
HOST_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
} else {
current_allocated =
DEVICE_MEMORY_STAT_CURRENT_VALUE(Allocated, place.GetDeviceId());
peak_allocated =
DEVICE_MEMORY_STAT_PEAK_VALUE(Allocated, place.GetDeviceId());
}
platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
place,
size,
type,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
} else if (type == TracerMemEventType::ReservedFree) {
uint64_t current_reserved;
uint64_t peak_reserved;
uint64_t current_allocated = 0; // 0 means keep the same as before
uint64_t peak_allocated = 0; // 0 means keep the same as before
if (platform::is_cpu_place(place) ||
platform::is_cuda_pinned_place(place)) {
current_reserved =
HOST_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
peak_reserved =
HOST_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
} else {
current_reserved =
DEVICE_MEMORY_STAT_CURRENT_VALUE(Reserved, place.GetDeviceId());
peak_reserved =
DEVICE_MEMORY_STAT_PEAK_VALUE(Reserved, place.GetDeviceId());
}
platform::MemEvenRecorder::Instance().PopMemRecord(ptr,
place,
size,
type,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
}
}
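The four-way branch above follows one rule: Allocate/Free sample the allocated stats and leave the reserved pair at 0, ReservedAllocate/ReservedFree do the opposite, and a 0 tells downstream consumers to carry the previous value forward. A condensed, compilable restatement (illustrative types only):

// Condensed restatement of the branch: which stat pair an event samples,
// with 0 meaning "keep the previous value" (illustrative types only).
#include <cstdint>
#include <cstdio>

enum class MemEventType { Allocate, Free, ReservedAllocate, ReservedFree };

struct StatSnapshot {
  uint64_t cur_allocated, peak_allocated, cur_reserved, peak_reserved;
};

StatSnapshot Sample(MemEventType t, uint64_t cur, uint64_t peak) {
  if (t == MemEventType::Allocate || t == MemEventType::Free) {
    return {cur, peak, 0, 0};  // reserved pair carried forward
  }
  return {0, 0, cur, peak};  // allocated pair carried forward
}

int main() {
  StatSnapshot s = Sample(MemEventType::ReservedAllocate, 4096, 8192);
  std::printf("cur_reserved=%llu peak_reserved=%llu\n",
              (unsigned long long)s.cur_reserved,
              (unsigned long long)s.peak_reserved);
}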
void MemEvenRecorder::PushMemRecord(const void *ptr,
const Place &place,
size_t size) {
if (g_state == ProfilerState::kDisabled) return;
if (g_state == ProfilerState::kDisabled) {
return;
}
std::lock_guard<std::mutex> guard(mtx_);
auto &events = address_memevent_[place];
PADDLE_ENFORCE_EQ(events.count(ptr), 0,
PADDLE_ENFORCE_EQ(events.count(ptr),
0,
platform::errors::InvalidArgument(
"The Place can't exist in the stage of PushMemRecord"));
events.emplace(ptr, std::unique_ptr<RecordMemEvent>(
events.emplace(ptr,
std::unique_ptr<RecordMemEvent>(
new MemEvenRecorder::RecordMemEvent(place, size)));
}
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
void MemEvenRecorder::PushMemRecord(const void *ptr,
const Place &place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved) {
std::lock_guard<std::mutex> guard(mtx_);
if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord
HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
PosixInNsec(),
reinterpret_cast<uint64_t>(ptr),
type,
size,
place,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
return;
}
if (type == TracerMemEventType::ReservedAllocate) {
    // The old profiler only analyzes memory managed by paddle.
return;
}
if (g_state == ProfilerState::kDisabled) return;
auto &events = address_memevent_[place];
PADDLE_ENFORCE_EQ(events.count(ptr),
0,
platform::errors::InvalidArgument(
"The Place can't exist in the stage of PushMemRecord"));
events.emplace(ptr,
std::unique_ptr<RecordMemEvent>(
new MemEvenRecorder::RecordMemEvent(place, size)));
}
void MemEvenRecorder::PopMemRecord(const void *ptr, const Place &place) {
if (g_state == ProfilerState::kDisabled) {
return;
}
std::lock_guard<std::mutex> guard(mtx_);
auto &events = address_memevent_[place];
auto iter = events.find(ptr);
  // The ptr may not be in address_memevent.
if (iter != events.end()) {
events.erase(iter);
}
}
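PushMemRecord and PopMemRecord now mirror each other: when the host event recorder hook is on, both short-circuit into HostEventRecorder<CommonMemEvent>, with a free encoded as a negative byte delta. Since size is unsigned, the negation should go through a signed cast first, as in this small sketch (the helper name is illustrative):

// Signed-delta convention for memory events: +size on allocate,
// -size on free, cast to int64_t before negating (illustrative helper).
#include <cstdint>
#include <cstdio>

int64_t ToByteDelta(uint64_t size, bool is_free) {
  int64_t s = static_cast<int64_t>(size);
  return is_free ? -s : s;
}

int main() {
  std::printf("%lld %lld\n",
              (long long)ToByteDelta(1024, /*is_free=*/false),
              (long long)ToByteDelta(1024, /*is_free=*/true));  // 1024 -1024
}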
void MemEvenRecorder::PopMemRecord(const void *ptr,
const Place &place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved) {
std::lock_guard<std::mutex> guard(mtx_);
if (FLAGS_enable_host_event_recorder_hook) { // new MemRecord
HostEventRecorder<CommonMemEvent>::GetInstance().RecordEvent(
PosixInNsec(),
reinterpret_cast<uint64_t>(ptr),
type,
        -static_cast<int64_t>(size),  // negate after casting to signed
place,
current_allocated,
current_reserved,
peak_allocated,
peak_reserved);
return;
}
if (type == TracerMemEventType::ReservedFree) {
    // The old profiler only analyzes memory managed by paddle.
return;
}
if (g_state == ProfilerState::kDisabled) return;
auto &events = address_memevent_[place];
auto iter = events.find(ptr);
  // The ptr may not be in address_memevent.
......@@ -279,8 +514,13 @@ MemEvenRecorder::RecordMemEvent::~RecordMemEvent() {
auto annotation_free = CurAnnotationName();
if (tracer) {
tracer->AddMemInfoRecord(start_ns_, end_ns_, bytes_, place_, alloc_in_,
annotation_free, g_mem_thread_id);
tracer->AddMemInfoRecord(start_ns_,
end_ns_,
bytes_,
place_,
alloc_in_,
annotation_free,
g_mem_thread_id);
}
PopMemEvent(start_ns_, end_ns_, bytes_, place_, annotation_free);
}
......@@ -307,22 +547,38 @@ RecordBlock::~RecordBlock() {
if (tracer) {
// We try to put all blocks at the same nested depth in the
    // same timeline lane, and distinguish them using thread_id.
tracer->AddCPURecords(name_, start_ns_, PosixInNsec(), BlockDepth(),
g_thread_id);
tracer->AddCPURecords(
name_, start_ns_, PosixInNsec(), BlockDepth(), g_thread_id);
}
ClearCurBlock();
}
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPushRange, start_ns, end_ns, bytes,
place, g_mem_thread_id, annotation);
}
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place &place, const std::string &annotation) {
GetMemEventList().Record(EventType::kPopRange, start_ns, end_ns, bytes, place,
g_mem_thread_id, annotation);
void PushMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place &place,
const std::string &annotation) {
GetMemEventList().Record(EventType::kPushRange,
start_ns,
end_ns,
bytes,
place,
g_mem_thread_id,
annotation);
}
void PopMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place &place,
const std::string &annotation) {
GetMemEventList().Record(EventType::kPopRange,
start_ns,
end_ns,
bytes,
place,
g_mem_thread_id,
annotation);
}
void Mark(const std::string &name) {
......@@ -334,17 +590,19 @@ void Mark(const std::string &name) {
GetEventList().Record(EventType::kMark, name, g_thread_id);
}
Event *PushEvent(const std::string &name, const EventRole role,
Event *PushEvent(const std::string &name,
const EventRole role,
std::string attr) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role,
attr);
return GetEventList().Record(
EventType::kPushRange, name, g_thread_id, role, attr);
}
void PopEvent(const std::string &name, const EventRole role, std::string attr) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id, role, attr);
}
void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled,
PADDLE_ENFORCE_NE(state,
ProfilerState::kDisabled,
platform::errors::InvalidArgument(
"Can't enable profiling, since the input state is"
"ProfilerState::kDisabled"));
......@@ -380,7 +638,8 @@ void ResetProfiler() {
(*it)->Clear();
}
for (auto it = g_all_mem_event_lists.begin();
it != g_all_mem_event_lists.end(); ++it) {
it != g_all_mem_event_lists.end();
++it) {
(*it)->Clear();
}
}
......@@ -576,8 +835,8 @@ static void EmulateEventPushAndPop(
std::string name =
prefix_stk.empty() ? evt.name : prefix_stk.top() + "/" + evt.name;
const char *attr = (evt.attr == nullptr ? "none" : evt.attr);
Event *orig_evt = cur_thr_list->Record(EventType::kPushRange, name, tid,
evt.role, attr);
Event *orig_evt = cur_thr_list->Record(
EventType::kPushRange, name, tid, evt.role, attr);
(*out)[tid][evt.end_ns] = std::make_pair(orig_evt, evt.start_ns);
cur_thr_list->Record(EventType::kPopRange, name, tid, evt.role, attr);
}
......@@ -593,8 +852,8 @@ static void EmulateCPURecordsAdd(
for (const auto &thr_sec : host_sec.thr_sections) {
uint64_t tid = thr_sec.thread_id;
for (const auto &evt : thr_sec.events) {
tracer->AddCPURecords(evt.name, evt.start_ns, evt.end_ns, BlockDepth(),
tid);
tracer->AddCPURecords(
evt.name, evt.start_ns, evt.end_ns, BlockDepth(), tid);
}
}
}
......
......@@ -30,6 +30,8 @@ limitations under the License. */
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.pb.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/mem_tracing.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#endif
......@@ -102,6 +104,22 @@ struct MemEvenRecorder {
public:
void PushMemRecord(const void* ptr, const Place& place, size_t size);
void PopMemRecord(const void* ptr, const Place& place);
void PushMemRecord(const void* ptr,
const Place& place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved);
void PopMemRecord(const void* ptr,
const Place& place,
size_t size,
TracerMemEventType type,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved);
void Flush();
static MemEvenRecorder& Instance() { return recorder; }
......@@ -160,7 +178,8 @@ struct EventList {
std::vector<T> Reduce() {
std::vector<T> result;
for (auto& block : event_blocks) {
result.insert(result.begin(), std::make_move_iterator(block.begin()),
result.insert(result.begin(),
std::make_move_iterator(block.begin()),
std::make_move_iterator(block.end()));
}
event_blocks.clear();
......@@ -173,13 +192,21 @@ struct EventList {
};
void Mark(const std::string& name);
void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation);
void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes,
const Place& place, const std::string& annotation);
Event* PushEvent(const std::string& name, const EventRole role,
void PushMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place& place,
const std::string& annotation);
void PopMemEvent(uint64_t start_ns,
uint64_t end_ns,
size_t bytes,
const Place& place,
const std::string& annotation);
Event* PushEvent(const std::string& name,
const EventRole role,
const std::string attr = "none");
void PopEvent(const std::string& name, const EventRole role,
void PopEvent(const std::string& name,
const EventRole role,
const std::string attr = "none");
// Return the event list of all threads. Assuming the returned value is
// called event_lists, event_lists[i][j] represents the j-th Event of the
// i-th thread.
......
cc_library(
host_tracer
SRCS host_tracer.cc
DEPS enforce)
DEPS enforce ddim var_type_traits)
cc_library(
cuda_tracer
SRCS cuda_tracer.cc cupti_data_process.cc
......@@ -10,7 +10,7 @@ add_subdirectory(mlu)
cc_library(
event_node
SRCS event_node.cc
DEPS enforce)
DEPS enforce place)
cc_library(
profiler_utils
SRCS utils.cc
......
......@@ -18,16 +18,21 @@
#include <functional>
#include <string>
#include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/event.h" // import EventRole, TODO(TIEXING): remove later
#include "paddle/fluid/platform/profiler/trace_event.h"
#include "paddle/phi/core/ddim.h"
namespace paddle {
namespace platform {
struct CommonEvent {
public:
CommonEvent(const char *name, uint64_t start_ns, uint64_t end_ns,
EventRole role, TracerEventType type)
CommonEvent(const char *name,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type)
: name(name),
start_ns(start_ns),
end_ns(end_ns),
......@@ -35,8 +40,12 @@ struct CommonEvent {
type(type) {}
CommonEvent(std::function<void *(size_t)> arena_allocator,
const std::string &name_str, uint64_t start_ns, uint64_t end_ns,
EventRole role, TracerEventType type, const std::string &attr_str)
const std::string &name_str,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type,
const std::string &attr_str)
: start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
strncpy(buf, name_str.c_str(), name_str.length() + 1);
......@@ -47,8 +56,11 @@ struct CommonEvent {
}
CommonEvent(std::function<void *(size_t)> arena_allocator,
const std::string &name_str, uint64_t start_ns, uint64_t end_ns,
EventRole role, TracerEventType type)
const std::string &name_str,
uint64_t start_ns,
uint64_t end_ns,
EventRole role,
TracerEventType type)
: start_ns(start_ns), end_ns(end_ns), role(role), type(type) {
auto buf = static_cast<char *>(arena_allocator(name_str.length() + 1));
strncpy(buf, name_str.c_str(), name_str.length() + 1);
......@@ -63,5 +75,61 @@ struct CommonEvent {
const char *attr = nullptr; // not owned, designed for performance
};
struct CommonMemEvent {
public:
CommonMemEvent(uint64_t timestamp_ns,
uint64_t addr,
TracerMemEventType type,
int64_t increase_bytes,
const Place &place,
uint64_t current_allocated,
uint64_t current_reserved,
uint64_t peak_allocated,
uint64_t peak_reserved)
: timestamp_ns(timestamp_ns),
addr(addr),
type(type),
increase_bytes(increase_bytes),
place(place),
current_allocated(current_allocated),
current_reserved(current_reserved),
peak_allocated(peak_allocated),
peak_reserved(peak_reserved) {}
uint64_t timestamp_ns;
uint64_t addr;
TracerMemEventType type;
int64_t increase_bytes;
Place place;
uint64_t current_allocated;
uint64_t current_reserved;
uint64_t peak_allocated;
uint64_t peak_reserved;
};
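The four current/peak fields are snapshots of allocator statistics taken when the event is built. As a hedged sketch, a recorder could fill them from the stats accessors in paddle/fluid/memory/stats.h; the "Allocated"/"Reserved" stat names and exact signatures are assumptions, inferred from the stats bindings this PR also touches.

#include <cstdint>

#include "paddle/fluid/memory/stats.h"

// Assumed accessor names; snapshots GPU allocator stats for device dev_id.
void SnapshotDeviceStats(int dev_id,
                         uint64_t* current_allocated,
                         uint64_t* current_reserved,
                         uint64_t* peak_allocated,
                         uint64_t* peak_reserved) {
  *current_allocated =
      paddle::memory::DeviceMemoryStatCurrentValue("Allocated", dev_id);
  *current_reserved =
      paddle::memory::DeviceMemoryStatCurrentValue("Reserved", dev_id);
  *peak_allocated =
      paddle::memory::DeviceMemoryStatPeakValue("Allocated", dev_id);
  *peak_reserved =
      paddle::memory::DeviceMemoryStatPeakValue("Reserved", dev_id);
}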
struct OperatorSupplementOriginEvent {
public:
OperatorSupplementOriginEvent(
std::function<void *(size_t)> arena_allocator,
uint64_t timestamp_ns,
const std::string &type_name,
const std::map<std::string, std::vector<framework::DDim>> &input_shapes,
const std::map<std::string, std::vector<framework::proto::VarType::Type>>
&dtypes,
const std::vector<std::string> callstack)
: timestamp_ns(timestamp_ns),
input_shapes(input_shapes),
dtypes(dtypes),
callstack(callstack) {
auto buf = static_cast<char *>(arena_allocator(type_name.length() + 1));
strncpy(buf, type_name.c_str(), type_name.length() + 1);
op_type = buf;
}
uint64_t timestamp_ns;
const char *op_type = nullptr; // not owned, designed for performance
// input shapes
std::map<std::string, std::vector<framework::DDim>> input_shapes;
std::map<std::string, std::vector<framework::proto::VarType::Type>> dtypes;
// call stack
const std::vector<std::string> callstack;
};
} // namespace platform
} // namespace paddle
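The constructors above take an arena_allocator callback so an event can copy its name into memory owned by the recorder instead of holding a std::string. A toy allocator that satisfies the contract, purely for illustration (real recorders use a per-thread arena, not a std::deque):

#include <deque>
#include <functional>
#include <string>

static std::deque<std::string> g_demo_arena;  // keeps buffers alive

std::function<void *(size_t)> demo_arena_allocator = [](size_t n) -> void * {
  g_demo_arena.emplace_back(n, '\0');  // reserve n writable bytes
  return &g_demo_arena.back()[0];      // the ctor strncpy's the name here
};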
......@@ -11,9 +11,10 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/profiler/host_tracer.h"
#include <sstream>
#include "glog/logging.h"
#include "paddle/fluid/platform/flags.h"
#include "paddle/fluid/platform/profiler/common_event.h"
......@@ -21,7 +22,8 @@
// Used to filter events, works like glog VLOG(level).
// RecordEvent will work if host_trace_level >= level.
PADDLE_DEFINE_EXPORTED_int64(host_trace_level, 1,
PADDLE_DEFINE_EXPORTED_int64(host_trace_level,
1,
"RecordEvent will works "
"if host_trace_level >= level.");
......@@ -50,6 +52,79 @@ void ProcessHostEvents(const HostEventSection<CommonEvent>& host_events,
}
}
void ProcessHostMemEvents(
const HostEventSection<CommonMemEvent>& host_mem_events,
TraceEventCollector* collector) {
for (const auto& thr_sec : host_mem_events.thr_sections) {
uint64_t tid = thr_sec.thread_id;
if (thr_sec.thread_name != kDefaultThreadName) {
collector->AddThreadName(tid, thr_sec.thread_name);
}
for (const auto& evt : thr_sec.events) {
MemTraceEvent event;
event.timestamp_ns = evt.timestamp_ns;
event.addr = evt.addr;
event.type = evt.type;
event.increase_bytes = evt.increase_bytes;
event.place = evt.place.DebugString();
event.current_allocated = evt.current_allocated;
event.current_reserved = evt.current_reserved;
event.peak_allocated = evt.peak_allocated;
event.peak_reserved = evt.peak_reserved;
event.process_id = host_mem_events.process_id;
event.thread_id = tid;
collector->AddMemEvent(std::move(event));
}
}
}
void ProcessOperatorSupplementEvents(
const HostEventSection<OperatorSupplementOriginEvent>& op_supplement_events,
TraceEventCollector* collector) {
for (const auto& thr_sec : op_supplement_events.thr_sections) {
uint64_t tid = thr_sec.thread_id;
if (thr_sec.thread_name != kDefaultThreadName) {
collector->AddThreadName(tid, thr_sec.thread_name);
}
for (const auto& evt : thr_sec.events) {
OperatorSupplementEvent event;
event.timestamp_ns = evt.timestamp_ns;
event.op_type = evt.op_type;
std::map<std::string, std::vector<std::vector<int64_t>>> input_shapes;
std::map<std::string, std::vector<std::string>> dtypes;
std::string callstack;
for (auto it = evt.input_shapes.begin(); it != evt.input_shapes.end();
it++) {
for (auto idx = 0lu; idx < it->second.size(); idx++) {
input_shapes[it->first].push_back(std::vector<int64_t>());
for (auto dim_idx = 0; dim_idx < it->second.at(idx).size();
dim_idx++) {
input_shapes[it->first][idx].push_back(
it->second.at(idx).at(dim_idx));
}
}
}
for (auto it = evt.dtypes.begin(); it != evt.dtypes.end(); it++) {
for (auto idx = 0lu; idx < it->second.size(); idx++) {
dtypes[it->first].push_back(
framework::proto::VarType::Type_Name(it->second.at(idx)));
}
}
std::ostringstream result_string;
for (auto it = evt.callstack.begin(); it != evt.callstack.end(); it++) {
result_string << (*it) << std::endl;
}
event.input_shapes = input_shapes;
event.dtypes = dtypes;
event.callstack = result_string.str();
event.process_id = op_supplement_events.process_id;
event.thread_id = tid;
collector->AddOperatorSupplementEvent(std::move(event));
}
}
}
} // namespace
void HostTracer::PrepareTracing() {
......@@ -60,16 +135,21 @@ void HostTracer::PrepareTracing() {
void HostTracer::StartTracing() {
PADDLE_ENFORCE_EQ(
state_ == TracerState::READY || state_ == TracerState::STOPED, true,
state_ == TracerState::READY || state_ == TracerState::STOPED,
true,
platform::errors::PreconditionNotMet("TracerState must be READY"));
HostEventRecorder<CommonEvent>::GetInstance().GatherEvents();
HostEventRecorder<CommonMemEvent>::GetInstance().GatherEvents();
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance()
.GatherEvents();
HostTraceLevel::GetInstance().SetLevel(options_.trace_level);
state_ = TracerState::STARTED;
}
void HostTracer::StopTracing() {
PADDLE_ENFORCE_EQ(
state_, TracerState::STARTED,
state_,
TracerState::STARTED,
platform::errors::PreconditionNotMet("TracerState must be STARTED"));
HostTraceLevel::GetInstance().SetLevel(HostTraceLevel::kDisabled);
state_ = TracerState::STOPED;
......@@ -77,11 +157,19 @@ void HostTracer::StopTracing() {
void HostTracer::CollectTraceData(TraceEventCollector* collector) {
PADDLE_ENFORCE_EQ(
state_, TracerState::STOPED,
state_,
TracerState::STOPED,
platform::errors::PreconditionNotMet("TracerState must be STOPED"));
HostEventSection<CommonEvent> host_events =
HostEventRecorder<CommonEvent>::GetInstance().GatherEvents();
ProcessHostEvents(host_events, collector);
HostEventSection<CommonMemEvent> host_mem_events =
HostEventRecorder<CommonMemEvent>::GetInstance().GatherEvents();
ProcessHostMemEvents(host_mem_events, collector);
HostEventSection<OperatorSupplementOriginEvent> op_supplement_events =
HostEventRecorder<OperatorSupplementOriginEvent>::GetInstance()
.GatherEvents();
ProcessOperatorSupplementEvents(op_supplement_events, collector);
}
} // namespace platform
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace platform {
// Memory event tracing. A trace marks a memory manipulation such as an
// allocation or a free.
// The events can be used to draw a memory variation curve.
class RecordMemEvent {
public:
/**
* @param ptr: Pointer to the memory allocated or freed.
* @param place: Device on which this memory event takes place.
* @param size: Memory size allocated or freed.
* @param type: Denotes the manipulation type of this memory event.
*/
explicit RecordMemEvent(
const void* ptr,
const Place& place,
size_t size,
const TracerMemEventType type = TracerMemEventType::Allocate);
};
} // namespace platform
} // namespace paddle
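RecordMemEvent records at construction time; there is no paired end call. A minimal sketch of instrumenting a toy allocator (MyAlloc/MyFree are hypothetical and std::malloc stands in for the real allocator), matching how the unit test later in this diff drives the API:

#include <cstdlib>

#include "paddle/fluid/platform/profiler/mem_tracing.h"

void* MyAlloc(const paddle::platform::Place& place, size_t size) {
  void* ptr = std::malloc(size);
  paddle::platform::RecordMemEvent(
      ptr, place, size, paddle::platform::TracerMemEventType::Allocate);
  return ptr;
}

void MyFree(const paddle::platform::Place& place, void* ptr, size_t size) {
  paddle::platform::RecordMemEvent(
      ptr, place, size, paddle::platform::TracerMemEventType::Free);
  std::free(ptr);
}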
......@@ -23,6 +23,8 @@
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/profiler/event_python.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/platform/profiler/profiler.h"
......@@ -41,10 +43,10 @@ TEST(ProfilerTest, TestHostTracer) {
profiler->Prepare();
profiler->Start();
{
RecordInstantEvent("TestTraceLevel_record1", TracerEventType::UserDefined,
2);
RecordInstantEvent("TestTraceLevel_record2", TracerEventType::UserDefined,
3);
RecordInstantEvent(
"TestTraceLevel_record1", TracerEventType::UserDefined, 2);
RecordInstantEvent(
"TestTraceLevel_record2", TracerEventType::UserDefined, 3);
}
auto profiler_result = profiler->Stop();
auto nodetree = profiler_result->GetNodeTrees();
......@@ -93,3 +95,49 @@ TEST(ProfilerTest, TestCudaTracer) {
EXPECT_GT(runtime_events.size(), 0u);
#endif
}
TEST(ProfilerTest, TestHostTracerForMem) {
using paddle::platform::CPUPlace;
using paddle::platform::EnableHostEventRecorder;
using paddle::platform::MemTraceEventNode;
using paddle::platform::Profiler;
using paddle::platform::ProfilerOptions;
using paddle::platform::ProfilerResult;
using paddle::platform::RecordEvent;
using paddle::platform::RecordInstantEvent;
using paddle::platform::RecordMemEvent;
using paddle::platform::TracerEventType;
using paddle::platform::TracerMemEventType;
ProfilerOptions options;
options.trace_level = 1;
options.trace_switch = 3;
auto profiler = Profiler::Create(options);
EXPECT_TRUE(profiler);
EnableHostEventRecorder();
profiler->Prepare();
profiler->Start();
{
RecordEvent event1(
"TestTracerForMem_phase1", TracerEventType::UserDefined, 1);
RecordMemEvent(reinterpret_cast<void*>(0),
CPUPlace(),
1024,
TracerMemEventType::Allocate);
RecordMemEvent(
reinterpret_cast<void*>(0), CPUPlace(), 1024, TracerMemEventType::Free);
}
{
RecordEvent event2(
"TestTracerForMem_phase2", TracerEventType::UserDefined, 1);
RecordMemEvent(reinterpret_cast<void*>(1024),
CPUPlace(),
1024,
TracerMemEventType::Allocate);
RecordMemEvent(reinterpret_cast<void*>(1024),
CPUPlace(),
1024,
TracerMemEventType::Free);
}
auto profiler_result = profiler->Stop();
auto nodetree = profiler_result->GetNodeTrees();
}
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/shape_inference.h"
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/profiler/trace_event.h"
namespace paddle {
namespace framework {
class RuntimeContext;
}
namespace platform {
class RecordOpInfoSupplement {
public:
/**
* @param type: Operator type name.
* @param attrs: Attribute map of the op.
* @param shape_ctx: InferShape context object.
* @param ctx: Runtime context object.
*/
explicit RecordOpInfoSupplement(const std::string& type,
const framework::AttributeMap& attrs,
const framework::InferShapeContext& shape_ctx,
const framework::RuntimeContext& ctx);
};
} // namespace platform
} // namespace paddle
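RecordOpInfoSupplement is likewise construct-and-forget: creating it captures the op type, attributes, input shapes/dtypes, and the Python call stack. A minimal call-site sketch, assuming op, shape_ctx, and ctx already exist as they do in the framework's op-run path:

#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/profiler/supplement_tracing.h"

void RecordSupplement(const paddle::framework::OperatorBase& op,
                      const paddle::framework::InferShapeContext& shape_ctx,
                      const paddle::framework::RuntimeContext& ctx) {
  // One-shot capture of type, attrs, shapes/dtypes and call stack.
  paddle::platform::RecordOpInfoSupplement record(
      op.Type(), op.Attrs(), shape_ctx, ctx);
}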
......@@ -382,7 +382,8 @@ static T PyObjectCast(PyObject *obj) {
} catch (py::cast_error &) {
PADDLE_THROW(platform::errors::InvalidArgument(
"Python object is not type of %s, the real type is %s",
typeid(T).name(), obj->ob_type->tp_name));
typeid(T).name(),
obj->ob_type->tp_name));
}
}
......@@ -441,7 +442,8 @@ static std::vector<std::string> inline GetNameList(
}
static void inline CreateVariableIfNotExit(
const py::handle &py_handle, const framework::Scope &scope,
const py::handle &py_handle,
const framework::Scope &scope,
const framework::Executor *exe = nullptr) {
std::vector<std::string> vec_res;
......@@ -479,7 +481,8 @@ static void inline CreateVariableIfNotExit(
PyObject *py_var_desc =
PyObject_GetAttrString(PyList_GET_ITEM(py_obj, i), kVarDescField);
PADDLE_ENFORCE_NOT_NULL(
py_var_desc, platform::errors::InvalidArgument(
py_var_desc,
platform::errors::InvalidArgument(
"The var_desc of parameter to set is None"));
auto var_desc = PyObjectCast<framework::VarDesc>(py_var_desc);
Py_DECREF(py_var_desc);
......@@ -515,7 +518,8 @@ static void AssertStaticGraphAndDygraphGradMakerNoDiff() {
}
}
}
PADDLE_ENFORCE_EQ(ops.empty(), true,
PADDLE_ENFORCE_EQ(ops.empty(),
true,
platform::errors::Unimplemented(
"OperatorWithKernel [%s] have only static graph grad "
"maker or have only dygraph grad maker, which is not "
......@@ -537,8 +541,10 @@ static int GetNCCLVersion() {
#endif
template <typename PlaceType>
static void TensorCopyFrom(framework::Tensor *dst, const framework::Tensor &src,
const PlaceType &place, int64_t batch_size) {
static void TensorCopyFrom(framework::Tensor *dst,
const framework::Tensor &src,
const PlaceType &place,
int64_t batch_size) {
if (batch_size < 0) {
framework::TensorCopy(src, place, dst);
} else {
......@@ -624,7 +630,8 @@ PYBIND11_MODULE(core_noavx, m) {
PyCapsule_GetPointer(dltensor->ptr(), "dltensor"));
PADDLE_ENFORCE_NOT_NULL(
dmt, platform::errors::InvalidArgument(
dmt,
platform::errors::InvalidArgument(
"from_dlpack received an invalid capsule. "
"Note that a DLPack tensor can be consumed only once."));
......@@ -644,7 +651,8 @@ PYBIND11_MODULE(core_noavx, m) {
});
m.def("_create_loaded_parameter",
[](const py::handle &vec_var_list, const Scope &scope,
[](const py::handle &vec_var_list,
const Scope &scope,
const Executor *executor) {
CreateVariableIfNotExit(vec_var_list, scope, executor);
});
......@@ -682,8 +690,9 @@ PYBIND11_MODULE(core_noavx, m) {
<< ", sci_mode=" << print_opt.sci_mode;
});
m.def("broadcast_shape", [](const std::vector<int64_t> &x_dim,
const std::vector<int64_t> &y_dim) {
m.def(
"broadcast_shape",
[](const std::vector<int64_t> &x_dim, const std::vector<int64_t> &y_dim) {
return phi::vectorize(operators::details::BroadcastTwoDims(
phi::make_ddim(x_dim), phi::make_ddim(y_dim), -1));
});
......@@ -808,14 +817,22 @@ PYBIND11_MODULE(core_noavx, m) {
self.EmplaceBackOutput(std::move(CastPyArg2Tensor(obj, 1)));
}
})
.def("add_attr", [](paddle::CustomOpKernelContext &self,
bool attr) { self.EmplaceBackAttr(attr); })
.def("add_attr", [](paddle::CustomOpKernelContext &self,
int attr) { self.EmplaceBackAttr(attr); })
.def("add_attr", [](paddle::CustomOpKernelContext &self,
float attr) { self.EmplaceBackAttr(attr); })
.def("add_attr", [](paddle::CustomOpKernelContext &self,
int64_t attr) { self.EmplaceBackAttr(attr); })
.def("add_attr",
[](paddle::CustomOpKernelContext &self, bool attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, int attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, float attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, int64_t attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, const std::string &attr) {
self.EmplaceBackAttr(attr);
......@@ -829,13 +846,14 @@ PYBIND11_MODULE(core_noavx, m) {
.def("add_attr",
[](paddle::CustomOpKernelContext &self,
const std::vector<int64_t> &attr) { self.EmplaceBackAttr(attr); })
.def("add_attr", [](paddle::CustomOpKernelContext &self,
.def("add_attr",
[](paddle::CustomOpKernelContext &self,
const std::vector<std::string> &attr) {
self.EmplaceBackAttr(attr);
});
py::class_<framework::Tensor> framework_tensor(m, "Tensor",
py::buffer_protocol());
py::class_<framework::Tensor> framework_tensor(
m, "Tensor", py::buffer_protocol());
g_framework_tensor_pytype =
reinterpret_cast<PyTypeObject *>(framework_tensor.ptr());
framework_tensor
......@@ -918,80 +936,135 @@ PYBIND11_MODULE(core_noavx, m) {
self.mutable_data<float>(place);
})
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::CPUPlace &place,
[](framework::Tensor &self,
paddle::platform::CPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::CustomPlace &place,
[](framework::Tensor &self,
paddle::platform::CustomPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::XPUPlace &place,
[](framework::Tensor &self,
paddle::platform::XPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::CUDAPlace &place,
[](framework::Tensor &self,
paddle::platform::CUDAPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::CUDAPinnedPlace &place,
[](framework::Tensor &self,
paddle::platform::CUDAPinnedPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::MLUPlace &place,
[](framework::Tensor &self,
paddle::platform::MLUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_clear", &framework::Tensor::clear)
.def("_mutable_data",
[](framework::Tensor &self, paddle::platform::NPUPlace &place,
[](framework::Tensor &self,
paddle::platform::NPUPlace &place,
paddle::framework::proto::VarType::Type type) {
return reinterpret_cast<uintptr_t>(
self.mutable_data(place, framework::TransToPhiDataType(type)));
})
.def("_copy_from", &TensorCopyFrom<paddle::platform::CPUPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::CustomPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::XPUPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::CUDAPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::NPUPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::CUDAPinnedPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::MLUPlace>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("_copy_from", &TensorCopyFrom<paddle::platform::Place>,
py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1)
.def("set", SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CustomPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::XPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::IPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::MLUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false,
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CustomPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::XPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::NPUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::CUDAPinnedPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::MLUPlace>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("_copy_from",
&TensorCopyFrom<paddle::platform::Place>,
py::arg("tensor"),
py::arg("place"),
py::arg("batch_size") = -1)
.def("set",
SetTensorFromPyArray<paddle::platform::CPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CustomPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::XPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CUDAPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::IPUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::MLUPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false)
.def("set",
SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"),
py::arg("place"),
py::arg("zero_copy") = false,
R"DOC(
Set the data of Tensor on place with given numpy array.
......@@ -1077,9 +1150,9 @@ PYBIND11_MODULE(core_noavx, m) {
ostr << self;
return ostr.str();
}) /* ------ End of original Tensor ------ */
.def(
"__init__",
[](framework::Tensor &instance, const std::vector<std::vector<size_t>>
.def("__init__",
[](framework::Tensor &instance,
const std::vector<std::vector<size_t>>
&recursive_sequence_lengths) {
LoD new_lod;
new_lod.reserve(recursive_sequence_lengths.size());
......@@ -1088,7 +1161,8 @@ PYBIND11_MODULE(core_noavx, m) {
std::back_inserter(new_lod));
LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
PADDLE_ENFORCE_EQ(
CheckLoD(new_offset_lod, -1), true,
CheckLoD(new_offset_lod, -1),
true,
platform::errors::InvalidArgument(
"The provided recursive_sequence_lengths info is "
"invalid, "
......@@ -1115,12 +1189,14 @@ PYBIND11_MODULE(core_noavx, m) {
new_lod.reserve(lod.size());
std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
PADDLE_ENFORCE_EQ(
CheckLoD(new_lod, vectorize(self.dims()).front()), true,
CheckLoD(new_lod, vectorize(self.dims()).front()),
true,
platform::errors::InvalidArgument(
"The provided LoD is invalid, the LoD is %s", new_lod));
self.set_lod(new_lod);
},
py::arg("lod"), R"DOC(
py::arg("lod"),
R"DOC(
Set LoD of the Tensor.
Args:
......@@ -1142,7 +1218,8 @@ PYBIND11_MODULE(core_noavx, m) {
)DOC")
.def(
"set_recursive_sequence_lengths",
[](framework::Tensor &self, const std::vector<std::vector<size_t>>
[](framework::Tensor &self,
const std::vector<std::vector<size_t>>
&recursive_sequence_lengths) {
// the input recursive_sequence_lengths is length-based
// level-of-detail info
......@@ -1153,7 +1230,8 @@ PYBIND11_MODULE(core_noavx, m) {
std::back_inserter(new_lod));
LoD new_offset_lod = ConvertToOffsetBasedLoD(new_lod);
PADDLE_ENFORCE_EQ(
CheckLoD(new_offset_lod, vectorize(self.dims()).front()), true,
CheckLoD(new_offset_lod, vectorize(self.dims()).front()),
true,
platform::errors::InvalidArgument(
"The provided recursive_sequence_lengths info is "
"invalid, "
......@@ -1162,7 +1240,8 @@ PYBIND11_MODULE(core_noavx, m) {
new_lod));
self.set_lod(new_offset_lod);
},
py::arg("recursive_sequence_lengths"), R"DOC(
py::arg("recursive_sequence_lengths"),
R"DOC(
Set LoD of the Tensor according to recursive sequence lengths.
For example, if recursive_sequence_lengths=[[2, 3]], which means
......@@ -1630,7 +1709,8 @@ PYBIND11_MODULE(core_noavx, m) {
new (&instance) phi::SelectedRows();
})
.def("__init__",
[](phi::SelectedRows &instance, const std::vector<int64_t> rows,
[](phi::SelectedRows &instance,
const std::vector<int64_t> rows,
const int64_t &height) {
new (&instance) phi::SelectedRows(rows, height);
})
......@@ -1693,8 +1773,10 @@ All parameter, weight, gradient are variables in Paddle.
[](Variable &self, Strings str_list) {
*self.GetMutable<Strings>() = str_list;
})
.def("set_vocab", [](Variable &self,
Vocab vocab) { *self.GetMutable<Vocab>() = vocab; })
.def("set_vocab",
[](Variable &self, Vocab vocab) {
*self.GetMutable<Vocab>() = vocab;
})
.def(
"get_string_tensor",
[](Variable &self) { return self.GetMutable<Strings>(); },
......@@ -1732,7 +1814,8 @@ All parameter, weight, gradient are variables in Paddle.
.def(
"get_reader",
[](Variable &self) -> framework::ReaderHolder * {
PADDLE_ENFORCE_EQ(self.IsType<framework::ReaderHolder>(), true,
PADDLE_ENFORCE_EQ(self.IsType<framework::ReaderHolder>(),
true,
platform::errors::InvalidArgument(
"The variable is not type of ReaderHolder."));
return self.GetMutable<framework::ReaderHolder>();
......@@ -1743,7 +1826,8 @@ All parameter, weight, gradient are variables in Paddle.
[](Variable &self) -> Scope * {
auto scope_vec = self.GetMutable<std::vector<framework::Scope *>>();
PADDLE_ENFORCE_GT(
scope_vec->size(), 0,
scope_vec->size(),
0,
platform::errors::InvalidArgument(
"The size of scope_vec should be greater than 0"));
return scope_vec->front();
......@@ -1801,7 +1885,9 @@ All parameter, weight, gradient are variables in Paddle.
out (core.Variable): the found or created variable.
)DOC",
py::return_value_policy::reference)
.def("find_var", &Scope::FindVar, py::arg("name"),
.def("find_var",
&Scope::FindVar,
py::arg("name"),
R"DOC(
Find variable named :code:`name` in the current scope or
its parent scope. Return None if not found.
......@@ -1814,7 +1900,9 @@ All parameter, weight, gradient are variables in Paddle.
)DOC",
py::return_value_policy::reference)
.def("size", &Scope::Size)
.def("erase", &Scope::EraseVars, py::arg("names"),
.def("erase",
&Scope::EraseVars,
py::arg("names"),
R"DOC(
Delete the variables whose names are given in :code:`names`
from the current scope.
......@@ -1827,7 +1915,8 @@ All parameter, weight, gradient are variables in Paddle.
)DOC",
py::return_value_policy::reference)
.def(
"new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
"new_scope",
[](Scope &self) -> Scope * { return &self.NewScope(); },
R"DOC(
Create a new sub-scope of the current scope.
......@@ -1835,7 +1924,8 @@ All parameter, weight, gradient are variables in Paddle.
out (core._Scope): the created sub-scope.
)DOC",
py::return_value_policy::reference)
.def("drop_kids", &Scope::DropKids,
.def("drop_kids",
&Scope::DropKids,
R"DOC(
Delete all sub-scopes of the current scope.
)DOC")
......@@ -1865,7 +1955,8 @@ All parameter, weight, gradient are variables in Paddle.
if (info.HasOpProtoAndChecker()) {
std::string str;
PADDLE_ENFORCE_EQ(
info.Proto().SerializeToString(&str), true,
info.Proto().SerializeToString(&str),
true,
platform::errors::Fatal(
"Serialize OpProto Error. This could be a bug of Paddle."));
ret_values.emplace_back(str);
......@@ -1886,18 +1977,20 @@ All parameter, weight, gradient are variables in Paddle.
}
return res;
});
m.def(
"get_grad_op_desc", [](const OpDesc &op_desc,
m.def("get_grad_op_desc",
[](const OpDesc &op_desc,
const std::unordered_set<std::string> &no_grad_set,
const std::vector<BlockDesc *> &grad_sub_block) {
std::unordered_map<std::string, std::string> grad_to_var;
std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
framework::OpInfoMap::Instance()
.Get(op_desc.Type())
.GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
grad_sub_block);
.GradOpMaker()(
op_desc, no_grad_set, &grad_to_var, grad_sub_block);
std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
std::transform(grad_op_descs.begin(), grad_op_descs.end(),
std::transform(
grad_op_descs.begin(),
grad_op_descs.end(),
grad_op_desc_ptrs.begin(),
[](std::unique_ptr<OpDesc> &p) { return p.release(); });
return std::make_pair(grad_op_desc_ptrs, grad_to_var);
......@@ -1914,7 +2007,8 @@ All parameter, weight, gradient are variables in Paddle.
return framework::OpInfoMap::Instance().Get(op_type).HasInferInplace();
});
m.def("infer_no_need_buffer_slots",
[](const std::string op_type, const framework::VariableNameMap &inputs,
[](const std::string op_type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs) {
auto infer_func = framework::OpInfoMap::Instance()
......@@ -1927,7 +2021,8 @@ All parameter, weight, gradient are variables in Paddle.
return empty;
}
});
m.def("prune", [](const ProgramDesc &origin,
m.def("prune",
[](const ProgramDesc &origin,
const std::set<std::string> &feeded_var_names,
const std::vector<std::array<size_t, 2>> &targets) {
ProgramDesc prog_with_targets(origin);
......@@ -2168,7 +2263,8 @@ All parameter, weight, gradient are variables in Paddle.
#endif
return devices;
});
py::class_<platform::CustomPlace> customplace(m, "CustomPlace",
py::class_<platform::CustomPlace> customplace(m,
"CustomPlace",
R"DOC(
CustomPlace is a descriptor of a device.
It represents a custom device on which a tensor will be allocated and a model will run.
......@@ -2182,7 +2278,8 @@ All parameter, weight, gradient are variables in Paddle.
g_customplace_pytype = reinterpret_cast<PyTypeObject *>(customplace.ptr());
customplace
.def("__init__",
[](platform::CustomPlace &self, const std::string &device_type,
[](platform::CustomPlace &self,
const std::string &device_type,
int dev_id) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (UNLIKELY(dev_id < 0)) {
......@@ -2190,7 +2287,8 @@ All parameter, weight, gradient are variables in Paddle.
"Invalid CustomPlace(%s, %d), device id must be 0 "
"or "
"positive integer",
device_type, dev_id);
device_type,
dev_id);
std::exit(-1);
}
......@@ -2211,7 +2309,11 @@ All parameter, weight, gradient are variables in Paddle.
"inside "
"[0, %d), because %s "
"number on your machine is %d",
device_type, dev_id, dev_count, device_type, dev_count);
device_type,
dev_id,
dev_count,
device_type,
dev_count);
std::exit(-1);
}
}
......@@ -2221,7 +2323,8 @@ All parameter, weight, gradient are variables in Paddle.
"Invalid CustomPlace(%s, %d), the device type is "
"not registered "
"as a custom device.",
device_type, dev_id);
device_type,
dev_id);
std::exit(-1);
}
#else
......@@ -2293,7 +2396,8 @@ All parameter, weight, gradient are variables in Paddle.
LOG(ERROR) << string::Sprintf(
"Invalid CUDAPlace(%d), must inside [0, %d), because GPU "
"number on your machine is %d",
dev_id, platform::GetGPUDeviceCount(),
dev_id,
platform::GetGPUDeviceCount(),
platform::GetGPUDeviceCount());
std::exit(-1);
}
......@@ -2359,7 +2463,8 @@ All parameter, weight, gradient are variables in Paddle.
LOG(ERROR) << string::Sprintf(
"Invalid XPUPlace(%d), must inside [0, %d), because XPU "
"number on your machine is %d",
dev_id, platform::GetXPUDeviceCount(),
dev_id,
platform::GetXPUDeviceCount(),
platform::GetXPUDeviceCount());
std::exit(-1);
}
......@@ -2524,7 +2629,8 @@ All parameter, weight, gradient are variables in Paddle.
LOG(ERROR) << string::Sprintf(
"Invalid NPUPlace(%d), must inside [0, %d), because NPU "
"number on your machine is %d",
dev_id, platform::GetNPUDeviceCount(),
dev_id,
platform::GetNPUDeviceCount(),
platform::GetNPUDeviceCount());
std::exit(-1);
}
......@@ -2640,7 +2746,8 @@ All parameter, weight, gradient are variables in Paddle.
LOG(ERROR) << string::Sprintf(
"Invalid MLUPlace(%d), must inside [0, %d), because MLU "
"number on your machine is %d",
dev_id, platform::GetMLUDeviceCount(),
dev_id,
platform::GetMLUDeviceCount(),
platform::GetMLUDeviceCount());
std::exit(-1);
}
......@@ -2713,8 +2820,10 @@ All parameter, weight, gradient are variables in Paddle.
.def("mlu_device_id", [](platform::Place &self) { return self.device; })
.def("custom_device_id",
[](platform::Place &self) { return self.device; })
.def("set_place", [](platform::Place &self,
const platform::Place &other) { self = other; })
.def("set_place",
[](platform::Place &self, const platform::Place &other) {
self = other;
})
.def("set_place",
[](platform::Place &self, const platform::CPUPlace &cpu_place) {
self = cpu_place;
......@@ -2759,7 +2868,8 @@ All parameter, weight, gradient are variables in Paddle.
true,
platform::errors::InvalidArgument(
"Cannot parse user input to OpDesc"));
PADDLE_ENFORCE_EQ(desc.IsInitialized(), true,
PADDLE_ENFORCE_EQ(desc.IsInitialized(),
true,
platform::errors::InvalidArgument(
"The provided OpDesc is not "
"initialized, the reason is: %s",
......@@ -2767,43 +2877,50 @@ All parameter, weight, gradient are variables in Paddle.
return OpRegistry::CreateOp(desc);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::CPUPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::XPUPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::NPUPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::CUDAPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::CUDAPinnedPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::MLUPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
})
.def("run",
[](OperatorBase &self, const Scope &scope,
[](OperatorBase &self,
const Scope &scope,
const platform::CustomPlace &place) {
pybind11::gil_scoped_release release;
self.Run(scope, place);
......@@ -2843,13 +2960,17 @@ All parameter, weight, gradient are variables in Paddle.
py::class_<framework::Executor>(m, "Executor")
.def(py::init<const platform::Place &>())
.def("close", &Executor::Close)
.def("run_from_dataset", &Executor::RunFromDataset,
.def("run_from_dataset",
&Executor::RunFromDataset,
py::call_guard<py::gil_scoped_release>())
.def("release_trainer", &Executor::ReleaseTrainer,
.def("release_trainer",
&Executor::ReleaseTrainer,
py::call_guard<py::gil_scoped_release>())
.def("init_for_dataset",
[](Executor &self, const ProgramDesc &prog,
const std::string &trainer_desc, Scope *scope,
[](Executor &self,
const ProgramDesc &prog,
const std::string &trainer_desc,
Scope *scope,
Dataset *dataset) -> std::shared_ptr<TrainerBase> {
pybind11::gil_scoped_release release;
return self.InitForDataset(prog, trainer_desc, scope, dataset);
......@@ -2860,40 +2981,62 @@ All parameter, weight, gradient are variables in Paddle.
self.RunFromDataset(trainer);
})
.def("run_prepared_ctx",
[](Executor &self, ExecutorPrepareContext *ctx, Scope *scope,
[](Executor &self,
ExecutorPrepareContext *ctx,
Scope *scope,
std::map<std::string, const LoDTensor *> *feed_targets,
std::map<std::string, FetchType *> *fetch_targets,
bool create_local_scope = true, bool create_vars = true,
bool create_local_scope = true,
bool create_vars = true,
const std::string &feed_holder_name = "feed",
const std::string &fetch_holder_name = "fetch") {
pybind11::gil_scoped_release release;
self.RunPreparedContext(ctx, scope, feed_targets, fetch_targets,
create_local_scope, create_vars,
feed_holder_name, fetch_holder_name);
self.RunPreparedContext(ctx,
scope,
feed_targets,
fetch_targets,
create_local_scope,
create_vars,
feed_holder_name,
fetch_holder_name);
})
.def("run_prepared_ctx",
[](Executor &self, ExecutorPrepareContext *ctx, Scope *scope,
bool create_local_scope = true, bool create_vars = true,
[](Executor &self,
ExecutorPrepareContext *ctx,
Scope *scope,
bool create_local_scope = true,
bool create_vars = true,
bool keep_kids = false) {
pybind11::gil_scoped_release release;
self.RunPreparedContext(ctx, scope, create_local_scope,
create_vars, keep_kids);
self.RunPreparedContext(
ctx, scope, create_local_scope, create_vars, keep_kids);
})
.def("prepare",
[](Executor &self, const ProgramDesc &program, int block_id,
[](Executor &self,
const ProgramDesc &program,
int block_id,
const std::vector<std::string> &skip_ref_cnt_vars =
std::vector<std::string>(),
bool force_disable_gc = false) {
pybind11::gil_scoped_release release;
return self.Prepare(program, block_id, skip_ref_cnt_vars,
force_disable_gc);
return self.Prepare(
program, block_id, skip_ref_cnt_vars, force_disable_gc);
})
.def("create_variables", &Executor::CreateVariables)
.def("run", [](Executor &self, const ProgramDesc &prog, Scope *scope,
int block_id, bool create_local_scope, bool create_vars,
.def("run",
[](Executor &self,
const ProgramDesc &prog,
Scope *scope,
int block_id,
bool create_local_scope,
bool create_vars,
const std::vector<std::string> &fetch_vars) {
pybind11::gil_scoped_release release;
self.Run(prog, scope, block_id, create_local_scope, create_vars,
self.Run(prog,
scope,
block_id,
create_local_scope,
create_vars,
fetch_vars);
});
......@@ -2906,8 +3049,10 @@ All parameter, weight, gradient are variables in Paddle.
});
py::class_<framework::StandaloneExecutor>(m, "StandaloneExecutor")
.def(py::init<const platform::Place &, const ProgramDesc &,
const ProgramDesc &, Scope *>())
.def(py::init<const platform::Place &,
const ProgramDesc &,
const ProgramDesc &,
Scope *>())
.def("run",
[](StandaloneExecutor &self,
const std::unordered_map<std::string, py::array> &input_dict,
......@@ -2951,7 +3096,8 @@ All parameter, weight, gradient are variables in Paddle.
return py::cast(std::move(ret));
})
.def("run",
[](StandaloneExecutor &self, std::vector<std::string> feed_names,
[](StandaloneExecutor &self,
std::vector<std::string> feed_names,
std::vector<std::string> fetch_names) {
paddle::framework::FetchList ret;
{
......@@ -3036,20 +3182,27 @@ All parameter, weight, gradient are variables in Paddle.
m.def("device_memory_stat_peak_value", memory::DeviceMemoryStatPeakValue);
m.def(
"run_cmd",
[](const std::string &cmd, int time_out = -1,
[](const std::string &cmd,
int time_out = -1,
int sleep_inter = -1) -> const std::string {
return paddle::framework::shell_get_command_output(cmd, time_out,
sleep_inter);
return paddle::framework::shell_get_command_output(
cmd, time_out, sleep_inter);
},
py::arg("cmd"), py::arg("time_out") = -1, py::arg("sleep_inter") = -1);
py::arg("cmd"),
py::arg("time_out") = -1,
py::arg("sleep_inter") = -1);
m.def(
"shell_execute_cmd",
[](const std::string &cmd, int time_out = 0, int sleep_inter = 0,
[](const std::string &cmd,
int time_out = 0,
int sleep_inter = 0,
bool redirect_stderr = false) -> std::vector<std::string> {
return paddle::framework::shell_execute_cmd(cmd, time_out, sleep_inter,
redirect_stderr);
return paddle::framework::shell_execute_cmd(
cmd, time_out, sleep_inter, redirect_stderr);
},
py::arg("cmd"), py::arg("time_out") = 0, py::arg("sleep_inter") = 0,
py::arg("cmd"),
py::arg("time_out") = 0,
py::arg("sleep_inter") = 0,
py::arg("redirect_stderr") = false);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......@@ -3064,13 +3217,16 @@ All parameter, weight, gradient are variables in Paddle.
#endif
m.def("set_feed_variable",
static_cast<void (*)(Scope *, const LoDTensor &, const std::string &,
size_t)>(&framework::SetFeedVariable));
static_cast<void (*)(
Scope *, const LoDTensor &, const std::string &, size_t)>(
&framework::SetFeedVariable));
m.def("set_feed_variable",
static_cast<void (*)(Scope *, const Strings &, const std::string &,
size_t)>(&framework::SetFeedVariable));
static_cast<void (*)(
Scope *, const Strings &, const std::string &, size_t)>(
&framework::SetFeedVariable));
m.def("get_fetch_variable",
[](const Scope &scope, const std::string &var_name,
[](const Scope &scope,
const std::string &var_name,
size_t index) -> py::object {
auto &var = framework::GetFetchVariable(scope, var_name, index);
if (data_is_lod_tensor(var)) {
......@@ -3125,7 +3281,8 @@ All parameter, weight, gradient are variables in Paddle.
.def("__len__", [](LoDTensorArray &self) { return self.size(); })
.def("__setitem__",
[](LoDTensorArray &self, size_t i, const LoDTensor &t) {
PADDLE_ENFORCE_LT(i, self.size(),
PADDLE_ENFORCE_LT(i,
self.size(),
platform::errors::InvalidArgument(
"The index to set is larger than the size "
"of LoDTensorArray."));
......@@ -3139,7 +3296,8 @@ All parameter, weight, gradient are variables in Paddle.
self.back().ShareDataWith(t);
self.back().set_lod(t.lod());
},
py::arg("tensor"), R"DOC(
py::arg("tensor"),
R"DOC(
Append a LoDTensor to LoDTensorArray.
Args:
......@@ -3376,16 +3534,18 @@ All parameter, weight, gradient are variables in Paddle.
m.def("reset_profiler", platform::ResetProfiler);
m.def("register_pass", [](const std::string &pass_type, py::object callable) {
PADDLE_ENFORCE_EQ(
framework::ir::PassRegistry::Instance().Has(pass_type), false,
framework::ir::PassRegistry::Instance().Has(pass_type),
false,
platform::errors::AlreadyExists("Pass '%s' is registered more than "
"once. Please use another name.",
pass_type));
callable.inc_ref();
framework::ir::PassRegistry::Instance().Insert(pass_type, [pass_type,
callable]() {
framework::ir::PassRegistry::Instance().Insert(
pass_type, [pass_type, callable]() {
py::gil_scoped_acquire guard;
std::unique_ptr<framework::ir::Pass> pass(
new framework::ir::GeneratePass(py::cast<std::string>(callable())));
new framework::ir::GeneratePass(
py::cast<std::string>(callable())));
return pass;
});
});
......@@ -3397,11 +3557,32 @@ All parameter, weight, gradient are variables in Paddle.
m.def("size_of_dtype", framework::SizeOfType);
py::class_<paddle::platform::ProfilerResult>(m, "_ProfilerResult")
.def(py::init<>())
.def("get_data", &paddle::platform::ProfilerResult::GetData,
.def("get_data",
&paddle::platform::ProfilerResult::GetData,
py::return_value_policy::automatic_reference)
.def("save", &paddle::platform::ProfilerResult::Save)
.def("get_extra_info", &paddle::platform::ProfilerResult::GetExtraInfo);
py::class_<paddle::platform::MemPythonNode>(m, "MemPythonNode")
.def(py::init<>())
.def_readwrite("timestamp_ns",
&paddle::platform::MemPythonNode::timestamp_ns)
.def_readwrite("addr", &paddle::platform::MemPythonNode::addr)
.def_readwrite("type", &paddle::platform::MemPythonNode::type)
.def_readwrite("process_id", &paddle::platform::MemPythonNode::process_id)
.def_readwrite("thread_id", &paddle::platform::MemPythonNode::thread_id)
.def_readwrite("increase_bytes",
&paddle::platform::MemPythonNode::increase_bytes)
.def_readwrite("place", &paddle::platform::MemPythonNode::place)
.def_readwrite("current_allocated",
&paddle::platform::MemPythonNode::current_allocated)
.def_readwrite("current_reserved",
&paddle::platform::MemPythonNode::current_reserved)
.def_readwrite("peak_allocated",
&paddle::platform::MemPythonNode::peak_allocated)
.def_readwrite("peak_reserved",
&paddle::platform::MemPythonNode::peak_reserved);
py::class_<paddle::platform::DevicePythonNode>(m, "DevicePythonNode")
.def(py::init<>())
.def_readwrite("name", &paddle::platform::DevicePythonNode::name)
......@@ -3424,15 +3605,22 @@ All parameter, weight, gradient are variables in Paddle.
.def_readwrite("process_id",
&paddle::platform::HostPythonNode::process_id)
.def_readwrite("thread_id", &paddle::platform::HostPythonNode::thread_id)
.def_readwrite("input_shapes",
&paddle::platform::HostPythonNode::input_shapes)
.def_readwrite("dtypes", &paddle::platform::HostPythonNode::dtypes)
.def_readwrite("callstack", &paddle::platform::HostPythonNode::callstack)
.def_readwrite("children_node",
&paddle::platform::HostPythonNode::children_node_ptrs)
.def_readwrite("runtime_node",
&paddle::platform::HostPythonNode::runtime_node_ptrs)
.def_readwrite("device_node",
&paddle::platform::HostPythonNode::device_node_ptrs);
&paddle::platform::HostPythonNode::device_node_ptrs)
.def_readwrite("mem_node",
&paddle::platform::HostPythonNode::mem_node_ptrs);
py::class_<paddle::platform::Profiler>(m, "_Profiler")
.def("create", &paddle::platform::Profiler::Create,
.def("create",
&paddle::platform::Profiler::Create,
py::return_value_policy::take_ownership)
.def("is_cupti_supported", &paddle::platform::Profiler::IsCuptiSupported)
.def("is_cnpapi_supported",
......@@ -3466,6 +3654,14 @@ All parameter, weight, gradient are variables in Paddle.
}))
.def("end", [](platform::RecordEvent *event) { event->End(); });
py::enum_<paddle::platform::TracerMemEventType>(m, "TracerMemEventType")
.value("Allocate", paddle::platform::TracerMemEventType::Allocate)
.value("Free", paddle::platform::TracerMemEventType::Free)
.value("ReservedAllocate",
paddle::platform::TracerMemEventType::ReservedAllocate)
.value("ReservedFree",
paddle::platform::TracerMemEventType::ReservedFree);
py::enum_<paddle::platform::TracerEventType>(m, "TracerEventType")
.value("Operator", paddle::platform::TracerEventType::Operator)
.value("Dataloader", paddle::platform::TracerEventType::Dataloader)
......@@ -3509,22 +3705,29 @@ All parameter, weight, gradient are variables in Paddle.
[](ir::Pass &self, const std::string &name, const std::string &attr) {
self.Set<std::string>(name, new std::string(attr));
})
.def("set", [](ir::Pass &self, const std::string &name,
bool val) { self.Set<bool>(name, new bool(val)); })
.def("set", [](ir::Pass &self, const std::string &name,
int val) { self.Set<const int>(name, new int(val)); })
.def("set",
[](ir::Pass &self, const std::string &name,
[](ir::Pass &self, const std::string &name, bool val) {
self.Set<bool>(name, new bool(val));
})
.def("set",
[](ir::Pass &self, const std::string &name, int val) {
self.Set<const int>(name, new int(val));
})
.def("set",
[](ir::Pass &self,
const std::string &name,
std::vector<std::string> set) {
self.Set(name, new std::vector<std::string>(set));
})
.def("set",
[](ir::Pass &self, const std::string &name,
[](ir::Pass &self,
const std::string &name,
std::unordered_set<std::string> set) {
self.Set(name, new std::unordered_set<std::string>(set));
})
.def("set",
[](ir::Pass &self, const std::string &name,
[](ir::Pass &self,
const std::string &name,
std::unordered_set<int> set) {
self.Set(name, new std::unordered_set<int>(set));
})
......@@ -3769,7 +3972,8 @@ All parameter, weight, gradient are variables in Paddle.
"reduce_strategy",
[](const BuildStrategy &self) { return self.reduce_; },
[](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -3799,7 +4003,8 @@ All parameter, weight, gradient are variables in Paddle.
[](const BuildStrategy &self) { return self.gradient_scale_; },
[](BuildStrategy &self,
BuildStrategy::GradientScaleStrategy strategy) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -3864,7 +4069,8 @@ All parameter, weight, gradient are variables in Paddle.
"debug_graphviz_path",
[](const BuildStrategy &self) { return self.debug_graphviz_path_; },
[](BuildStrategy &self, const std::string &path) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -3891,7 +4097,8 @@ All parameter, weight, gradient are variables in Paddle.
return self.enable_sequential_execution_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -3917,7 +4124,8 @@ All parameter, weight, gradient are variables in Paddle.
return self.remove_unnecessary_lock_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -3995,7 +4203,8 @@ All parameter, weight, gradient are variables in Paddle.
return self.fuse_elewise_add_act_ops_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4020,7 +4229,8 @@ All parameter, weight, gradient are variables in Paddle.
"fuse_gemm_epilogue",
[](const BuildStrategy &self) { return self.fuse_gemm_epilogue_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4045,7 +4255,8 @@ All parameter, weight, gradient are variables in Paddle.
"fuse_bn_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_act_ops_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4070,7 +4281,8 @@ All parameter, weight, gradient are variables in Paddle.
"fuse_bn_add_act_ops",
[](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4095,7 +4307,8 @@ All parameter, weight, gradient are variables in Paddle.
"enable_auto_fusion",
[](const BuildStrategy &self) { return self.enable_auto_fusion_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4123,7 +4336,8 @@ All parameter, weight, gradient are variables in Paddle.
return self.fuse_relu_depthwise_conv_;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4153,7 +4367,8 @@ All parameter, weight, gradient are variables in Paddle.
self.fuse_broadcast_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, "
"cannot be configured again."));
......@@ -4184,7 +4399,8 @@ All parameter, weight, gradient are variables in Paddle.
self.fuse_all_optimizer_ops_ == paddle::none;
},
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, "
"cannot be configured again."));
......@@ -4194,7 +4410,8 @@ All parameter, weight, gradient are variables in Paddle.
"sync_batch_norm",
[](const BuildStrategy &self) { return self.sync_batch_norm_; },
[](BuildStrategy &self, bool b) {
PADDLE_ENFORCE_NE(self.IsFinalized(), true,
PADDLE_ENFORCE_NE(self.IsFinalized(),
true,
platform::errors::PreconditionNotMet(
"BuildStrategy has been finlaized, cannot be "
"configured again."));
......@@ -4348,9 +4565,13 @@ All parameter, weight, gradient are variables in Paddle.
});
pe.def(py::init<const std::vector<platform::Place> &,
const std::vector<std::string> &, const std::string &,
Scope *, std::vector<Scope *> &, const ExecutionStrategy &,
const BuildStrategy &, ir::Graph *>())
const std::vector<std::string> &,
const std::string &,
Scope *,
std::vector<Scope *> &,
const ExecutionStrategy &,
const BuildStrategy &,
ir::Graph *>())
// NOTE: even though we return a vec<Scope*>* to Python under the
// reference policy, we still cannot get local_scope from this vector,
// since the elements of vec<Scope*> will be freed by the Python GC. We
// can only return Scope*
......@@ -4439,7 +4660,8 @@ All parameter, weight, gradient are variables in Paddle.
PADDLE_THROW(platform::errors::Unimplemented(
"Failed to convert type: %s when set IpuStrategy "
"option: %s",
option.get_type(), option_name));
option.get_type(),
option_name));
}
self.InsertStringOption(option_name, option_val);
}
......@@ -4447,7 +4669,8 @@ All parameter, weight, gradient are variables in Paddle.
if (option_name.rfind("location_", 0) == 0) {
for (auto option : element.second.cast<py::dict>()) {
self.SetTensorLocation(
option_name, option.first.cast<std::string>(),
option_name,
option.first.cast<std::string>(),
option.second.cast<std::uint64_t>());
}
} else if (option_name == "replicated_collectives_settings") {
......@@ -4501,17 +4724,19 @@ All parameter, weight, gradient are variables in Paddle.
PADDLE_THROW(platform::errors::Unimplemented(
"Failed to convert value type: %s when set "
"IpuStrategy option: %s",
option.second.get_type(), option_key));
option.second.get_type(),
option_key));
}
self.InsertStringPairOption(option_name, option_key,
option_val);
self.InsertStringPairOption(
option_name, option_key, option_val);
}
}
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"Invalid IpuStrategy option value type: %s, please check "
"input value for option: %s",
element.second.get_type(), option_name));
element.second.get_type(),
option_name));
}
}
})
......