未验证 提交 c1fd1b1c 编写于 作者: Y YuanRisheng 提交者: GitHub

[PTen]Make inplace_op and vector<DenseTensor> input compatible with old architecture (#37674)

* add inplace op adaptation

* optimize the inplace logic and fix bugs when running a kernel that takes vector<DenseTensor> arguments

* refactor the logic that transforms a Variable into a DenseTensor

* update func name
上级 f306965d
...@@ -1181,9 +1181,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, ...@@ -1181,9 +1181,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
} }
BuildPtenKernelContext(*runtime_ctx, dev_ctx); BuildPtenKernelContext(*runtime_ctx, dev_ctx);
(*pt_kernel_)(pt_kernel_context_.get()); (*pt_kernel_)(pt_kernel_context_.get());
WriteBackToOutputs(runtime_ctx); WriteBackToOutputs(runtime_ctx);
pt_kernel_context_->ClearData(); pt_kernel_context_->ClearData();
} else { } else {
(*kernel_func_)( (*kernel_func_)(
...@@ -1814,45 +1812,31 @@ void OperatorWithKernel::BuildPtenKernelContext( ...@@ -1814,45 +1812,31 @@ void OperatorWithKernel::BuildPtenKernelContext(
size_t start_idx = size_t start_idx =
(i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second); (i == 0 ? 0 : pt_kernel_context_->InputRangeAt(i - 1).second);
size_t end_idx = start_idx + ins_vector.size(); size_t end_idx = start_idx + ins_vector.size();
auto current_vector_size = pt_kernel_context_->InputsSize();
// The current size of input/output in pt_kernel_context_ is at least equal
// the start_idx. For the reason of reusing the allocted of inputs or // If the memory needed is less than the current memory allocated, we will
// outputs in pt_kernel_context_, the current size of input/output can be // reuse the current memory by using ReMakePtenDenseTensorFromVar.
// greater then the index of which the tensort wanted to set to, so it will // Otherwise,we will create new storage.
// use ReMakePtenDenseTensorFromVar to make pten tensor. for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
if (pt_kernel_context_->InputsSize() == start_idx) { if (current_vector_size > start_idx + offset) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs; auto& input_ptr =
for (auto* var : ins_vector) { pt_kernel_context_->MutableInputPtrAt(start_idx + offset);
tmp_inputs.emplace_back( if (input_ptr == nullptr) {
experimental::MakePtenTensorBaseFromVar(*var, in_def)); input_ptr = experimental::MakePtenTensorBaseFromVar(
} *ins_vector[offset], in_def);
pt_kernel_context_->EmplaceBackInputs(std::move(tmp_inputs)); } else {
} else if (pt_kernel_context_->InputsSize() > start_idx) {
size_t input_size = pt_kernel_context_->InputsSize();
for (size_t j = 0; j < ins_vector.size(); ++j) {
if (input_size > start_idx + j) {
experimental::ReMakePtenDenseTensorFromVar( experimental::ReMakePtenDenseTensorFromVar(
*ins_vector[j], in_def, *ins_vector[offset], in_def,
pt_kernel_context_->MutableInputAt<pten::DenseTensor>(start_idx + pt_kernel_context_->MutableInputAt<pten::DenseTensor>(start_idx +
j)); offset));
// TODO(chentianyu03): When multi input kernel, open this code
/*
} else {
pt_kernel_context_->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(*ins_vector[j],
in_def));
*/
} }
} else {
pt_kernel_context_->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(*ins_vector[offset],
in_def));
} }
pt_kernel_context_->MutableInputRangeAt(i) =
std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.inputs.size() is "
"`%d`.",
start_idx, pt_kernel_context_->InputsSize()));
} }
pt_kernel_context_->AssignInputRange(std::make_pair(start_idx, end_idx), i);
} }
for (size_t i = 0; i < output_names.size(); ++i) { for (size_t i = 0; i < output_names.size(); ++i) {
...@@ -1862,46 +1846,25 @@ void OperatorWithKernel::BuildPtenKernelContext( ...@@ -1862,46 +1846,25 @@ void OperatorWithKernel::BuildPtenKernelContext(
size_t start_idx = size_t start_idx =
(i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second); (i == 0 ? 0 : pt_kernel_context_->OutputRangeAt(i - 1).second);
size_t end_idx = start_idx + outs_vector.size(); size_t end_idx = start_idx + outs_vector.size();
auto current_vector_size = pt_kernel_context_->OutputsSize();
// The current size of input/output in pt_kernel_context_ is at least equal
// the start_idx. For the reason of reusing the allocted of inputs or // If the memory needed is less than the current memory allocated, we will
// outputs in pt_kernel_context_, the current size of input/output can be // reuse the current memory by using ReMakePtenDenseTensorFromVar.
// greater then the index of which the tensort wanted to set to, so it will // Otherwise,we will create new storage.
// use ReMakePtenDenseTensorFromVar to make pten tensor. for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
if (pt_kernel_context_->OutputsSize() == start_idx) { if (current_vector_size > start_idx + offset) {
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs; experimental::ReMakePtenDenseTensorFromVar(
for (auto* var : outs_vector) { outs_vector[offset], out_def,
tmp_outputs.emplace_back( pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
experimental::MakePtenTensorBaseFromVar(var, out_def)); offset));
} } else {
pt_kernel_context_->EmplaceBackOutputs(std::move(tmp_outputs)); pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
} else if (pt_kernel_context_->OutputsSize() > start_idx) { experimental::MakePtenTensorBaseFromVar(outs_vector[offset],
size_t output_size = pt_kernel_context_->OutputsSize(); out_def));
for (size_t j = 0; j < outs_vector.size(); ++j) {
if (output_size > start_idx + j) {
experimental::ReMakePtenDenseTensorFromVar(
outs_vector[j], out_def,
pt_kernel_context_->MutableOutputAt<pten::DenseTensor>(start_idx +
j));
// TODO(chentianyu03): When multi output kernel, open this code
/*
} else {
pt_kernel_context_->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(outs_vector[j],
out_def));
*/
}
} }
pt_kernel_context_->MutableOutputRangeAt(i) =
std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.outputs.size() is "
"`%d`.",
start_idx, pt_kernel_context_->OutputsSize()));
} }
pt_kernel_context_->AssignOutputRange(std::make_pair(start_idx, end_idx),
i);
} }
for (size_t i = 0; i < attr_names.size(); ++i) { for (size_t i = 0; i < attr_names.size(); ++i) {
......
...@@ -299,44 +299,28 @@ static void BuildDygraphPtenKernelContext( ...@@ -299,44 +299,28 @@ static void BuildDygraphPtenKernelContext(
size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second); size_t start_idx = (i == 0 ? 0 : kernel_ctx->InputRangeAt(i - 1).second);
size_t end_idx = start_idx + ins_vector.size(); size_t end_idx = start_idx + ins_vector.size();
auto current_vector_size = kernel_ctx->InputsSize();
// The current size of input/output in pt_kernel_context_ is at least equal
// the start_idx. For the reason of reusing the allocted of inputs or // If the memory needed is less than the current memory allocated, we will
// outputs in pt_kernel_context_, the current size of input/output can be // reuse the current memory by using ReMakePtenDenseTensorFromVar.
// greater then the index of which the tensort wanted to set to, so it will // Otherwise,we will create new storage.
// use ReMakePtenDenseTensorFromVar to make pten tensor. for (size_t offset = 0; offset < ins_vector.size(); ++offset) {
if (kernel_ctx->InputsSize() == start_idx) { const auto& variable = ins_vector[offset]->Var();
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_inputs; if (current_vector_size > start_idx + offset) {
for (const auto& var : ins_vector) { auto& input_ptr = kernel_ctx->MutableInputPtrAt(start_idx + offset);
const auto& variable = var->Var(); if (input_ptr == nullptr) {
tmp_inputs.emplace_back( input_ptr = experimental::MakePtenTensorBaseFromVar(variable, in_def);
experimental::MakePtenTensorBaseFromVar(variable, in_def)); } else {
}
kernel_ctx->EmplaceBackInputs(std::move(tmp_inputs));
} else if (kernel_ctx->InputsSize() > start_idx) {
size_t input_size = kernel_ctx->InputsSize();
for (size_t j = 0; j < ins_vector.size(); ++j) {
if (input_size > start_idx + j) {
experimental::ReMakePtenDenseTensorFromVar( experimental::ReMakePtenDenseTensorFromVar(
ins_vector[j]->Var(), in_def, variable, in_def, kernel_ctx->MutableInputAt<pten::DenseTensor>(
kernel_ctx->MutableInputAt<pten::DenseTensor>(start_idx + j)); start_idx + offset));
// TODO(chentianyu03): When multi input kernel, open this code
/*
} else {
kernel_ctx->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(ins_vector[j]->Var(),
in_def));
*/
} }
} else {
kernel_ctx->EmplaceBackInputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(variable, in_def));
} }
kernel_ctx->MutableInputRangeAt(i) = std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.inputs.size() is "
"`%d`.",
start_idx, kernel_ctx->InputsSize()));
} }
kernel_ctx->AssignInputRange(std::make_pair(start_idx, end_idx), i);
} }
for (size_t i = 0; i < output_names.size(); ++i) { for (size_t i = 0; i < output_names.size(); ++i) {
...@@ -345,44 +329,22 @@ static void BuildDygraphPtenKernelContext( ...@@ -345,44 +329,22 @@ static void BuildDygraphPtenKernelContext(
size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second); size_t start_idx = (i == 0 ? 0 : kernel_ctx->OutputRangeAt(i - 1).second);
size_t end_idx = start_idx + outs_vector.size(); size_t end_idx = start_idx + outs_vector.size();
auto current_vector_size = kernel_ctx->OutputsSize();
// The current size of input/output in pt_kernel_context_ is at least equal // If the memory needed is less than the current memory allocated, we will
// the start_idx. For the reason of reusing the allocted of inputs or // reuse the current memory by using ReMakePtenDenseTensorFromVar.
// outputs in pt_kernel_context_, the current size of input/output can be // Otherwise,we will create new storage.
// greater then the index of which the tensort wanted to set to, so it will for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
// use ReMakePtenDenseTensorFromVar to make pten tensor. if (current_vector_size > start_idx + offset) {
if (kernel_ctx->OutputsSize() == start_idx) { experimental::ReMakePtenDenseTensorFromVar(
paddle::SmallVector<std::shared_ptr<pten::TensorBase>> tmp_outputs; outs_vector[offset]->MutableVar(), out_def,
for (auto& var : outs_vector) { kernel_ctx->MutableOutputAt<pten::DenseTensor>(start_idx + offset));
auto* variable = var->MutableVar(); } else {
tmp_outputs.emplace_back( kernel_ctx->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(variable, out_def)); experimental::MakePtenTensorBaseFromVar(
} outs_vector[offset]->MutableVar(), out_def));
kernel_ctx->EmplaceBackOutputs(std::move(tmp_outputs));
} else if (kernel_ctx->OutputsSize() > start_idx) {
size_t output_size = kernel_ctx->OutputsSize();
for (size_t j = 0; j < outs_vector.size(); ++j) {
if (output_size > i + j) {
experimental::ReMakePtenDenseTensorFromVar(
outs_vector[j]->MutableVar(), out_def,
kernel_ctx->MutableOutputAt<pten::DenseTensor>(i + j));
// TODO(chentianyu03): When multi output kernel, open this code
/*
} else {
kernel_ctx->EmplaceBackOutputWithoutSetRange(
experimental::MakePtenTensorBaseFromVar(
outs_vector[j]->MutableVar(), out_def));
*/
}
} }
kernel_ctx->MutableOutputRangeAt(i) = std::make_pair(start_idx, end_idx);
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Error start index when trying to set new tensor to inputs, start "
"index is `%d`, but current pt_kernel_context_.outputs.size() is "
"`%d`.",
start_idx, kernel_ctx->OutputsSize()));
} }
kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i);
} }
for (size_t i = 0; i < attr_names.size(); ++i) { for (size_t i = 0; i < attr_names.size(); ++i) {
......
...@@ -104,14 +104,18 @@ class KernelContext { ...@@ -104,14 +104,18 @@ class KernelContext {
return static_cast<const TensorType&>(*(inputs_.at(idx))); return static_cast<const TensorType&>(*(inputs_.at(idx)));
} }
// Returns a mutable reference to the shared_ptr stored in input slot `idx`,
// so the caller can inspect the slot (e.g. test it against nullptr) or assign
// a new tensor pointer into it.
// The lookup goes through inputs_.at(idx).
// NOTE(review): presumably this throws on an out-of-range idx, like
// std::vector::at — confirm for the container type of inputs_.
std::shared_ptr<TensorBase>& MutableInputPtrAt(size_t idx) {
return inputs_.at(idx);
}
template <typename TensorType> template <typename TensorType>
std::vector<TensorType> InputBetween(size_t start, size_t end) const { std::vector<TensorType> MoveInputsBetween(size_t start, size_t end) {
std::vector<TensorType> v; std::vector<TensorType> v;
for (size_t i = start; i < end; ++i) { for (size_t i = start; i < end; ++i) {
auto t = std::dynamic_pointer_cast<TensorType>(inputs_.at(i)); auto t = std::dynamic_pointer_cast<TensorType>(inputs_.at(i));
v.emplace_back(std::move(*t.get())); v.emplace_back(std::move(*t.get()));
inputs_.at(i) = nullptr;
} }
return v; return v;
} }
...@@ -123,12 +127,32 @@ class KernelContext { ...@@ -123,12 +127,32 @@ class KernelContext {
return output_range_.at(idx); return output_range_.at(idx);
} }
std::pair<int, int>& MutableInputRangeAt(size_t idx) { void AssignInputRange(std::pair<int, int>&& range, size_t idx) {
return input_range_[idx]; if (idx < input_range_.size()) {
input_range_[idx] = range;
} else if (idx == input_range_.size()) {
input_range_.emplace_back(range);
} else {
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"Invalid idx when trying to set InputRange, "
"index is `%d`, it is greater than the size(%d) of InputRange.",
idx,
input_range_.size()));
}
} }
std::pair<int, int>& MutableOutputRangeAt(size_t idx) { void AssignOutputRange(std::pair<int, int>&& range, size_t idx) {
return output_range_[idx]; if (idx < output_range_.size()) {
output_range_[idx] = range;
} else if (idx == output_range_.size()) {
output_range_.emplace_back(range);
} else {
PADDLE_THROW(paddle::platform::errors::PreconditionNotMet(
"Invalid idx when trying to set OutputRange, "
"index is `%d`, it is greater than the size(%d) of OutputRange.",
idx,
output_range_.size()));
}
} }
template <typename TensorType> template <typename TensorType>
...@@ -165,8 +189,10 @@ class KernelContext { ...@@ -165,8 +189,10 @@ class KernelContext {
// Only deal with DenseTensor now // Only deal with DenseTensor now
void ClearData() { void ClearData() {
for (auto& in : inputs_) { for (auto& in : inputs_) {
CompatibleDenseTensorUtils::ClearStorage( if (in) {
static_cast<DenseTensor*>(in.get())); CompatibleDenseTensorUtils::ClearStorage(
static_cast<DenseTensor*>(in.get()));
}
} }
for (auto& out : outputs_) { for (auto& out : outputs_) {
CompatibleDenseTensorUtils::ClearStorage( CompatibleDenseTensorUtils::ClearStorage(
......
...@@ -88,26 +88,26 @@ using XPUContext = paddle::platform::XPUDeviceContext; ...@@ -88,26 +88,26 @@ using XPUContext = paddle::platform::XPUDeviceContext;
} \ } \
} }
#define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \ #define PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(tensor_type) \
template <typename... Tail> \ template <typename... Tail> \
struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> { \ struct KernelCallHelper<const std::vector<tensor_type>&, Tail...> { \
template <int dev_ctx_idx, \ template <int dev_ctx_idx, \
int in_idx, \ int in_idx, \
int attr_idx, \ int attr_idx, \
int out_idx, \ int out_idx, \
typename... PreviousArgs> \ typename... PreviousArgs> \
static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { \ static void Compute(KernelContext* ctx, PreviousArgs&... pargs) { \
static_assert(attr_idx == 0, \ static_assert(attr_idx == 0, \
"Kernel's Input should appear before Attributes."); \ "Kernel's Input should appear before Attributes."); \
static_assert(out_idx == 0, \ static_assert(out_idx == 0, \
"Kernel's Input should appear before Outputs."); \ "Kernel's Input should appear before Outputs."); \
const std::pair<int, int> range = ctx->InputRangeAt(in_idx); \ const std::pair<int, int> range = ctx->InputRangeAt(in_idx); \
std::vector<tensor_type> arg = std::move( \ std::vector<tensor_type> arg = std::move( \
ctx->InputBetween<tensor_type>(range.first, range.second)); \ ctx->MoveInputsBetween<tensor_type>(range.first, range.second)); \
KernelCallHelper<Tail...>:: \ KernelCallHelper<Tail...>:: \
template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>( \ template Compute<dev_ctx_idx, in_idx + 1, attr_idx, out_idx>( \
ctx, pargs..., arg); \ ctx, pargs..., arg); \
} \ } \
} }
#define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \ #define PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(attr_type) \
......
...@@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CPUContext& dev_ctx, ...@@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CPUContext& dev_ctx,
const std::vector<int64_t>& shape, const std::vector<int64_t>& shape,
DenseTensor* out) { DenseTensor* out) {
auto out_meta = InferMetaFromVecValue(x.meta(), shape); auto out_meta = InferMetaFromVecValue(x.meta(), shape);
if (&x == out) { if (x.data() == out->data() && x.numel() == out->numel()) {
out->Resize(out_meta.dims); out->Resize(out_meta.dims);
return; return;
} }
...@@ -185,3 +185,34 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid", ...@@ -185,3 +185,34 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid",
CPU, CPU,
ANY, ANY,
pten::ReshapeFromVectorValWithXShape) {} pten::ReshapeFromVectorValWithXShape) {}
// Registers pten::ReshapeFromDT under the kernel name "reshape2.host", passing
// CPU and ANY to the registration macro.
// NOTE(review): CPU / ANY are presumably the backend and layout — confirm
// against the macro definition.
// The body pins kernel input 1 to the CPU backend with INT32 data type.
// NOTE(review): input 1 is presumably the shape tensor — confirm against the
// kernel signature.
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host",
CPU,
ANY,
pten::ReshapeFromDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
// Registers pten::ReshapeFromDTWithXShape under "reshape2.host.mid" with the
// same CPU / ANY macro arguments; input 1 is again pinned to CPU + INT32.
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host.mid",
CPU,
ANY,
pten::ReshapeFromDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
// Registers pten::ReshapeFromVectorDT under "reshape2.mulhost"; input 1 is
// pinned to CPU + INT32.
// NOTE(review): only input index 1 is configured here; if the kernel takes a
// vector of tensors, verify that this covers every element.
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost",
CPU,
ANY,
pten::ReshapeFromVectorDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
// Registers pten::ReshapeFromVectorDTWithXShape under "reshape2.mulhost.mid";
// input 1 is pinned to CPU + INT32, as in the registrations above.
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost.mid",
CPU,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
...@@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CUDAContext& dev_ctx, ...@@ -51,7 +51,7 @@ void ReshapeFromVectorVal(const CUDAContext& dev_ctx,
const std::vector<int64_t>& shape, const std::vector<int64_t>& shape,
DenseTensor* out) { DenseTensor* out) {
auto out_meta = InferMetaFromVecValue(x.meta(), shape); auto out_meta = InferMetaFromVecValue(x.meta(), shape);
if (&x == out) { if (x.data() == out->data() && x.numel() == out->numel()) {
out->Resize(out_meta.dims); out->Resize(out_meta.dims);
return; return;
} }
...@@ -193,3 +193,35 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid", ...@@ -193,3 +193,35 @@ PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mid",
CUDA, CUDA,
ANY, ANY,
pten::ReshapeFromVectorValWithXShape) {} pten::ReshapeFromVectorValWithXShape) {}
// Registers pten::ReshapeFromDT under the kernel name "reshape2.host", passing
// CUDA and ANY to the registration macro.
// NOTE(review): CUDA / ANY are presumably the backend and layout — confirm
// against the macro definition.
// Although the registration uses CUDA, the body pins kernel input 1 to the
// CPU backend with INT32 data type.
// NOTE(review): input 1 is presumably a host-side shape tensor — confirm
// against the kernel signature.
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host",
CUDA,
ANY,
pten::ReshapeFromDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
// Registers pten::ReshapeFromDTWithXShape under "reshape2.host.mid" with the
// same CUDA / ANY macro arguments; input 1 is again pinned to CPU + INT32.
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.host.mid",
CUDA,
ANY,
pten::ReshapeFromDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
// Registers pten::ReshapeFromVectorDT under "reshape2.mulhost"; input 1 is
// pinned to CPU + INT32.
// NOTE(review): only input index 1 is configured here; if the kernel takes a
// vector of tensors, verify that this covers every element.
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost",
CUDA,
ANY,
pten::ReshapeFromVectorDT) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
// Registers pten::ReshapeFromVectorDTWithXShape under "reshape2.mulhost.mid";
// input 1 is pinned to CPU + INT32, as in the registrations above.
PT_REGISTER_KERNEL_WITH_NO_TYPE("reshape2.mulhost.mid",
CUDA,
ANY,
pten::ReshapeFromVectorDTWithXShape) {
kernel->InputAt(1).SetBackend(pten::Backend::CPU);
kernel->InputAt(1).SetDataType(paddle::experimental::DataType::INT32);
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册