“dcc239032362ef1d9a808021534d6293f2e3d08d”上不存在“...asr1/git@gitcode.net:paddlepaddle/DeepSpeech.git”
未验证 提交 01247e33 编写于 作者: H HongyuJia 提交者: GitHub

[Opt Performance] Optimize custom operator performance (#52597)

* [Opt Performance] Optimize custom operator performance, reconstruct python API auto-gen, add cache and use const inference

* opt AutoGradMeta implementation

* remove profiler codes

* fix unit test

* change year, 2021->2023

* fix int64_t parse bug
上级 90c3bddf
...@@ -236,7 +236,8 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>, ...@@ -236,7 +236,8 @@ RunCustomOpNode::operator()(paddle::small_vector<std::vector<paddle::Tensor>,
VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_ << "_grad"; VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_ << "_grad";
// handle inplace map // handle inplace map
ctx.MapPlainOutputs(grad_inputs_name, grad_outputs_names, grad_inplace_map); ctx.UpdatePlainOutputs(
grad_inputs_name, grad_outputs_names, grad_inplace_map);
(*paddle::OpMetaInfoHelper::GetKernelFn(kernel_map.at(op_type_)[1]))(&ctx); (*paddle::OpMetaInfoHelper::GetKernelFn(kernel_map.at(op_type_)[1]))(&ctx);
ctx.AssignInplaceOutputs(); ctx.AssignInplaceOutputs();
...@@ -443,7 +444,8 @@ RunCustomOpDoubleGradNode::operator()( ...@@ -443,7 +444,8 @@ RunCustomOpDoubleGradNode::operator()(
VLOG(7) << "Run Kernel of Grad Custom Op: " << name(); VLOG(7) << "Run Kernel of Grad Custom Op: " << name();
// handle inplace map // handle inplace map
ctx.MapPlainOutputs(grad_inputs_name, grad_outputs_names, grad_inplace_map); ctx.UpdatePlainOutputs(
grad_inputs_name, grad_outputs_names, grad_inplace_map);
(*paddle::OpMetaInfoHelper::GetKernelFn(kernel_map.at(op_type_)[2]))(&ctx); (*paddle::OpMetaInfoHelper::GetKernelFn(kernel_map.at(op_type_)[2]))(&ctx);
ctx.AssignInplaceOutputs(); ctx.AssignInplaceOutputs();
......
...@@ -28,6 +28,7 @@ limitations under the License. */ ...@@ -28,6 +28,7 @@ limitations under the License. */
#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/utils/global_utils.h"
#include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/attribute.h"
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/custom_operator_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/phi_utils.h"
...@@ -52,87 +53,6 @@ DECLARE_string(tensor_operants_mode); ...@@ -52,87 +53,6 @@ DECLARE_string(tensor_operants_mode);
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace detail {
// Dynamic-library symbol loader: resolves `name` from the already-opened
// library `handle` via dlsym and casts the symbol to function type T.
// Aborts with a NotFound error (carrying the platform error detail) when
// the symbol cannot be resolved.
template <typename T>
static T* DynLoad(void* handle, std::string name) {
  T* func = reinterpret_cast<T*>(dlsym(handle, name.c_str()));
#if !defined(_WIN32)
  // POSIX: dlerror() returns the message for the last dl* failure.
  auto errorno = dlerror();
#else
  // Windows: GetLastError() yields a numeric error code instead.
  auto errorno = GetLastError();
#endif  // !_WIN32
  PADDLE_ENFORCE_NOT_NULL(
      func,
      platform::errors::NotFound(
          "Failed to load dynamic operator library, error message(%s).",
          errorno));
  return func;
}
// True when `var_name` contains the vector-of-Tensor marker
// (kTensorVectorSuffix), i.e. the variable denotes std::vector<Tensor>.
inline static bool IsDuplicableVar(const std::string& var_name) {
  const std::string vector_mark(kTensorVectorSuffix);
  return var_name.rfind(vector_mark) != std::string::npos;
}
// True when `var_name` contains the optional marker (kOptionalSuffix).
inline static bool IsOptionalVar(const std::string& var_name) {
  const std::string optional_mark(kOptionalSuffix);
  return var_name.rfind(optional_mark) != std::string::npos;
}
// Strips the trailing grad suffix (kGradVarSuffix) from `var_name`.
// For double-grad cases, a kDoubleGradNewOutSuffix marker is removed
// first when present.
inline static std::string NoGrad(const std::string& var_name,
                                 bool is_double_grad = false) {
  std::string new_out_suffix = kDoubleGradNewOutSuffix;
  std::string tmp_var_name(var_name);
  if (is_double_grad &&
      (tmp_var_name.rfind(new_out_suffix) != std::string::npos)) {
    // Use the suffix's actual length instead of a hard-coded `4`, so this
    // stays correct if kDoubleGradNewOutSuffix ever changes.
    tmp_var_name =
        tmp_var_name.substr(0, tmp_var_name.size() - new_out_suffix.size());
  }
  return tmp_var_name.substr(0, tmp_var_name.size() - kGradVarSuffixSize);
}
// Decides whether `var_name` denotes a gradient variable.
inline static bool IsGradVar(const std::string& var_name, bool is_double_grad) {
  const std::string grad_mark(kGradVarSuffix);
  if (is_double_grad) {
    // In double-grad mode X@GRAD is NOT a grad var while X@GRAD@GRAD is,
    // so strip one grad suffix before looking for the marker.
    return NoGrad(var_name).rfind(grad_mark) != std::string::npos;
  }
  return var_name.rfind(grad_mark) != std::string::npos;
}
// True when `name` occurs in `vec`.
inline static bool IsMemberOf(const std::vector<std::string>& vec,
                              const std::string& name) {
  return std::any_of(vec.cbegin(), vec.cend(), [&name](const std::string& s) {
    return s == name;
  });
}
// Splits a custom-op attribute declaration of the form "<name>:<type>"
// into {name, type} (both trimmed of surrounding spaces).
// Aborts via PADDLE_ENFORCE_NE when no ':' is present.
static std::vector<std::string> ParseAttrStr(const std::string& attr) {
  auto split_pos = attr.find_first_of(":");
  PADDLE_ENFORCE_NE(split_pos,
                    std::string::npos,
                    platform::errors::InvalidArgument(
                        "Invalid attribute string format. Attribute string "
                        "format is `<name>:<type>`."));
  std::vector<std::string> rlt;
  // 1. name
  rlt.emplace_back(string::trim_spaces(attr.substr(0, split_pos)));
  // 2. type
  rlt.emplace_back(string::trim_spaces(attr.substr(split_pos + 1)));
  VLOG(3) << "attr name: " << rlt[0] << ", attr type str: " << rlt[1];
  return rlt;
}
} // namespace detail
////////////////// Kernel Define ////////////////////
// custom op kernel call function define // custom op kernel call function define
static void RunKernelFunc( static void RunKernelFunc(
const framework::ExecutionContext& ctx, const framework::ExecutionContext& ctx,
...@@ -355,7 +275,7 @@ static void RunKernelFunc( ...@@ -355,7 +275,7 @@ static void RunKernelFunc(
} }
// handle inplace map // handle inplace map
kernel_ctx.MapPlainOutputs(inputs, outputs, inplace_map); kernel_ctx.UpdatePlainOutputs(inputs, outputs, inplace_map);
func(&kernel_ctx); func(&kernel_ctx);
kernel_ctx.AssignInplaceOutputs(); kernel_ctx.AssignInplaceOutputs();
......
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/api/ext/op_meta_info.h"
namespace paddle {
namespace framework {
namespace detail {
// Dynamic-library symbol loader: resolves `name` from the already-opened
// library `handle` via dlsym and casts the symbol to function type T.
// Aborts with a NotFound error (carrying the platform error detail) when
// the symbol cannot be resolved.
template <typename T>
static T* DynLoad(void* handle, std::string name) {
  T* func = reinterpret_cast<T*>(dlsym(handle, name.c_str()));
#if !defined(_WIN32)
  // POSIX: dlerror() returns the message for the last dl* failure.
  auto errorno = dlerror();
#else
  // Windows: GetLastError() yields a numeric error code instead.
  auto errorno = GetLastError();
#endif  // !_WIN32
  PADDLE_ENFORCE_NOT_NULL(
      func,
      platform::errors::NotFound(
          "Failed to load dynamic operator library, error message(%s).",
          errorno));
  return func;
}
// True when `var_name` contains the vector-of-Tensor marker
// (kTensorVectorSuffix), i.e. the variable denotes std::vector<Tensor>.
inline static bool IsDuplicableVar(const std::string& var_name) {
  const std::string vector_mark(kTensorVectorSuffix);
  return var_name.rfind(vector_mark) != std::string::npos;
}
// True when `var_name` contains the optional marker (kOptionalSuffix).
inline static bool IsOptionalVar(const std::string& var_name) {
  const std::string optional_mark(kOptionalSuffix);
  return var_name.rfind(optional_mark) != std::string::npos;
}
// Strips the trailing grad suffix (kGradVarSuffix) from `var_name`.
// For double-grad cases, a kDoubleGradNewOutSuffix marker is removed
// first when present.
inline static std::string NoGrad(const std::string& var_name,
                                 bool is_double_grad = false) {
  std::string new_out_suffix = kDoubleGradNewOutSuffix;
  std::string tmp_var_name(var_name);
  if (is_double_grad &&
      (tmp_var_name.rfind(new_out_suffix) != std::string::npos)) {
    // Use the suffix's actual length instead of a hard-coded `4`, so this
    // stays correct if kDoubleGradNewOutSuffix ever changes.
    tmp_var_name =
        tmp_var_name.substr(0, tmp_var_name.size() - new_out_suffix.size());
  }
  return tmp_var_name.substr(0, tmp_var_name.size() - kGradVarSuffixSize);
}
// Decides whether `var_name` denotes a gradient variable.
inline static bool IsGradVar(const std::string& var_name, bool is_double_grad) {
  const std::string grad_mark(kGradVarSuffix);
  if (is_double_grad) {
    // In double-grad mode X@GRAD is NOT a grad var while X@GRAD@GRAD is,
    // so strip one grad suffix before looking for the marker.
    return NoGrad(var_name).rfind(grad_mark) != std::string::npos;
  }
  return var_name.rfind(grad_mark) != std::string::npos;
}
// True when `name` occurs in `vec`.
inline static bool IsMemberOf(const std::vector<std::string>& vec,
                              const std::string& name) {
  return std::any_of(vec.cbegin(), vec.cend(), [&name](const std::string& s) {
    return s == name;
  });
}
// Splits a custom-op attribute declaration of the form "<name>:<type>"
// into {name, type} (both trimmed of surrounding spaces).
// Aborts via PADDLE_ENFORCE_NE when no ':' is present.
static std::vector<std::string> ParseAttrStr(const std::string& attr) {
  auto split_pos = attr.find_first_of(":");
  PADDLE_ENFORCE_NE(split_pos,
                    std::string::npos,
                    platform::errors::InvalidArgument(
                        "Invalid attribute string format. Attribute string "
                        "format is `<name>:<type>`."));
  std::vector<std::string> rlt;
  // 1. name
  rlt.emplace_back(string::trim_spaces(attr.substr(0, split_pos)));
  // 2. type
  rlt.emplace_back(string::trim_spaces(attr.substr(split_pos + 1)));
  VLOG(3) << "attr name: " << rlt[0] << ", attr type str: " << rlt[1];
  return rlt;
}
} // namespace detail
} // namespace framework
} // namespace paddle
...@@ -33,6 +33,7 @@ typedef SSIZE_T ssize_t; ...@@ -33,6 +33,7 @@ typedef SSIZE_T ssize_t;
#include "paddle/fluid/eager/utils.h" #include "paddle/fluid/eager/utils.h"
#include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/custom_operator.h" #include "paddle/fluid/framework/custom_operator.h"
#include "paddle/fluid/framework/custom_operator_utils.h"
#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/phi_utils.h"
#include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/framework/python_headers.h"
#include "paddle/fluid/memory/allocation/allocator.h" #include "paddle/fluid/memory/allocation/allocator.h"
...@@ -43,6 +44,7 @@ typedef SSIZE_T ssize_t; ...@@ -43,6 +44,7 @@ typedef SSIZE_T ssize_t;
#include "paddle/fluid/pybind/eager.h" #include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/eager_utils.h" #include "paddle/fluid/pybind/eager_utils.h"
#include "paddle/fluid/pybind/exception.h" #include "paddle/fluid/pybind/exception.h"
#include "paddle/fluid/pybind/op_function_common.h"
#include "paddle/fluid/pybind/tensor_py.h" #include "paddle/fluid/pybind/tensor_py.h"
#include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/api/ext/op_meta_info.h"
#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/allocator.h"
...@@ -424,55 +426,6 @@ static void ConstructFwdAndBwdMap( ...@@ -424,55 +426,6 @@ static void ConstructFwdAndBwdMap(
} }
} }
// Normalizes attribute values received from Python to the C++ types the
// custom operator declared: Python values may arrive as bool/int while the
// op expects int or int64_t, so bool is widened to int, and bool/int are
// widened to int64_t. All other declared types pass through unchanged.
// NOTE(review): the declared type is extracted by searching for ": "
// (colon followed by a space), so this assumes each attrs_names entry is
// formatted "<name>: <type>" — confirm against the attr-string producer.
static std::vector<paddle::any> CastAttrsToTargetType(
    const std::vector<paddle::any>& src,
    const std::vector<std::string>& attrs_names) {
  std::vector<paddle::any> res;
  // Sizes must correspond element-wise; mismatch means the op registration
  // and the values handed in from Python disagree.
  PADDLE_ENFORCE_EQ(src.size(),
                    attrs_names.size(),
                    paddle::platform::errors::InvalidArgument(
                        "We Expected same size of attrs and attrs_name list, "
                        "if u got this error indicate your custom op setting "
                        "%s attrs, but you just give %s",
                        attrs_names.size(),
                        src.size()));
  for (size_t i = 0; i < src.size(); i++) {
    size_t end = attrs_names[i].find(": ");
    std::string type_name = attrs_names[i].substr(end + 2);
    if (type_name == "int") {
      // bool promotes to int; any other runtime type is rejected.
      if (src[i].type() == typeid(bool)) {
        res.emplace_back(static_cast<int>(paddle::any_cast<bool>(src[i])));
      } else if (src[i].type() == typeid(int)) {
        res.emplace_back(src[i]);
      } else {
        PADDLE_THROW(platform::errors::InvalidArgument(
            "Your No. %s attrs should only can be bool or int32, other type is "
            "forbidden for now but we got %s. Check your code first please",
            i,
            src[i].type().name()));
      }
    } else if (type_name == "int64_t") {
      // bool and int both widen to int64_t.
      if (src[i].type() == typeid(bool)) {
        res.emplace_back(static_cast<int64_t>(paddle::any_cast<bool>(src[i])));
      } else if (src[i].type() == typeid(int)) {
        res.emplace_back(static_cast<int64_t>(paddle::any_cast<int>(src[i])));
      } else if (src[i].type() == typeid(int64_t)) {
        res.emplace_back(src[i]);
      } else {
        PADDLE_THROW(platform::errors::InvalidArgument(
            "Your No. %s attrs should only can be bool or int32 or int64_t, "
            "other type is forbidden for now but we got %s. Check your code "
            "first please",
            i,
            src[i].type().name()));
      }
    } else {
      // Non-integral declared types require no conversion here.
      res.emplace_back(src[i]);
    }
  }
  return res;
}
static PyObject* eager_api_jit_function_call(PyObject* self, static PyObject* eager_api_jit_function_call(PyObject* self,
PyObject* args, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
...@@ -534,6 +487,25 @@ static PyObject* eager_api__get_custom_operator_inplace_reverse_idx( ...@@ -534,6 +487,25 @@ static PyObject* eager_api__get_custom_operator_inplace_reverse_idx(
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
// This function copies from function `EmptyTensorInitializer` with default
// parameters.
// Builds a named, zero-element FLOAT32 DenseTensor (shape {0}, null
// allocation) with fresh autograd meta and an accumulation grad node,
// suitable as a pre-constructed output slot for a custom operator.
static Tensor InitializedEmptyTensor() {
  // Zero-element shape; the kernel is expected to resize/fill it later.
  auto ddims = phi::make_ddim({0});
  auto tensor = paddle::Tensor();
  tensor.set_name(
      egr::Controller::Instance().GenerateUniqueName("generated_tensor"));
  auto autograd_meta = egr::EagerUtils::autograd_meta(&tensor);
  autograd_meta->SetPersistable(false);
  // DenseTensor with a null allocation: metadata only, no memory attached.
  std::shared_ptr<phi::DenseTensor> dense_tensor = nullptr;
  std::shared_ptr<phi::Allocation> allocation_ptr = nullptr;
  dense_tensor = std::make_shared<phi::DenseTensor>(
      allocation_ptr, phi::DenseTensorMeta(phi::DataType::FLOAT32, ddims));
  tensor.set_impl(dense_tensor);
  // Leaf-style grad node so gradients can accumulate into this tensor.
  autograd_meta->SetGradNode(
      std::make_shared<egr::GradNodeAccumulation>(autograd_meta));
  return tensor;
}
static PyObject* eager_api_run_custom_op(PyObject* self, static PyObject* eager_api_run_custom_op(PyObject* self,
PyObject* args, PyObject* args,
PyObject* kwargs) { PyObject* kwargs) {
...@@ -545,14 +517,11 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ...@@ -545,14 +517,11 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
VLOG(4) << "Initialize phi tensor operants successfully"; VLOG(4) << "Initialize phi tensor operants successfully";
} }
paddle::CustomOpKernelContext ctx = std::string op_type = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 0), 0);
CastPyArg2CustomOpKernelContext(PyTuple_GET_ITEM(args, 0), 0); VLOG(7) << "Get things from python for Custom Op: " << op_type;
std::string op_type = CastPyArg2AttrString(PyTuple_GET_ITEM(args, 1), 1); paddle::CustomOpKernelContext ctx;
bool trace_backward = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 2), 2);
{ {
eager_gil_scoped_release guard; eager_gil_scoped_release guard;
VLOG(7) << "Get things for python for Custom Op: " << op_type
<< ", trace_backward is: " << trace_backward;
auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap();
PADDLE_ENFORCE_NE( PADDLE_ENFORCE_NE(
meta_info_map.find(op_type), meta_info_map.find(op_type),
...@@ -562,40 +531,138 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ...@@ -562,40 +531,138 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
"created by LoadOpMetaInfoAndRegisterOp, please make " "created by LoadOpMetaInfoAndRegisterOp, please make "
"sure you registered your op first and try again. ", "sure you registered your op first and try again. ",
op_type)); op_type));
VLOG(7) << "Run Kernel of Custom Op: " << op_type;
// TODO(HongyuJia): Optimize Attrs Cast naming and implementation
std::vector<paddle::any> res_attrs = CastAttrsToTargetType(
ctx.Attrs(),
paddle::OpMetaInfoHelper::GetAttrs(meta_info_map.at(op_type)[0]));
ctx.EmplaceBackAttrs(res_attrs);
const auto& vec_map = meta_info_map.at(op_type); const auto& vec_map = meta_info_map.at(op_type);
const auto& inputs = paddle::OpMetaInfoHelper::GetInputs(vec_map[0]);
const auto& inputs = const auto& attrs = paddle::OpMetaInfoHelper::GetAttrs(vec_map[0]);
paddle::OpMetaInfoHelper::GetInputs(meta_info_map.at(op_type)[0]); const auto& outputs = paddle::OpMetaInfoHelper::GetOutputs(vec_map[0]);
const auto& outputs =
paddle::OpMetaInfoHelper::GetOutputs(meta_info_map.at(op_type)[0]);
const auto& inplace_map = const auto& inplace_map =
paddle::OpMetaInfoHelper::GetInplaceMap(meta_info_map.at(op_type)[0]); paddle::OpMetaInfoHelper::GetInplaceMap(vec_map[0]);
for (size_t i = 0; i < inputs.size(); ++i) {
const auto& input = inputs.at(i);
// Parse op_type first, so that use i + 1
PyObject* obj = PyTuple_GET_ITEM(args, i + 1);
// Emplace Py_None from python, this means optional inputs passed to C++,
// use one un-initialized tensor to indicate both Tensor and
// vector<Tensor> inputs.
if (obj == Py_None) {
VLOG(7) << "Custom operator add input " << input
<< " to CustomOpKernelContext. Add un-initialized tensor "
"because the optional input is None";
ctx.EmplaceBackInput(std::move(paddle::Tensor()));
continue;
}
if (paddle::framework::detail::IsDuplicableVar(input)) {
ctx.EmplaceBackInputs(std::move(CastPyArg2VectorOfTensor(obj, i + 1)));
VLOG(7) << "Custom operator add input " << input
<< " to CustomOpKernelContext. Add vector<Tensor> size = "
<< ctx.InputRangeAt(i).second - ctx.InputRangeAt(i).first;
} else {
ctx.EmplaceBackInput(std::move(CastPyArg2Tensor(obj, i + 1)));
VLOG(7) << "Custom operator add input " << input
<< " to CustomOpKernelContext. Add Tensor for general case.";
}
}
// Parse op_type and inputs first, so that use 1 + inputs.size() + i
int attr_start_idx = 1 + inputs.size();
for (size_t i = 0; i < attrs.size(); ++i) {
const auto& attr = attrs.at(i);
std::vector<std::string> attr_name_and_type =
paddle::framework::detail::ParseAttrStr(attr);
auto attr_type_str = attr_name_and_type[1];
VLOG(7) << "Custom operator add attrs " << attr_name_and_type[0]
<< " to CustomOpKernelContext. Attribute type = "
<< attr_type_str;
PyObject* obj = PyTuple_GET_ITEM(args, attr_start_idx + i);
if (attr_type_str == "bool") {
ctx.EmplaceBackAttr(CastPyArg2AttrBoolean(obj, attr_start_idx + i));
} else if (attr_type_str == "int") {
ctx.EmplaceBackAttr(CastPyArg2AttrInt(obj, attr_start_idx + i));
} else if (attr_type_str == "float") {
ctx.EmplaceBackAttr(CastPyArg2AttrFloat(obj, attr_start_idx + i));
} else if (attr_type_str == "int64_t") {
ctx.EmplaceBackAttr(CastPyArg2Long(obj, op_type, attr_start_idx + i));
} else if (attr_type_str == "std::string") {
ctx.EmplaceBackAttr(CastPyArg2AttrString(obj, attr_start_idx + i));
} else if (attr_type_str == "std::vector<int>") {
ctx.EmplaceBackAttr(CastPyArg2VectorOfInt(obj, attr_start_idx + i));
} else if (attr_type_str == "std::vector<float>") {
ctx.EmplaceBackAttr(CastPyArg2VectorOfFloat(obj, attr_start_idx + i));
} else if (attr_type_str == "std::vector<int64_t>") {
ctx.EmplaceBackAttr(CastPyArg2Longs(obj, op_type, attr_start_idx + i));
} else if (attr_type_str == "std::vector<std::string>") {
ctx.EmplaceBackAttr(CastPyArg2VectorOfString(obj, attr_start_idx + i));
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported `%s` type value as custom attribute now. "
"Supported data types include `bool`, `int`, `float`, "
"`int64_t`, `std::string`, `std::vector<int>`, "
"`std::vector<float>`, `std::vector<int64_t>`, "
"`std::vector<std::string>`, Please check whether "
"the attribute data type and data type string are matched.",
attr_type_str));
}
}
ctx.ConstructInplaceIndex(inputs, outputs, inplace_map);
const auto& inplace_reverse_idx_map = ctx.GetInplaceReverseIndexMap();
for (size_t out_idx = 0; out_idx < outputs.size(); ++out_idx) {
const auto& output = outputs.at(out_idx);
// inplace special case
if (inplace_reverse_idx_map.find(out_idx) !=
inplace_reverse_idx_map.end()) {
size_t in_idx = inplace_reverse_idx_map.at(out_idx);
const auto& input_range = ctx.InputRangeAt(in_idx);
const auto& input_tensor = ctx.InputAt(input_range.first);
// inplace optional [Tensor or vector<Tensor>], un-initialized tensor.
if (paddle::framework::detail::IsOptionalVar(output) &&
!input_tensor.initialized()) {
VLOG(7) << "Custom operator add output " << output
<< " to CustomOpKernelContext. Add un-initialized tensor "
"because the inplace optional input is None";
ctx.EmplaceBackOutput(std::move(paddle::Tensor()));
continue;
}
/// inplace vector<Tensor>, initialized tensor.
if (paddle::framework::detail::IsDuplicableVar(output)) {
std::vector<paddle::Tensor> empty_tensors;
size_t vector_size = input_range.second - input_range.first;
empty_tensors.resize(vector_size);
for (size_t i = 0; i < vector_size; ++i) {
empty_tensors[i] = InitializedEmptyTensor();
}
VLOG(7) << "Custom operator add output " << output
<< " to CustomOpKernelContext. Add vector<tensor> size = "
<< empty_tensors.size();
ctx.EmplaceBackOutputs(std::move(empty_tensors));
continue;
}
}
VLOG(7) << "Custom operator add output " << output
<< " to CustomOpKernelContext. Add initialized Tensor because "
"using general or inplace mechanism";
// general Tensor or inplace Tensor, initialized tensor.
ctx.EmplaceBackOutput(std::move(InitializedEmptyTensor()));
}
// handle inplace map // handle inplace map
ctx.MapPlainOutputs(inputs, outputs, inplace_map); ctx.UpdatePlainOutputs(inputs, outputs, inplace_map);
VLOG(7) << "Run Kernel of Custom Op: " << op_type;
(*paddle::OpMetaInfoHelper::GetKernelFn(vec_map[0]))(&ctx); (*paddle::OpMetaInfoHelper::GetKernelFn(vec_map[0]))(&ctx);
ctx.AssignInplaceOutputs(); ctx.AssignInplaceOutputs();
// handle optional None output when construct backward graph // handle optional None output when construct backward graph
for (size_t i = 0; i < ctx.OutputRange().size(); i++) { for (size_t i = 0; i < ctx.OutputRange().size(); i++) {
if (ctx.OutputRangeAt(i).first + 1 == ctx.OutputRangeAt(i).second) { if (ctx.OutputRangeAt(i).first + 1 == ctx.OutputRangeAt(i).second) {
size_t idx = ctx.OutputRangeAt(i).first; paddle::Tensor* out_tensor =
paddle::Tensor* out_tensor = ctx.MutableOutputAt(idx); ctx.MutableOutputAt(ctx.OutputRangeAt(i).first);
if (!out_tensor->initialized()) { if (!out_tensor->initialized()) {
PADDLE_ENFORCE( PADDLE_ENFORCE(
outputs.at(idx).find(paddle::kOptionalSuffix) != paddle::framework::detail::IsOptionalVar(outputs.at(i)),
std::string::npos,
phi::errors::InvalidArgument( phi::errors::InvalidArgument(
"Custom operator's %d-th output is not initialized. " "Custom operator's %d-th output is not initialized. "
"Please check your implementation again. If you are " "Please check your implementation again. If you are "
"using inplace optional output, then you must use " "using inplace optional output, then you must use "
"`paddle::Optional` to decorate this output", "`paddle::Optional` to decorate this output",
idx)); i));
// We can also consider using `autograd_meta` to tolerant nullptr. // We can also consider using `autograd_meta` to tolerant nullptr.
out_tensor->set_autograd_meta(std::make_shared<egr::AutogradMeta>()); out_tensor->set_autograd_meta(std::make_shared<egr::AutogradMeta>());
} }
...@@ -603,45 +670,37 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ...@@ -603,45 +670,37 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
} }
VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op"; VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op";
std::vector<std::vector<egr::AutogradMeta*>> ins_auto_grad_metas; size_t slot_ins_num = ctx.InputRange().size();
std::vector<std::vector<egr::AutogradMeta*>> outs_auto_grad_metas; size_t slot_outs_num = ctx.OutputRange().size();
VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size(); VLOG(7) << "We got slot num of ins is: " << slot_ins_num;
ins_auto_grad_metas.resize(ctx.InputRange().size()); VLOG(7) << "We got slot num of outs is: " << slot_outs_num;
VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); std::vector<egr::AutogradMeta*> ins_auto_grad_metas =
outs_auto_grad_metas.resize(ctx.OutputRange().size()); egr::EagerUtils::nullable_autograd_meta(*ctx.AllMutableInput());
std::vector<egr::AutogradMeta*> outs_auto_grad_metas =
for (size_t i = 0; i < ctx.InputRange().size(); i++) { egr::EagerUtils::unsafe_autograd_meta(*ctx.AllMutableOutput());
ins_auto_grad_metas[i] =
egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween(
ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second));
}
for (size_t i = 0; i < ctx.OutputRange().size(); i++) {
outs_auto_grad_metas[i] =
egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen(
ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second));
}
bool require_any_grad = false; bool require_any_grad = false;
for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { bool trace_backward = true;
for (size_t i = 0; i < ins_auto_grad_metas.size(); ++i) {
require_any_grad = require_any_grad =
require_any_grad || egr::EagerUtils::ComputeRequireGrad( require_any_grad || egr::EagerUtils::ComputeRequireGrad(
trace_backward, &(ins_auto_grad_metas[i])); trace_backward, ins_auto_grad_metas[i]);
} }
// handle inplace map // handle inplace map
for (size_t i = 0; i < ctx.InputRange().size(); i++) { if (!inplace_map.empty()) {
if (inplace_map.find(inputs[i]) != inplace_map.end()) { for (size_t i = 0; i < ctx.InputRange().size(); i++) {
size_t input_size = if (inplace_map.find(inputs[i]) == inplace_map.end()) {
ctx.InputRangeAt(i).second - ctx.InputRangeAt(i).first; continue;
size_t start_idx = ctx.InputRangeAt(i).first; }
for (size_t j = 0; j < input_size; j++) { const auto& input_pair = ctx.InputRangeAt(i);
egr::EagerUtils::CheckInplace(ctx.InputAt(start_idx + j), for (size_t j = input_pair.first; j < input_pair.second; j++) {
ins_auto_grad_metas[i][j], egr::EagerUtils::CheckInplace(
require_any_grad); ctx.InputAt(j), ins_auto_grad_metas[j], require_any_grad);
if (ctx.MutableInputAt(start_idx + j).defined()) { if (ctx.MutableInputAt(j).defined()) {
// Bump Inplace Version // Bump Inplace Version
ctx.MutableInputAt(start_idx + j).bump_inplace_version(); ctx.MutableInputAt(j).bump_inplace_version();
VLOG(3) << "Custom operator: Tensor(" VLOG(3) << "Custom operator: Tensor(" << ctx.InputAt(j).name()
<< ctx.InputAt(start_idx + j).name()
<< ") uses Inplace Strategy."; << ") uses Inplace Strategy.";
} }
} }
...@@ -651,45 +710,50 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ...@@ -651,45 +710,50 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
if (require_any_grad && (vec_map.size() > 1)) { if (require_any_grad && (vec_map.size() > 1)) {
VLOG(6) << " Construct Grad for Custom Op: " << op_type; VLOG(6) << " Construct Grad for Custom Op: " << op_type;
ConstructFwdAndBwdMap(vec_map, op_type); ConstructFwdAndBwdMap(vec_map, op_type);
for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { for (size_t i = 0; i < outs_auto_grad_metas.size(); ++i) {
egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i])); egr::EagerUtils::PassStopGradient(false, outs_auto_grad_metas[i]);
} }
// Note(HongyuJia): In dygraph eager mode, CheckInplace makes sure leaf // Note(HongyuJia): In dygraph eager mode, CheckInplace makes sure leaf
// nodes set stop_gradient=True. However, dygraph mode can also outputs // nodes set stop_gradient=True. However, dygraph mode can also outputs
// lead nodes' gradients (For example, we can get x.grad after x.add_(y)). // lead nodes' gradients (For example, we can get x.grad after x.add_(y)).
// To be consistent with dygraph mode, we have to PassStopGradient for all // To be consistent with dygraph mode, we have to PassStopGradient for all
// inplaced ins_auto_grad_metas. // inplaced ins_auto_grad_metas.
std::unordered_map<size_t, size_t> inplace_tensor_map = const auto& inplace_index_map = ctx.GetInplaceIndexMap();
ctx.GetInplaceTensorMap(); for (auto pair : inplace_index_map) {
for (auto pair : inplace_tensor_map) { const auto& size_pair = ctx.InputRangeAt(pair.first);
egr::EagerUtils::PassStopGradient(false, for (size_t i = size_pair.first; i < size_pair.second; ++i) {
&(ins_auto_grad_metas[pair.first])); egr::EagerUtils::PassStopGradient(false, ins_auto_grad_metas[i]);
}
} }
auto grad_node = std::make_shared<egr::RunCustomOpNode>( auto grad_node = std::make_shared<egr::RunCustomOpNode>(
outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type); slot_outs_num, slot_ins_num, op_type);
auto slot_map = const auto& slot_map =
egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type); egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type);
// Prepare Grad outputs // Prepare Grad outputs
size_t no_grad_cnt = 0; size_t no_grad_cnt = 0;
for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { for (size_t i = 0; i < slot_ins_num; i++) {
const std::vector<paddle::Tensor>& in_tensors = ctx.InputsBetween( const std::vector<paddle::Tensor>& in_tensors = ctx.InputsBetween(
ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second); ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second);
if (slot_map[0][0].find(i) != slot_map[0][0].end()) { if (slot_map[0][0].find(i) != slot_map[0][0].end()) {
grad_node->SetGradOutMeta(in_tensors, slot_map[0][0][i]); grad_node->SetGradOutMeta(in_tensors, slot_map[0][0].at(i));
} else { } else {
grad_node->SetGradOutMeta( grad_node->SetGradOutMeta(in_tensors, slot_ins_num - 1 - no_grad_cnt);
in_tensors, ins_auto_grad_metas.size() - 1 - no_grad_cnt);
no_grad_cnt++; no_grad_cnt++;
} }
} }
// Prepare Grad inputs with grad of fwd outputs // Prepare Grad inputs with grad of fwd outputs
for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { for (size_t i = 0; i < slot_outs_num; i++) {
const std::vector<paddle::Tensor>& out_tensors = ctx.OutputsBetweeen( const auto& size_pair = ctx.OutputRangeAt(i);
ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second); const std::vector<paddle::Tensor>& out_tensors =
ctx.OutputsBetweeen(size_pair.first, size_pair.second);
egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); for (size_t j = size_pair.first; j < size_pair.second; j++) {
egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); // SetOutRankWithSlot: slot_id = i, rank = j - size_pair.first
outs_auto_grad_metas[j]->SetSingleOutRankWithSlot(
i, j - size_pair.first);
egr::EagerUtils::SetHistory(outs_auto_grad_metas[j], grad_node);
}
grad_node->SetGradInMeta(out_tensors, i); grad_node->SetGradInMeta(out_tensors, i);
} }
...@@ -713,9 +777,8 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ...@@ -713,9 +777,8 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
ctx.InputRangeAt(it->first).second)); ctx.InputRangeAt(it->first).second));
} }
auto attrs_names = const std::vector<paddle::any>& res_attrs = ctx.Attrs();
paddle::OpMetaInfoHelper::GetAttrs(meta_info_map.at(op_type)[1]); std::vector<paddle::any> attrs(res_attrs.size());
std::vector<paddle::any> attrs(attrs_names.size());
// Prepare attrs for Grad node // Prepare attrs for Grad node
for (auto it = slot_map[0][4].begin(); it != slot_map[0][4].end(); it++) { for (auto it = slot_map[0][4].begin(); it != slot_map[0][4].end(); it++) {
VLOG(7) << "Prepare fwd attrs: " << it->first VLOG(7) << "Prepare fwd attrs: " << it->first
...@@ -725,7 +788,7 @@ static PyObject* eager_api_run_custom_op(PyObject* self, ...@@ -725,7 +788,7 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
grad_node->SetAttrs(attrs); grad_node->SetAttrs(attrs);
} }
} }
RETURN_PY_NONE return ToPyObject(*ctx.AllMutableOutput());
EAGER_CATCH_AND_THROW_RETURN_NULL EAGER_CATCH_AND_THROW_RETURN_NULL
} }
......
...@@ -56,7 +56,6 @@ extern PyTypeObject* g_cudapinnedplace_pytype; ...@@ -56,7 +56,6 @@ extern PyTypeObject* g_cudapinnedplace_pytype;
extern PyTypeObject* g_customplace_pytype; extern PyTypeObject* g_customplace_pytype;
extern PyTypeObject* g_framework_tensor_pytype; extern PyTypeObject* g_framework_tensor_pytype;
extern PyTypeObject* g_framework_lodtensorarray_pytype; extern PyTypeObject* g_framework_lodtensorarray_pytype;
extern PyTypeObject* g_custom_op_kernel_ctx_pytype;
extern PyTypeObject* g_jit_function_pytype; extern PyTypeObject* g_jit_function_pytype;
int TensorDtype2NumpyDtype(phi::DataType dtype) { int TensorDtype2NumpyDtype(phi::DataType dtype) {
...@@ -432,6 +431,54 @@ std::vector<size_t> CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos) { ...@@ -432,6 +431,54 @@ std::vector<size_t> CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos) {
return result; return result;
} }
std::vector<float> CastPyArg2VectorOfFloat(PyObject* obj, size_t arg_pos) {
std::vector<float> result;
if (PyList_Check(obj)) {
Py_ssize_t len = PyList_Size(obj);
PyObject* item = nullptr;
for (Py_ssize_t i = 0; i < len; i++) {
item = PyList_GetItem(obj, i);
if (PyObject_CheckFloatOrConvertToFloat(&item)) {
result.emplace_back(static_cast<float>(PyFloat_AsDouble(item)));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"argument (position %d) must be "
"list of float, but got %s at pos %d",
arg_pos + 1,
reinterpret_cast<PyTypeObject*>(item->ob_type)->tp_name,
i));
}
}
} else if (PyTuple_Check(obj)) {
Py_ssize_t len = PyTuple_Size(obj);
PyObject* item = nullptr;
for (Py_ssize_t i = 0; i < len; i++) {
item = PyTuple_GET_ITEM(obj, i);
if (PyObject_CheckFloatOrConvertToFloat(&item)) {
result.emplace_back(static_cast<float>(PyFloat_AsDouble(item)));
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"argument (position %d) must be "
"list of float, but got %s at pos %d",
arg_pos + 1,
reinterpret_cast<PyTypeObject*>(item->ob_type)->tp_name,
i));
}
}
} else if (obj == Py_None) {
return {};
} else if (PyObject_CheckFloatOrConvertToFloat(&obj)) {
return {static_cast<float>(PyFloat_AsDouble(obj))};
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"argument (position %d) must be "
"list of float, but got %s",
arg_pos + 1,
reinterpret_cast<PyTypeObject*>(obj->ob_type)->tp_name));
}
return result;
}
std::vector<std::vector<size_t>> CastPyArg2VectorOfVectorOfSize_t( std::vector<std::vector<size_t>> CastPyArg2VectorOfVectorOfSize_t(
PyObject* obj, size_t arg_pos) { PyObject* obj, size_t arg_pos) {
std::vector<std::vector<size_t>> result; std::vector<std::vector<size_t>> result;
...@@ -602,19 +649,6 @@ std::vector<std::string> CastPyArg2VectorOfString(PyObject* obj, ...@@ -602,19 +649,6 @@ std::vector<std::string> CastPyArg2VectorOfString(PyObject* obj,
} }
} }
paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj,
ssize_t arg_pos) {
if (PyObject_IsInstance(
obj, reinterpret_cast<PyObject*>(g_custom_op_kernel_ctx_pytype))) {
return ::pybind11::handle(obj).cast<paddle::CustomOpKernelContext>();
} else {
PADDLE_THROW(platform::errors::InvalidArgument(
"argument (position %d) must be CustomOpKernelContext, "
"but got %s",
arg_pos + 1,
reinterpret_cast<PyTypeObject*>(obj->ob_type)->tp_name));
}
}
PyObject* ToPyObject(bool value) { PyObject* ToPyObject(bool value) {
if (value) { if (value) {
Py_INCREF(Py_True); Py_INCREF(Py_True);
......
...@@ -57,8 +57,6 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos); ...@@ -57,8 +57,6 @@ int64_t CastPyArg2AttrLong(PyObject* obj, ssize_t arg_pos);
size_t CastPyArg2AttrSize_t(PyObject* obj, ssize_t arg_pos); size_t CastPyArg2AttrSize_t(PyObject* obj, ssize_t arg_pos);
float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos); float CastPyArg2AttrFloat(PyObject* obj, ssize_t arg_pos);
std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos); std::string CastPyArg2AttrString(PyObject* obj, ssize_t arg_pos);
paddle::CustomOpKernelContext CastPyArg2CustomOpKernelContext(PyObject* obj,
ssize_t arg_pos);
std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj, std::shared_ptr<imperative::VarBase> CastPyArg2VarBase(PyObject* obj,
ssize_t arg_pos); ssize_t arg_pos);
std::vector<paddle::Tensor> CastPyArg2VectorOfTensor(PyObject* obj, std::vector<paddle::Tensor> CastPyArg2VectorOfTensor(PyObject* obj,
...@@ -70,6 +68,7 @@ std::vector<phi::DenseTensor> CastPyArg2VectorOfTensorBase(PyObject* obj, ...@@ -70,6 +68,7 @@ std::vector<phi::DenseTensor> CastPyArg2VectorOfTensorBase(PyObject* obj,
std::vector<int> CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos); std::vector<int> CastPyArg2VectorOfInt(PyObject* obj, size_t arg_pos);
std::vector<int64_t> CastPyArg2VectorOfInt64(PyObject* obj, size_t arg_pos); std::vector<int64_t> CastPyArg2VectorOfInt64(PyObject* obj, size_t arg_pos);
std::vector<size_t> CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos); std::vector<size_t> CastPyArg2VectorOfSize_t(PyObject* obj, size_t arg_pos);
std::vector<float> CastPyArg2VectorOfFloat(PyObject* obj, size_t arg_pos);
std::vector<std::vector<size_t>> CastPyArg2VectorOfVectorOfSize_t( std::vector<std::vector<size_t>> CastPyArg2VectorOfVectorOfSize_t(
PyObject* obj, size_t arg_pos); PyObject* obj, size_t arg_pos);
framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj, framework::proto::VarType::Type CastPyArg2ProtoType(PyObject* obj,
......
...@@ -464,7 +464,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj, ...@@ -464,7 +464,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj,
for (Py_ssize_t i = 0; i < len; i++) { for (Py_ssize_t i = 0; i < len; i++) {
item = PyList_GetItem(obj, i); item = PyList_GetItem(obj, i);
if (PyObject_CheckLongOrToLong(&item)) { if (PyObject_CheckLongOrToLong(&item)) {
value.emplace_back(PyLong_AsLong(item)); value.emplace_back((int64_t)PyLong_AsLongLong(item));
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument (position %d) must be " "%s(): argument (position %d) must be "
...@@ -481,7 +481,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj, ...@@ -481,7 +481,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj,
for (Py_ssize_t i = 0; i < len; i++) { for (Py_ssize_t i = 0; i < len; i++) {
item = PyTuple_GetItem(obj, i); item = PyTuple_GetItem(obj, i);
if (PyObject_CheckLongOrToLong(&item)) { if (PyObject_CheckLongOrToLong(&item)) {
value.emplace_back(PyLong_AsLong(item)); value.emplace_back((int64_t)PyLong_AsLongLong(item));
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument (position %d) must be " "%s(): argument (position %d) must be "
...@@ -498,7 +498,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj, ...@@ -498,7 +498,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj,
for (Py_ssize_t i = 0; i < len; i++) { for (Py_ssize_t i = 0; i < len; i++) {
item = PySequence_GetItem(obj, i); item = PySequence_GetItem(obj, i);
if (PyObject_CheckLongOrToLong(&item)) { if (PyObject_CheckLongOrToLong(&item)) {
value.emplace_back(PyLong_AsLong(item)); value.emplace_back((int64_t)PyLong_AsLongLong(item));
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument (position %d) must be " "%s(): argument (position %d) must be "
...@@ -512,7 +512,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj, ...@@ -512,7 +512,7 @@ std::vector<int64_t> CastPyArg2Longs(PyObject* obj,
} else if (obj == Py_None) { } else if (obj == Py_None) {
return {}; return {};
} else if (PyObject_CheckLongOrToLong(&obj)) { } else if (PyObject_CheckLongOrToLong(&obj)) {
return {static_cast<int64_t>(PyLong_AsLong(obj))}; return {(int64_t)PyLong_AsLongLong(obj)};
} else { } else {
PADDLE_THROW(platform::errors::InvalidArgument( PADDLE_THROW(platform::errors::InvalidArgument(
"%s(): argument (position %d) must be " "%s(): argument (position %d) must be "
......
...@@ -1013,70 +1013,6 @@ PYBIND11_MODULE(libpaddle, m) { ...@@ -1013,70 +1013,6 @@ PYBIND11_MODULE(libpaddle, m) {
m.def("_promote_types_if_complex_exists", m.def("_promote_types_if_complex_exists",
&paddle::framework::PromoteTypesIfComplexExists); &paddle::framework::PromoteTypesIfComplexExists);
py::class_<paddle::CustomOpKernelContext> custom_op_kernel_ctx(
m, "CustomOpKernelContext", R"DOC()DOC");
g_custom_op_kernel_ctx_pytype =
reinterpret_cast<PyTypeObject *>(custom_op_kernel_ctx.ptr());
custom_op_kernel_ctx.def(py::init<>())
.def("add_inputs",
[](paddle::CustomOpKernelContext &self, const py::handle &input) {
PyObject *obj = input.ptr();
if (PyList_Check(obj) || PyTuple_Check(obj)) {
self.EmplaceBackInputs(
std::move(CastPyArg2VectorOfTensor(obj, 1)));
} else if (obj == Py_None) {
// Check optional Tensor, use one un-initialized tensor to
// indicate both Tensor and vector<Tensor> inputs
self.EmplaceBackInput(std::move(paddle::Tensor()));
} else {
self.EmplaceBackInput(std::move(CastPyArg2Tensor(obj, 1)));
}
})
.def("add_outputs",
[](paddle::CustomOpKernelContext &self, py::handle &outputs) {
PyObject *obj = outputs.ptr();
if (PyList_Check(obj) || PyTuple_Check(obj)) {
self.EmplaceBackOutputs(
std::move(CastPyArg2VectorOfTensor(obj, 1)));
} else {
self.EmplaceBackOutput(std::move(CastPyArg2Tensor(obj, 1)));
}
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, bool attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, int attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, float attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, int64_t attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self, const std::string &attr) {
self.EmplaceBackAttr(attr);
})
.def("add_attr",
[](paddle::CustomOpKernelContext &self,
const std::vector<int> &attr) { self.EmplaceBackAttr(attr); })
.def("add_attr",
[](paddle::CustomOpKernelContext &self,
const std::vector<float> &attr) { self.EmplaceBackAttr(attr); })
.def("add_attr",
[](paddle::CustomOpKernelContext &self,
const std::vector<int64_t> &attr) { self.EmplaceBackAttr(attr); })
.def("add_attr",
[](paddle::CustomOpKernelContext &self,
const std::vector<std::string> &attr) {
self.EmplaceBackAttr(attr);
});
py::class_<Variable>(m, "Variable", R"DOC(Variable Class. py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
All parameter, weight, gradient are variables in Paddle. All parameter, weight, gradient are variables in Paddle.
......
...@@ -119,6 +119,7 @@ class PADDLE_API CustomOpKernelContext { ...@@ -119,6 +119,7 @@ class PADDLE_API CustomOpKernelContext {
const Tensor& InputAt(size_t idx) const; const Tensor& InputAt(size_t idx) const;
std::vector<Tensor> InputsBetween(size_t start, size_t end) const; std::vector<Tensor> InputsBetween(size_t start, size_t end) const;
Tensor& MutableInputAt(size_t idx); Tensor& MutableInputAt(size_t idx);
std::vector<Tensor>* AllMutableInput();
paddle::optional<Tensor> OptionalInputAt(size_t idx); paddle::optional<Tensor> OptionalInputAt(size_t idx);
paddle::optional<std::vector<Tensor>> OptionalInputsBetween(size_t start, paddle::optional<std::vector<Tensor>> OptionalInputsBetween(size_t start,
size_t end); size_t end);
...@@ -144,13 +145,18 @@ class PADDLE_API CustomOpKernelContext { ...@@ -144,13 +145,18 @@ class PADDLE_API CustomOpKernelContext {
} }
// handle inplace map // handle inplace map
void MapPlainOutputs( void ConstructInplaceIndex(
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::unordered_map<std::string, std::string>& inplace_map);
void UpdatePlainOutputs(
const std::vector<std::string>& inputs, const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, const std::vector<std::string>& outputs,
const std::unordered_map<std::string, std::string>& inplace_map); const std::unordered_map<std::string, std::string>& inplace_map);
void AssignInplaceOutputs(); void AssignInplaceOutputs();
std::vector<Tensor*>* AllMutablePlainOutput(); std::vector<Tensor*>* AllMutablePlainOutput();
std::unordered_map<size_t, size_t> GetInplaceTensorMap(); std::unordered_map<size_t, size_t> GetInplaceIndexMap();
std::unordered_map<size_t, size_t> GetInplaceReverseIndexMap();
private: private:
// TODO(chenweihang): replaced be SmallVector // TODO(chenweihang): replaced be SmallVector
...@@ -159,7 +165,10 @@ class PADDLE_API CustomOpKernelContext { ...@@ -159,7 +165,10 @@ class PADDLE_API CustomOpKernelContext {
std::vector<paddle::any> attrs_; std::vector<paddle::any> attrs_;
// handle inplace map // handle inplace map
std::vector<Tensor*> plain_outputs_; std::vector<Tensor*> plain_outputs_;
std::unordered_map<size_t, size_t> inplace_tensor_map_; // {input: output}
std::unordered_map<size_t, size_t> inplace_idx_map_;
// {output: input}
std::unordered_map<size_t, size_t> inplace_reverse_idx_map_;
std::vector<std::pair<size_t, size_t>> input_range_; std::vector<std::pair<size_t, size_t>> input_range_;
std::vector<std::pair<size_t, size_t>> output_range_; std::vector<std::pair<size_t, size_t>> output_range_;
......
...@@ -103,6 +103,10 @@ Tensor& CustomOpKernelContext::MutableInputAt(size_t idx) { ...@@ -103,6 +103,10 @@ Tensor& CustomOpKernelContext::MutableInputAt(size_t idx) {
return inputs_.at(idx); return inputs_.at(idx);
} }
std::vector<Tensor>* CustomOpKernelContext::AllMutableInput() {
return &inputs_;
}
paddle::optional<Tensor> CustomOpKernelContext::OptionalInputAt(size_t idx) { paddle::optional<Tensor> CustomOpKernelContext::OptionalInputAt(size_t idx) {
if (!inputs_.at(idx).is_initialized()) { if (!inputs_.at(idx).is_initialized()) {
return paddle::none; return paddle::none;
...@@ -156,13 +160,15 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt( ...@@ -156,13 +160,15 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt(
return output_range_.at(idx); return output_range_.at(idx);
} }
// handle inplace mechanism void CustomOpKernelContext::ConstructInplaceIndex(
// Find out non-inplace output tensors.
// TODO(HongyuJia): Add cache for inplace_tensor_map_ to optimize performance
void CustomOpKernelContext::MapPlainOutputs(
const std::vector<std::string>& inputs, const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs, const std::vector<std::string>& outputs,
const std::unordered_map<std::string, std::string>& inplace_map) { const std::unordered_map<std::string, std::string>& inplace_map) {
// Cache inplace indices.
if (inplace_map.empty() || !inplace_idx_map_.empty()) {
VLOG(4) << "Custom opertor ConstructInplaceIndex no need to recompute.";
return;
}
for (size_t in_idx = 0; in_idx < inputs.size(); ++in_idx) { for (size_t in_idx = 0; in_idx < inputs.size(); ++in_idx) {
auto& input = inputs[in_idx]; auto& input = inputs[in_idx];
if (inplace_map.find(input) == inplace_map.end()) { if (inplace_map.find(input) == inplace_map.end()) {
...@@ -175,15 +181,26 @@ void CustomOpKernelContext::MapPlainOutputs( ...@@ -175,15 +181,26 @@ void CustomOpKernelContext::MapPlainOutputs(
"the input of `Inplace` again and make " "the input of `Inplace` again and make "
"sure you registered your op accurately. ", "sure you registered your op accurately. ",
input)); input));
inplace_tensor_map_[in_idx] = distance(outputs.begin(), out_iter); size_t out_idx = distance(outputs.begin(), out_iter);
inplace_idx_map_[in_idx] = out_idx;
inplace_reverse_idx_map_[out_idx] = in_idx;
}
VLOG(4) << "Custom opertor update inplace input-output map successfully.";
}
// Find out non-inplace output tensors.
void CustomOpKernelContext::UpdatePlainOutputs(
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
const std::unordered_map<std::string, std::string>& inplace_map) {
// Cache plain outputs vector.
if (!plain_outputs_.empty()) {
VLOG(4) << "Custom opertor UpdatePlainOutputs no need to recompute.";
return;
} }
ConstructInplaceIndex(inputs, outputs, inplace_map);
for (size_t i = 0; i < outputs.size(); ++i) { for (size_t i = 0; i < outputs.size(); ++i) {
if (std::any_of( if (inplace_reverse_idx_map_.find(i) != inplace_reverse_idx_map_.end()) {
inplace_tensor_map_.begin(),
inplace_tensor_map_.end(),
[i](std::unordered_map<size_t, size_t>::const_reference pair) {
return pair.second == i;
})) {
continue; continue;
} }
size_t output_start_idx = output_range_[i].first; size_t output_start_idx = output_range_[i].first;
...@@ -192,11 +209,12 @@ void CustomOpKernelContext::MapPlainOutputs( ...@@ -192,11 +209,12 @@ void CustomOpKernelContext::MapPlainOutputs(
plain_outputs_.push_back(&outputs_[idx]); plain_outputs_.push_back(&outputs_[idx]);
} }
} }
VLOG(4) << "Custom opertor update inplace input-output map successfully."; VLOG(4) << "Custom opertor update plain outputs map successfully.";
} }
// Assign input tensor to inplace output tensors. // Assign input tensor to inplace output tensors.
void CustomOpKernelContext::AssignInplaceOutputs() { void CustomOpKernelContext::AssignInplaceOutputs() {
for (auto pair : inplace_tensor_map_) { for (auto pair : inplace_idx_map_) {
size_t in_start_idx = input_range_[pair.first].first; size_t in_start_idx = input_range_[pair.first].first;
size_t in_end_idx = input_range_[pair.first].second; size_t in_end_idx = input_range_[pair.first].second;
size_t out_start_idx = output_range_[pair.second].first; size_t out_start_idx = output_range_[pair.second].first;
...@@ -213,15 +231,21 @@ void CustomOpKernelContext::AssignInplaceOutputs() { ...@@ -213,15 +231,21 @@ void CustomOpKernelContext::AssignInplaceOutputs() {
} }
VLOG(4) << "Custom opertor update inplace input-output tensor " VLOG(4) << "Custom opertor update inplace input-output tensor "
"successfully. Update map size = " "successfully. Update map size = "
<< inplace_tensor_map_.size(); << inplace_idx_map_.size();
} }
} }
std::vector<Tensor*>* CustomOpKernelContext::AllMutablePlainOutput() { std::vector<Tensor*>* CustomOpKernelContext::AllMutablePlainOutput() {
return &plain_outputs_; return &plain_outputs_;
} }
std::unordered_map<size_t, size_t> CustomOpKernelContext::GetInplaceIndexMap() {
return inplace_idx_map_;
}
std::unordered_map<size_t, size_t> std::unordered_map<size_t, size_t>
CustomOpKernelContext::GetInplaceTensorMap() { CustomOpKernelContext::GetInplaceReverseIndexMap() {
return inplace_tensor_map_; return inplace_reverse_idx_map_;
} }
////////////////////// Op Meta Info ////////////////////// ////////////////////// Op Meta Info //////////////////////
......
...@@ -1042,7 +1042,9 @@ def _gen_output_content( ...@@ -1042,7 +1042,9 @@ def _gen_output_content(
# ' ' * tab space * tab number # ' ' * tab space * tab number
indent = ' ' * 4 * 2 indent = ' ' * 4 * 2
inplace_idx = {v: k for k, v in inplace_reverse_idx.items()} inplace_idx = {v: k for k, v in inplace_reverse_idx.items()}
dynamic_content = "" dynamic_content = f"""
{indent}res = []
{indent}start_idx = 0"""
static_content = f""" static_content = f"""
{indent}ins = {{}} {indent}ins = {{}}
{indent}ins_map = {ins_map} {indent}ins_map = {ins_map}
...@@ -1065,10 +1067,11 @@ def _gen_output_content( ...@@ -1065,10 +1067,11 @@ def _gen_output_content(
lower_in_names = in_names[in_idx].split("@")[0].lower() lower_in_names = in_names[in_idx].split("@")[0].lower()
dynamic_content += f""" dynamic_content += f"""
{indent}if {lower_in_names} is not None: {indent}if {lower_in_names} is not None:
{indent} outs['{out_name}'] = [core.eager.Tensor() for _ in range(len({lower_in_names}))] {indent} res.append(outs[start_idx: start_idx + len({lower_in_names})])
{indent} start_idx += len({lower_in_names})
{indent}else: {indent}else:
{indent} outs['{out_name}'] = core.eager.Tensor() {indent} res.append(None)
{indent}ctx.add_outputs(outs['{out_name}'])""" {indent} start_idx += 1"""
static_content += f""" static_content += f"""
{indent}if {lower_in_names} is not None: {indent}if {lower_in_names} is not None:
{indent} outs['{out_name}'] = [helper.create_variable(dtype='float32') for _ in range(len({lower_in_names}))]""" {indent} outs['{out_name}'] = [helper.create_variable(dtype='float32') for _ in range(len({lower_in_names}))]"""
...@@ -1077,8 +1080,8 @@ def _gen_output_content( ...@@ -1077,8 +1080,8 @@ def _gen_output_content(
): # inplace vector<Tensor> output case ): # inplace vector<Tensor> output case
lower_in_names = in_names[in_idx].split("@")[0].lower() lower_in_names = in_names[in_idx].split("@")[0].lower()
dynamic_content += f""" dynamic_content += f"""
{indent}outs['{out_name}'] = [core.eager.Tensor() for _ in range(len({lower_in_names}))] {indent}res.append(outs[start_idx: start_idx + len({lower_in_names})])
{indent}ctx.add_outputs(outs['{out_name}'])""" {indent}start_idx += len({lower_in_names})"""
static_content += f""" static_content += f"""
{indent}outs['{out_name}'] = [helper.create_variable(dtype='float32') for _ in range(len({lower_in_names}))]""" {indent}outs['{out_name}'] = [helper.create_variable(dtype='float32') for _ in range(len({lower_in_names}))]"""
elif ( elif (
...@@ -1086,21 +1089,22 @@ def _gen_output_content( ...@@ -1086,21 +1089,22 @@ def _gen_output_content(
): # inplace optional Tensor output case, handle inplace None input ): # inplace optional Tensor output case, handle inplace None input
lower_in_names = in_names[in_idx].split("@")[0].lower() lower_in_names = in_names[in_idx].split("@")[0].lower()
dynamic_content += f""" dynamic_content += f"""
{indent}outs['{out_name}'] = core.eager.Tensor() {indent}if {lower_in_names} is not None:
{indent}ctx.add_outputs(outs['{out_name}'])""" {indent} res.append(outs[start_idx])
{indent}else:
{indent} res.append(None)
{indent}start_idx += 1"""
static_content += f""" static_content += f"""
{indent}if {lower_in_names} is not None: {indent}if {lower_in_names} is not None:
{indent} outs['{out_name}'] = helper.create_variable(dtype='float32')""" {indent} outs['{out_name}'] = helper.create_variable(dtype='float32')"""
else: # general/inplace Tensor output case else: # general/inplace Tensor output case
dynamic_content += f""" dynamic_content += f"""
{indent}outs['{out_name}'] = core.eager.Tensor() {indent}res.append(outs[start_idx])
{indent}ctx.add_outputs(outs['{out_name}'])""" {indent}start_idx += 1"""
static_content += f""" static_content += f"""
{indent}outs['{out_name}'] = helper.create_variable(dtype='float32')""" {indent}outs['{out_name}'] = helper.create_variable(dtype='float32')"""
dynamic_content += f""" dynamic_content += f"""
{indent}core.eager._run_custom_op(ctx, "{op_name}", True)
{indent}res = [outs[out_name] if isinstance(outs[out_name], list) or outs[out_name]._is_initialized() else None for out_name in outs_list]
{indent}return res[0] if len(res)==1 else res""" {indent}return res[0] if len(res)==1 else res"""
static_content += f""" static_content += f"""
...@@ -1134,7 +1138,7 @@ def _custom_api_content(op_name): ...@@ -1134,7 +1138,7 @@ def _custom_api_content(op_name):
API_TEMPLATE = textwrap.dedent( API_TEMPLATE = textwrap.dedent(
""" """
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.core import Tensor, CustomOpKernelContext from paddle.fluid.core import Tensor
from paddle.fluid.framework import _dygraph_tracer, in_dygraph_mode from paddle.fluid.framework import _dygraph_tracer, in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layer_helper import LayerHelper
...@@ -1146,11 +1150,7 @@ def _custom_api_content(op_name): ...@@ -1146,11 +1150,7 @@ def _custom_api_content(op_name):
# The output variable's dtype use default value 'float32', # The output variable's dtype use default value 'float32',
# and the actual dtype of output variable will be inferred in runtime. # and the actual dtype of output variable will be inferred in runtime.
if in_dygraph_mode(): if in_dygraph_mode():
ctx = CustomOpKernelContext() outs = core.eager._run_custom_op("{op_name}", {params_list})
for i in {in_names}:
ctx.add_inputs(i)
for j in {attr_names}:
ctx.add_attr(j)
{dynamic_content} {dynamic_content}
else: else:
{static_content} {static_content}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册