Unverified commit 04025237, authored by H HongyuJia, committed by GitHub

[CustomOP Inplace] Automap inplace dtype and shape, support vector<Tensor> output (#52114)

* [CustomOP Inplace] Automap inplace dtype and shape, prepare for vector<Tensor> output

* delete dtype,shape func of multi_inplace op

* [CustomOP Inplace] Automap inplace dtype and shape, support vector<Tensor> output
Parent 888a30c9
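The core effect of the automapping: an output listed in SetInplaceMap shares storage with its input, so the framework can now derive that output's dtype and shape directly from the mapped input, and the op no longer needs explicit InferShapeFn/InferDtypeFn registrations. A minimal registration sketch of the resulting pattern, mirroring the custom_add changes in the diff below:

#include "paddle/extension.h"

// In-place forward: x += y; output "Out" aliases input "X".
void AddForward(paddle::Tensor& x, const paddle::Tensor& y);  // NOLINT

PD_BUILD_OP(custom_add)
    .Inputs({"X", "Y"})
    .Outputs({"Out"})
    // "Out" reuses "X", so its dtype and shape are auto-mapped from "X";
    // SetInferShapeFn / SetInferDtypeFn are no longer required.
    .SetInplaceMap({{"X", "Out"}})
    .SetKernelFn(PD_KERNEL(AddForward));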
......@@ -518,6 +518,7 @@ static PyObject* eager_api_run_custom_op(PyObject* self,
"sure you registered your op first and try again. ",
op_type));
VLOG(7) << "Run Kernel of Custom Op: " << op_type;
// TODO(HongyuJia): Optimize Attrs Cast naming and implementation
std::vector<paddle::any> res_attrs = CastAttrsToTargetType(
ctx.Attrs(),
paddle::OpMetaInfoHelper::GetAttrs(meta_info_map.at(op_type)[0]));
......
......@@ -196,6 +196,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
template <typename... RemainingArgs>
struct ComputeCallHelper;
// Handle args for general Tensor input case
template <typename... Tail>
struct ComputeCallHelper<const Tensor&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
......@@ -209,6 +210,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
}
};
// Handle args for optional Tensor input case
template <typename... Tail>
struct ComputeCallHelper<const paddle::optional<paddle::Tensor>&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
......@@ -228,6 +230,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
}
};
// Handle args for general vector<Tensor> input case
template <typename... Tail>
struct ComputeCallHelper<const std::vector<Tensor>&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
......@@ -241,6 +244,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
}
};
// Handle args for optional vector<Tensor> input case
template <typename... Tail>
struct ComputeCallHelper<const paddle::optional<std::vector<paddle::Tensor>>&,
Tail...> {
......@@ -293,6 +297,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
// Used to be compatible with 2.3 released internal inplace interface, not
// recommended
// Handle args for compatible inplace case
template <typename... Tail>
struct ComputeCallHelper<Tensor*, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
......@@ -310,6 +315,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
// recommended
// TODO(chenweihang): What is the appropriate output form?
// std::vector<Tensor>*? or std::vector<Tensor*>? or std::vector<Tensor*>*
// Handle args for compatible inplace case
template <typename... Tail>
struct ComputeCallHelper<std::vector<Tensor*>, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
......@@ -323,7 +329,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
}
};
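For orientation, a signature-level sketch of the two single-tensor inplace styles these specializations dispatch on; it assumes only paddle/extension.h, and the kernel names are illustrative:

#include "paddle/extension.h"

// Legacy 2.3-compatible style, matched by the Tensor* specialization above
// (kept for compatibility, not recommended):
void LegacyAddForward(paddle::Tensor* x, const paddle::Tensor& y);

// Recommended style, matched by the Tensor& specialization below:
void AddForward(paddle::Tensor& x, const paddle::Tensor& y);  // NOLINT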
-  // Handle Tensor& for inplace case
+  // Handle args for inplace Tensor case
template <typename... Tail>
struct ComputeCallHelper<Tensor&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
......@@ -337,6 +343,20 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
}
};
// Handle args for inplace vector<Tensor> case
template <typename... Tail>
struct ComputeCallHelper<std::vector<Tensor>&, Tail...> {
template <int in_idx, int attr_idx, int out_idx, typename... PreviousArgs>
static void Compute(CustomOpKernelContext* ctx, PreviousArgs&... pargs) {
auto& range = ctx->InputRangeAt(in_idx);
auto arg = ctx->InputsBetween(range.first, range.second);
ComputeCallHelper<
Tail...>::template Compute<in_idx + 1, attr_idx, out_idx>(ctx,
pargs...,
arg);
}
};
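This new specialization is what lets a kernel receive its inplace inputs as one mutable vector; AddVectorForward in the test file further down is exactly such a consumer. A signature sketch, assuming paddle/extension.h:

#include <vector>
#include "paddle/extension.h"

// Matched by the std::vector<Tensor>& specialization above; every tensor in
// x may be modified in place.
void AddVectorForward(std::vector<paddle::Tensor>& x,  // NOLINT
                      const paddle::Tensor& y);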
template <int out_idx, typename T>
struct ComputeReturnHelper;
......@@ -739,6 +759,7 @@ class PADDLE_API OpMetaInfo {
std::vector<std::string> outputs_;
std::vector<std::string> attrs_;
std::unordered_map<std::string, std::string> inplace_map_;
std::unordered_map<std::string, std::string> inplace_reverse_map_;
// 2. func info
KernelFunc kernel_fn_{nullptr};
InferShapeFunc infer_shape_fn_{nullptr};
......@@ -767,6 +788,10 @@ class OpMetaInfoHelper {
const paddle::OpMetaInfo& info) {
return info.inplace_map_;
}
static const std::unordered_map<std::string, std::string>&
GetInplaceReverseMap(const paddle::OpMetaInfo& info) {
return info.inplace_reverse_map_;
}
static const KernelFunc& GetKernelFn(const paddle::OpMetaInfo& info) {
return info.kernel_fn_;
}
......
......@@ -134,6 +134,7 @@ const std::pair<size_t, size_t>& CustomOpKernelContext::OutputRangeAt(
// handle inplace mechanism
// Find out non-inplace output tensors.
// TODO(HongyuJia): Add cache for inplace_tensor_map_ to optimize performance
void CustomOpKernelContext::MapPlainOutputs(
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs,
......@@ -215,6 +216,9 @@ OpMetaInfo& OpMetaInfo::SetInplaceMap(
std::unordered_map<std::string, std::string>&& inplace_map) {
inplace_map_ =
std::forward<std::unordered_map<std::string, std::string>>(inplace_map);
for (const auto& pair : inplace_map_) {
inplace_reverse_map_[pair.second] = pair.first;
}
return *this;
}
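For intuition, a self-contained sketch of what SetInplaceMap now records: the reverse map lets the framework look up, for each output, which input's dtype and shape it should inherit (tensor names taken from the custom_multi_inplace op below):

#include <iostream>
#include <string>
#include <unordered_map>

int main() {
  // Forward inplace map, as passed to SetInplaceMap: input -> output.
  std::unordered_map<std::string, std::string> inplace_map = {
      {"X", "OutXY"}, {"A", "OutAB"}};

  // Reverse map built alongside it: output -> input. During meta inference,
  // an output found here simply inherits its input's dtype and shape.
  std::unordered_map<std::string, std::string> inplace_reverse_map;
  for (const auto& pair : inplace_map) {
    inplace_reverse_map[pair.second] = pair.first;
  }

  std::cout << inplace_reverse_map.at("OutXY") << std::endl;  // prints "X"
  std::cout << inplace_reverse_map.at("OutAB") << std::endl;  // prints "A"
  return 0;
}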
OpMetaInfo& OpMetaInfo::SetKernelFn(KernelFunc&& func) {
......
......@@ -19,18 +19,18 @@
#include "paddle/extension.h"
template <typename data_t>
-void add_forward_kernel(data_t* x_data, const data_t* y_data, int64_t numel) {
+void add_data_pointer(const data_t* x_data, data_t* out_data, int64_t numel) {
for (size_t i = 0; i < numel; ++i) {
-    x_data[i] += y_data[i];
+    out_data[i] += x_data[i];
}
}
template <typename data_t>
-void add_backward_kernel(data_t* y_grad_data,
-                         const data_t* out_grad_data,
+void assign_data_pointer(const data_t* x_data,
+                         data_t* out_data,
int64_t numel) {
for (size_t i = 0; i < numel; ++i) {
-    y_grad_data[i] = out_grad_data[i];
+    out_data[i] = x_data[i];
}
}
......@@ -54,21 +54,10 @@ void relu_backward_kernel(const data_t* out_data,
void AddForward(paddle::Tensor& x, const paddle::Tensor& y) { // NOLINT
PD_CHECK(x.place() == paddle::PlaceType::kCPU, "x must be a CPU Tensor.");
-  PD_DISPATCH_FLOATING_TYPES(x.type(), "AddForward", ([&] {
-    add_forward_kernel<data_t>(x.data<data_t>(),
-                               y.data<data_t>(),
-                               x.size());
-  }));
-}
-std::vector<paddle::DataType> AddInferDtype(const paddle::DataType& x_dtype,
-                                            const paddle::DataType& y_dtype) {
-  return {x_dtype};
-}
-std::vector<std::vector<int64_t>> AddInferShape(
-    const std::vector<int64_t>& x_shape, const std::vector<int64_t>& y_shape) {
-  return {x_shape};
+  PD_DISPATCH_FLOATING_TYPES(
+      x.type(), "AddForward", ([&] {
+        add_data_pointer<data_t>(y.data<data_t>(), x.data<data_t>(), x.size());
+      }));
}
std::vector<paddle::Tensor> AddBackward(const paddle::Tensor& x,
......@@ -81,8 +70,8 @@ std::vector<paddle::Tensor> AddBackward(const paddle::Tensor& x,
PD_DISPATCH_FLOATING_TYPES(
out_grad.type(), "AddBackward", ([&] {
-        add_backward_kernel<data_t>(
-            y_grad.data<data_t>(), out_grad.data<data_t>(), out_grad.size());
+        assign_data_pointer<data_t>(
+            out_grad.data<data_t>(), y_grad.data<data_t>(), out_grad.size());
}));
return {y_grad};
......@@ -92,9 +81,7 @@ PD_BUILD_OP(custom_add)
.Inputs({"X", "Y"})
.Outputs({"Out"})
.SetInplaceMap({{"X", "Out"}})
-    .SetKernelFn(PD_KERNEL(AddForward))
-    .SetInferShapeFn(PD_INFER_SHAPE(AddInferShape))
-    .SetInferDtypeFn(PD_INFER_DTYPE(AddInferDtype));
+    .SetKernelFn(PD_KERNEL(AddForward));
PD_BUILD_GRAD_OP(custom_add)
.Inputs({"X", "Y", paddle::Grad("Out")})
......@@ -102,6 +89,58 @@ PD_BUILD_GRAD_OP(custom_add)
.SetInplaceMap({{paddle::Grad("Out"), paddle::Grad("X")}})
.SetKernelFn(PD_KERNEL(AddBackward));
// out[i] = x[i] + y
void AddVectorForward(std::vector<paddle::Tensor>& x, // NOLINT
const paddle::Tensor& y) {
PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor.");
PD_DISPATCH_FLOATING_TYPES(y.type(), "AddVectorForward", ([&] {
for (size_t i = 0; i < x.size(); ++i) {
add_data_pointer<data_t>(y.data<data_t>(),
x[i].data<data_t>(),
y.size());
}
}));
}
// x_grad[i] = out_grad[i] (needs no code; handled automatically by the inplace map)
// y_grad = out_grad[0] + ... + out_grad[n - 1]
std::vector<paddle::Tensor> AddVectorBackward(
const std::vector<paddle::Tensor>& x,
const paddle::Tensor& y,
std::vector<paddle::Tensor>& out_grad) { // NOLINT
PD_CHECK(x[0].place() == paddle::PlaceType::kCPU,
"x[0] must be a CPU Tensor.");
PD_CHECK(y.place() == paddle::PlaceType::kCPU, "y must be a CPU Tensor.");
PD_CHECK(x.size() == out_grad.size(),
"x must have the same size as out_grad.");
paddle::Tensor y_grad = paddle::zeros(y.shape(), y.dtype(), y.place());
PD_DISPATCH_FLOATING_TYPES(
y.type(), "AddVectorBackward", ([&] {
// y_grad = out_grad[0] + ... + out_grad[n - 1]
for (size_t i = 0; i < out_grad.size(); ++i) {
add_data_pointer<data_t>(
out_grad[i].data<data_t>(), y_grad.data<data_t>(), y_grad.size());
}
}));
return {y_grad};
}
PD_BUILD_OP(custom_add_vec)
.Inputs({paddle::Vec("X"), "Y"})
.Outputs({paddle::Vec("Out")})
.SetInplaceMap({{paddle::Vec("X"), paddle::Vec("Out")}})
.SetKernelFn(PD_KERNEL(AddVectorForward));
PD_BUILD_GRAD_OP(custom_add_vec)
.Inputs({paddle::Vec("X"), "Y", paddle::Grad(paddle::Vec("Out"))})
.Outputs({paddle::Grad(paddle::Vec("X")), paddle::Grad("Y")})
.SetInplaceMap({{paddle::Grad(paddle::Vec("Out")),
paddle::Grad(paddle::Vec("X"))}})
.SetKernelFn(PD_KERNEL(AddVectorBackward));
void MultiInplaceForward(paddle::Tensor& x, // NOLINT
const paddle::Tensor& y,
paddle::Tensor& a, // NOLINT
......@@ -111,29 +150,11 @@ void MultiInplaceForward(paddle::Tensor& x, // NOLINT
PD_DISPATCH_FLOATING_TYPES(
x.type(), "MultiInplaceForward", ([&] {
-        add_forward_kernel<data_t>(
-            x.data<data_t>(), y.data<data_t>(), x.size());
-        add_forward_kernel<data_t>(
-            a.data<data_t>(), b.data<data_t>(), a.size());
+        add_data_pointer<data_t>(y.data<data_t>(), x.data<data_t>(), x.size());
+        add_data_pointer<data_t>(b.data<data_t>(), a.data<data_t>(), a.size());
}));
}
-std::vector<paddle::DataType> MultiInplaceInferDtype(
-    const paddle::DataType& x_dtype,
-    const paddle::DataType& y_dtype,
-    const paddle::DataType& a_dtype,
-    const paddle::DataType& b_dtype) {
-  return {x_dtype, a_dtype};
-}
-std::vector<std::vector<int64_t>> MultiInplaceInferShape(
-    const std::vector<int64_t>& x_shape,
-    const std::vector<int64_t>& y_shape,
-    const std::vector<int64_t>& a_shape,
-    const std::vector<int64_t>& b_shape) {
-  return {x_shape, a_shape};
-}
std::vector<paddle::Tensor> MultiInplaceBackward(
const paddle::Tensor& x,
const paddle::Tensor& y,
......@@ -151,11 +172,11 @@ std::vector<paddle::Tensor> MultiInplaceBackward(
PD_DISPATCH_FLOATING_TYPES(
outxy_grad.type(), "MultiInplaceBackward", ([&] {
-            add_backward_kernel<data_t>(y_grad.data<data_t>(),
-                                        outxy_grad.data<data_t>(),
+            assign_data_pointer<data_t>(outxy_grad.data<data_t>(),
+                                        y_grad.data<data_t>(),
                                         outxy_grad.size());
-            add_backward_kernel<data_t>(b_grad.data<data_t>(),
-                                        outab_grad.data<data_t>(),
+            assign_data_pointer<data_t>(outab_grad.data<data_t>(),
+                                        b_grad.data<data_t>(),
                                         outab_grad.size());
}));
......@@ -166,9 +187,7 @@ PD_BUILD_OP(custom_multi_inplace)
.Inputs({"X", "Y", "A", "B"})
.Outputs({"OutXY", "OutAB"})
.SetInplaceMap({{"X", "OutXY"}, {"A", "OutAB"}})
-    .SetKernelFn(PD_KERNEL(MultiInplaceForward))
-    .SetInferShapeFn(PD_INFER_SHAPE(MultiInplaceInferShape))
-    .SetInferDtypeFn(PD_INFER_DTYPE(MultiInplaceInferDtype));
+    .SetKernelFn(PD_KERNEL(MultiInplaceForward));
PD_BUILD_GRAD_OP(custom_multi_inplace)
.Inputs({"X", "Y", paddle::Grad("OutXY"), "A", "B", paddle::Grad("OutAB")})
......
......@@ -40,6 +40,54 @@ custom_inplace = load(
verbose=True,
)
# Temporarily assemble custom python API
import paddle.fluid.core as core
from paddle.fluid.core import CustomOpKernelContext
from paddle.fluid.framework import in_dygraph_mode
from paddle.fluid.layer_helper import LayerHelper
def custom_add_vec(x_vector, y):
# prepare inputs and outputs
attrs = {}
outs = {}
out_names = ["Out@VECTOR"]
# The output variable's dtype uses the default value 'float32';
# the actual dtype of the output variable will be inferred at runtime.
if in_dygraph_mode():
ctx = CustomOpKernelContext()
for i in [x_vector, y]:
ctx.add_inputs(i)
for out_name in out_names:
outs[out_name] = [core.eager.Tensor() for _ in range(len(x_vector))]
ctx.add_outputs(outs[out_name])
core.eager._run_custom_op(ctx, "custom_add_vec", True)
else:
ins = {}
for key, value in dict({"X@VECTOR": x_vector, "Y": y}).items():
# handle optional inputs
if value is not None:
ins[key] = value
helper = LayerHelper("custom_add_vec", **locals())
for out_name in out_names:
outs[out_name] = [
helper.create_variable(dtype='float32')
for _ in range(len(x_vector))
]
helper.append_op(
type="custom_add_vec", inputs=ins, outputs=outs, attrs=attrs
)
res = [outs[out_name] for out_name in out_names]
return res[0] if len(res) == 1 else res
# Set custom python API manually
custom_inplace.custom_add_vec = custom_add_vec
def inplace_dynamic_add(phi_func, device, dtype, np_x, np_y):
paddle.set_device(device)
......@@ -88,7 +136,89 @@ def inplace_static_add(func, device, dtype, np_x, np_y):
return x_v, out_v, x_grad_v, y_grad_v, out_grad_v
-def inplace_dynamic_relu(phi_func, device, dtype, np_x, np_y, np_z):
def inplace_dynamic_add_vector(phi_func, device, dtype, np_inputs, np_y):
paddle.set_device(device)
inputs = [
paddle.to_tensor(np_input, dtype=dtype, stop_gradient=True)
for np_input in np_inputs
]
y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False)
if phi_func:
out = custom_inplace.custom_add_vec(inputs, y)
else:
out = [x.add_(y) for x in inputs]
mean_out = paddle.mean(paddle.concat(out))
mean_out.backward()
return (
np.concatenate([input.numpy() for input in inputs]),
y.numpy(),
np.concatenate([o.numpy() for o in out]),
np.concatenate([input.grad.numpy() for input in inputs]),
y.grad.numpy(),
)
def inplace_static_add_vector(phi_func, device, dtype, np_inputs, np_y):
paddle.enable_static()
paddle.set_device(device)
with static.scope_guard(static.Scope()):
with static.program_guard(static.Program()):
x1 = static.data(
name="x1", shape=[None, np_inputs[0].shape[1]], dtype=dtype
)
x2 = static.data(
name="x2", shape=[None, np_inputs[1].shape[1]], dtype=dtype
)
y = static.data(name="y", shape=[None, np_y.shape[1]], dtype=dtype)
x1.stop_gradient = False
x2.stop_gradient = False
y.stop_gradient = False
if phi_func:
out = custom_inplace.custom_add_vec([x1, x2], y)
else:
out = [paddle.add(x1, y), paddle.add(x2, y)]
mean_out = paddle.mean(paddle.concat(out))
static.append_backward(mean_out)
exe = static.Executor()
exe.run(static.default_startup_program())
(
out0_v,
out1_v,
x1_grad_v,
x2_grad_v,
y_grad_v,
out0_grad_v,
out1_grad_v,
) = exe.run(
static.default_main_program(),
feed={
"x1": np_inputs[0].astype(dtype),
"x2": np_inputs[1].astype(dtype),
"y": np_y.astype(dtype),
},
fetch_list=[
out[0].name,
out[1].name,
x1.name + "@GRAD",
x2.name + "@GRAD",
y.name + "@GRAD",
out[0].name + "@GRAD",
out[1].name + "@GRAD",
],
)
paddle.disable_static()
return (
[out0_v, out1_v],
[x1_grad_v, x2_grad_v],
y_grad_v,
[out0_grad_v, out1_grad_v],
)
+def inplace_dynamic_relu_net(phi_func, device, dtype, np_x, np_y, np_z):
paddle.set_device(device)
x = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False)
y = paddle.to_tensor(np_y, dtype=dtype, stop_gradient=False)
......@@ -107,7 +237,7 @@ def inplace_dynamic_relu(phi_func, device, dtype, np_x, np_y, np_z):
return x.numpy(), y.numpy(), out.numpy(), x.grad.numpy(), y.grad.numpy()
-def inplace_static_relu(func, device, dtype, np_x, np_y, np_z):
+def inplace_static_relu_net(func, device, dtype, np_x, np_y, np_z):
paddle.enable_static()
paddle.set_device(device)
with static.scope_guard(static.Scope()):
......@@ -255,6 +385,10 @@ class TestCustomInplaceJit(unittest.TestCase):
self.np_z = np.random.random((3, 2)).astype("float32")
self.np_a = np.random.random((3, 2)).astype("float32")
self.np_b = np.random.random((3, 2)).astype("float32")
self.np_inputs = [
np.random.random((3, 2)).astype("float32"),
np.random.random((3, 2)).astype("float32"),
]
def check_output(self, out, pd_out, name):
np.testing.assert_array_equal(
......@@ -354,7 +488,79 @@ class TestCustomInplaceJit(unittest.TestCase):
self.check_output(phi_x_grad, pd_x_grad, "x_grad")
self.check_output(phi_y_grad, pd_y_grad, "y_grad")
-def test_static_multiple_inplace_relu(self):
def test_static_add_vector(self):
for device in self.devices:
for dtype in self.dtypes:
(
pd_out,
pd_x_grad,
pd_y_grad,
pd_out_grad,
) = inplace_static_add_vector(
True,
device,
dtype,
self.np_inputs,
self.np_y,
)
(
phi_out,
phi_x_grad,
phi_y_grad,
phi_out_grad,
) = inplace_static_add_vector(
False,
device,
dtype,
self.np_inputs,
self.np_y,
)
self.check_output(phi_out, pd_out, "out")
self.check_output(phi_x_grad, pd_x_grad, "x_grad")
self.check_output(phi_y_grad, pd_y_grad, "y_grad")
self.check_output(phi_out_grad, pd_out_grad, "out_grad")
def test_dynamic_add_vector(self):
for device in self.devices:
for dtype in self.dtypes:
(
pd_x,
pd_y,
pd_out,
pd_x_grad,
pd_y_grad,
) = inplace_dynamic_add_vector(
True,
device,
dtype,
self.np_inputs,
self.np_y,
)
(
phi_x,
phi_y,
phi_out,
phi_x_grad,
phi_y_grad,
) = inplace_dynamic_add_vector(
False,
device,
dtype,
self.np_inputs,
self.np_y,
)
self.check_output(phi_x, phi_out, "inplace_phi_x")
self.check_output(pd_x, pd_out, "inplace_pd_x")
self.check_output(phi_x, pd_x, "x")
self.check_output(phi_y, pd_y, "y")
self.check_output(phi_out, pd_out, "out")
self.check_output(phi_x_grad, pd_x_grad, "x_grad")
self.check_output(phi_y_grad, pd_y_grad, "y_grad")
+def test_static_relu_net(self):
for device in self.devices:
for dtype in self.dtypes:
(
......@@ -363,7 +569,7 @@ class TestCustomInplaceJit(unittest.TestCase):
pd_out,
pd_x_grad,
pd_y_grad,
-) = inplace_static_relu(
+) = inplace_static_relu_net(
paddle.nn.functional.relu,
device,
dtype,
......@@ -377,7 +583,7 @@ class TestCustomInplaceJit(unittest.TestCase):
phi_out,
phi_x_grad,
phi_y_grad,
-) = inplace_static_relu(
+) = inplace_static_relu_net(
custom_inplace.custom_relu_inplace,
device,
dtype,
......@@ -391,7 +597,7 @@ class TestCustomInplaceJit(unittest.TestCase):
self.check_output_allclose(phi_x_grad, pd_x_grad, "x_grad")
self.check_output_allclose(phi_y_grad, pd_y_grad, "y_grad")
-def test_dynamic_multiple_inplace_relu(self):
+def test_dynamic_relu_net(self):
for device in self.devices:
for dtype in self.dtypes:
(
......@@ -400,7 +606,7 @@ class TestCustomInplaceJit(unittest.TestCase):
pd_out,
pd_x_grad,
pd_y_grad,
-) = inplace_dynamic_relu(
+) = inplace_dynamic_relu_net(
False,
device,
dtype,
......@@ -414,7 +620,7 @@ class TestCustomInplaceJit(unittest.TestCase):
phi_out,
phi_x_grad,
phi_y_grad,
-) = inplace_dynamic_relu(
+) = inplace_dynamic_relu_net(
True,
device,
dtype,
......