Unverified commit cce176bf, authored by Chen Weihang, committed by GitHub

[Phi] add stack yaml and adapt eager mode (#41334)

* add stack yaml

* add stack yaml

* add stack yaml

* add no_need_buffer

* refine no_need_buffer declare

* remove original grad infershape

* revert stack op
Parent 69b79e6f
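In short: the commit adds a stack entry to the Phi api.yaml/backward.yaml, routes paddle.stack through the new eager (final-state) dygraph path, and implements the backward via stack_grad_impl plus a new StackGradInferMeta. A minimal sketch of the user-visible behavior this enables (illustration only, assuming an eager-mode Paddle build; not part of the diff):

import paddle

xs = [paddle.to_tensor([1.0, 2.0]), paddle.to_tensor([3.0, 4.0])]
for t in xs:
    t.stop_gradient = False
out = paddle.stack(xs, axis=0)   # eager mode now dispatches to _C_ops.final_state_stack
out.sum().backward()             # backward runs the new stack_grad path
print(xs[0].grad)                # -> [1., 1.]: out_grad sliced back along axis 0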
@@ -32,51 +32,7 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
-// TODO(chenweihang): the original sum grad op can support higher-level
-// differentiation,
-// but if we use this impl, it will not support. We need to be able to reuse
-// the autograd API here, which is not yet implemented
-// TODO(chenweihang): we should support call generated api in custom api impl
-std::vector<Tensor> add_n_grad_impl(const std::vector<Tensor>& x,
-                                    const Tensor& out_grad) {
-  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
-  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
-
-  Backend kernel_backend = kernel_key.backend();
-  DataLayout kernel_layout = kernel_key.layout();
-  DataType kernel_data_type = kernel_key.dtype();
-
-  auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
-      "scale", {kernel_backend, kernel_layout, kernel_data_type});
-  VLOG(6) << "add_n_grad API kernel key: [" << kernel_backend << ", "
-          << kernel_layout << ", " << kernel_data_type << "]";
-  VLOG(6) << "add_n_grad API kernel: " << kernel;
-
-  auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
-
-  auto dense_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
-
-  size_t out_number = x.size();
-  std::vector<Tensor> x_grad;
-  auto dense_x_grad = SetKernelOutput(out_number, kernel_backend, &x_grad);
-
-  using kernel_signature = void (*)(const platform::DeviceContext&,
-                                    const phi::DenseTensor&,
-                                    const phi::Scalar&,
-                                    float,
-                                    bool,
-                                    phi::DenseTensor*);
-  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
-
-  for (auto* dense_x_grad_t : dense_x_grad) {
-    phi::MetaTensor meta_out(dense_x_grad_t);
-    phi::UnchangedInferMeta(MakeMetaTensor(*dense_out_grad), &meta_out);
-    (*kernel_fn)(
-        *dev_ctx, *dense_out_grad, phi::Scalar(1.0), 0.0, true, dense_x_grad_t);
-  }
-
-  return x_grad;
-}
+////////////////// Forward api impls //////////////////////
 
 Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) {
   auto kernel_key_set = ParseKernelKeyByInputArgs(x);
@@ -167,6 +123,54 @@ std::vector<Tensor> split_impl(const Tensor& x,
   return out;
 }
 
+////////////////// Backward(grad) api impls //////////////////////
+
+// TODO(chenweihang): the original sum grad op can support higher-level
+// differentiation,
+// but if we use this impl, it will not support. We need to be able to reuse
+// the autograd API here, which is not yet implemented
+// TODO(chenweihang): we should support call generated api in custom api impl
+std::vector<Tensor> add_n_grad_impl(const std::vector<Tensor>& x,
+                                    const Tensor& out_grad) {
+  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
+  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
+
+  Backend kernel_backend = kernel_key.backend();
+  DataLayout kernel_layout = kernel_key.layout();
+  DataType kernel_data_type = kernel_key.dtype();
+
+  auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
+      "scale", {kernel_backend, kernel_layout, kernel_data_type});
+  VLOG(6) << "add_n_grad API kernel key: [" << kernel_backend << ", "
+          << kernel_layout << ", " << kernel_data_type << "]";
+  VLOG(6) << "add_n_grad API kernel: " << kernel;
+
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
+
+  auto dense_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
+
+  size_t out_number = x.size();
+  std::vector<Tensor> x_grad;
+  auto dense_x_grad = SetKernelOutput(out_number, kernel_backend, &x_grad);
+
+  using kernel_signature = void (*)(const platform::DeviceContext&,
+                                    const phi::DenseTensor&,
+                                    const phi::Scalar&,
+                                    float,
+                                    bool,
+                                    phi::DenseTensor*);
+  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+
+  for (auto* dense_x_grad_t : dense_x_grad) {
+    phi::MetaTensor meta_out(dense_x_grad_t);
+    phi::UnchangedInferMeta(MakeMetaTensor(*dense_out_grad), &meta_out);
+    (*kernel_fn)(
+        *dev_ctx, *dense_out_grad, phi::Scalar(1.0), 0.0, true, dense_x_grad_t);
+  }
+
+  return x_grad;
+}
 
 std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> batch_norm_impl(
     const Tensor& x,
     const Tensor& scale,
@@ -361,5 +365,50 @@ std::vector<Tensor> concat_grad_impl(const std::vector<Tensor>& x,
   return x_grad;
 }
 
+std::vector<Tensor> stack_grad_impl(const std::vector<Tensor>& x,
+                                    const Tensor& out_grad,
+                                    int axis) {
+  auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad);
+  auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey();
+
+  Backend kernel_backend = kernel_key.backend();
+  DataLayout kernel_layout = kernel_key.layout();
+  DataType kernel_data_type = kernel_key.dtype();
+
+  auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
+      "stack_grad", {kernel_backend, kernel_layout, kernel_data_type});
+  VLOG(6) << "stack_grad API kernel key: [" << kernel_backend << ", "
+          << kernel_layout << ", " << kernel_data_type << "]";
+  VLOG(6) << "stack_grad API kernel: " << kernel;
+
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_backend);
+
+  auto dense_out_grad = PrepareData(out_grad, kernel.InputAt(0), {});
+
+  size_t out_number = x.size();
+  std::vector<Tensor> x_grad;
+  auto dense_x_grad = SetKernelOutput(out_number, kernel_backend, &x_grad);
+
+  std::vector<phi::MetaTensor> meta_x_grad;
+  meta_x_grad.reserve(out_number);
+  std::vector<phi::MetaTensor*> meta_x_grad_ptrs;
+  meta_x_grad_ptrs.reserve(out_number);
+  for (size_t i = 0; i < out_number; ++i) {
+    meta_x_grad.push_back(dense_x_grad[i]);
+    meta_x_grad_ptrs.push_back(&meta_x_grad.back());
+  }
+  phi::StackGradInferMeta(
+      MakeMetaTensor(*dense_out_grad), axis, meta_x_grad_ptrs);
+
+  using kernel_signature = void (*)(const platform::DeviceContext&,
+                                    const phi::DenseTensor&,
+                                    int axis,
+                                    std::vector<phi::DenseTensor*>);
+  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+  (*kernel_fn)(*dev_ctx, *dense_out_grad, axis, dense_x_grad);
+
+  return x_grad;
+}
 
 } // namespace experimental
 } // namespace paddle
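A note on add_n_grad_impl above: the gradient of a sum passes out_grad through to every input unchanged, which is why the impl reuses the existing scale kernel with scale=1.0 and bias=0.0 instead of a dedicated add_n_grad kernel (see the first TODO). A NumPy sketch of the rule being computed (illustration only, not Paddle code):

import numpy as np

# y = x0 + x1 + x2  =>  each dxi = dy
dy = np.array([0.5, -1.0, 2.0])
dx = [1.0 * dy + 0.0 for _ in range(3)]   # per input: scale(dy, scale=1.0, bias=0.0)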
@@ -22,8 +22,10 @@ limitations under the License. */
 namespace paddle {
 namespace experimental {
 
-std::vector<Tensor> add_n_grad_impl(const std::vector<Tensor>& x,
-                                    const Tensor& out_grad);
+// NOTE: Separate forward and backward(grad) api impl
+// NOTE: The api_impl in this file are arranged in alphabetic order.
+
+////////////////// Forward api impls //////////////////////
 
 Tensor copy_to_impl(const Tensor& x, Place place, bool blocking);
@@ -31,6 +33,11 @@ std::vector<Tensor> split_impl(const Tensor& x,
                                const IntArray& num_or_sections,
                                const Scalar& axis);
 
+////////////////// Backward(grad) api impls //////////////////////
+
+std::vector<Tensor> add_n_grad_impl(const std::vector<Tensor>& x,
+                                    const Tensor& out_grad);
+
 std::tuple<Tensor, Tensor, Tensor, Tensor, Tensor, Tensor> batch_norm_impl(
     const Tensor& x,
     const Tensor& scale,
@@ -49,5 +56,9 @@ std::vector<Tensor> concat_grad_impl(const std::vector<Tensor>& x,
                                      const Tensor& out_grad,
                                      const Scalar& axis);
 
+std::vector<Tensor> stack_grad_impl(const std::vector<Tensor>& x,
+                                    const Tensor& out_grad,
+                                    int axis);
+
 } // namespace experimental
 } // namespace paddle
@@ -375,4 +375,45 @@ void ScatterNdAddGradInferMeta(const MetaTensor& index,
   }
 }
 
+void StackGradInferMeta(const MetaTensor& out_grad,
+                        int axis,
+                        std::vector<MetaTensor*> x_grad) {
+  auto dy_dim = out_grad.dims();
+  int rank = dy_dim.size();
+  PADDLE_ENFORCE_GE(
+      axis,
+      -rank,
+      phi::errors::InvalidArgument(
+          "Attr(axis) must be inside [-rank, rank), where rank = %d, "
+          "but received axis is:%d.",
+          rank,
+          axis));
+  PADDLE_ENFORCE_LT(
+      axis,
+      rank,
+      phi::errors::InvalidArgument(
+          "Attr(axis) must be inside [-rank, rank), where rank = %d, "
+          "but received axis is:%d.",
+          rank,
+          axis));
+
+  if (axis < 0) axis += rank;
+  PADDLE_ENFORCE_LE(
+      x_grad.size(),
+      static_cast<size_t>(dy_dim[axis]),
+      phi::errors::InvalidArgument(
+          "Number of Outputs(X@Grad) should be less than or equal to dy dim "
+          "at axis, but received outputs size is:%d, dy dims is:%d.",
+          x_grad.size(),
+          static_cast<size_t>(dy_dim[axis])));
+
+  auto vec = phi::vectorize<int>(dy_dim);
+  vec.erase(vec.begin() + axis);
+  for (size_t i = 0; i < x_grad.size(); ++i) {
+    x_grad[i]->set_dims(phi::make_ddim(vec));
+    x_grad[i]->set_dtype(out_grad.dtype());
+  }
+}
+
 } // namespace phi
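The shape rule above, mirrored in plain Python for readability: every x_grad[i] gets out_grad's shape with the stacked axis removed, and out_grad's dtype. A standalone sketch, assuming inputs that would pass the PADDLE_ENFORCE checks:

def stack_grad_shapes(dy_shape, axis, num_outputs):
    rank = len(dy_shape)
    assert -rank <= axis < rank, "axis must be inside [-rank, rank)"
    if axis < 0:
        axis += rank                       # normalize, as the C++ does
    assert num_outputs <= dy_shape[axis]   # no more grads than stacked slices
    out_shape = dy_shape[:axis] + dy_shape[axis + 1:]
    return [out_shape] * num_outputs

print(stack_grad_shapes((4, 3, 2), 1, 3))  # -> [(4, 2), (4, 2), (4, 2)]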
@@ -163,4 +163,8 @@ void ScatterNdAddGradInferMeta(const MetaTensor& index,
                                MetaTensor* x_grad,
                                MetaTensor* updates_grad);
 
+void StackGradInferMeta(const MetaTensor& out_grad,
+                        int axis,
+                        std::vector<MetaTensor*> x_grad);
+
 } // namespace phi
@@ -10309,7 +10309,10 @@ def stack(x, axis=0, name=None):
     """
     axis = 0 if axis is None else axis
-    if _non_static_mode():
+    if in_dygraph_mode():
+        return _C_ops.final_state_stack(x, axis)
+    if _in_legacy_dygraph():
         return _C_ops.stack(x, 'axis', axis)
 
     if not isinstance(x, list) and not isinstance(x, tuple):
......
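The resulting dispatch in stack() is three-way. A tiny runnable model of it (the boolean flags are stand-ins for Paddle's mode checks, not real APIs):

def stack_dispatch(in_dygraph, in_legacy_dygraph):
    if in_dygraph:
        return "final_state_stack"    # new eager path added by this commit
    if in_legacy_dygraph:
        return "legacy stack op"      # old imperative path, kept for compatibility
    return "static-graph stack op"    # program-building path, unchanged

assert stack_dispatch(True, False) == "final_state_stack"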
@@ -40,6 +40,7 @@ class TestStackOpBase(OpTest):
         self.initDefaultParameters()
         self.initParameters()
         self.op_type = 'stack'
+        self.python_api = paddle.stack
         self.x = []
         for i in range(self.num_inputs):
             self.x.append(
@@ -55,20 +56,20 @@ class TestStackOpBase(OpTest):
         self.attrs = {'axis': self.axis}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(self.get_x_names(), 'Y')
+        self.check_grad(self.get_x_names(), 'Y', check_eager=True)
 
 class TestStackOp1(TestStackOpBase):
     def initParameters(self):
-        self.num_inputs = 16
+        self.num_inputs = 8
 
 class TestStackOp2(TestStackOpBase):
     def initParameters(self):
-        self.num_inputs = 20
+        self.num_inputs = 10
 
 class TestStackOp3(TestStackOpBase):
@@ -111,6 +112,7 @@ class TestStackBF16Op(OpTest):
         self.initDefaultParameters()
         self.initParameters()
         self.op_type = 'stack'
+        self.python_api = paddle.stack
         self.x = []
         for i in range(self.num_inputs):
             self.x.append(
@@ -128,10 +130,10 @@ class TestStackBF16Op(OpTest):
         self.attrs = {'axis': self.axis}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_eager=True)
 
     def test_check_grad(self):
-        self.check_grad(self.get_x_names(), 'Y')
+        self.check_grad(self.get_x_names(), 'Y', check_eager=True)
 
 class TestStackAPIWithLoDTensorArray(unittest.TestCase):
......
@@ -1610,6 +1610,15 @@
     view: (x -> out)
   backward : squeeze_grad
 
+- api : stack
+  args : (Tensor[] x, int axis)
+  output : Tensor
+  infer_meta :
+    func : StackInferMeta
+  kernel :
+    func : stack
+  backward : stack_grad
+
 - api : strided_slice
   args : (Tensor x, int[] axes, IntArray starts, IntArray ends, IntArray strides)
   output : Tensor
......
@@ -1180,6 +1180,13 @@
   kernel :
     func : squeeze_grad
 
+- backward_api : stack_grad
+  forward : stack (Tensor[] x, int axis) -> Tensor(out)
+  args : (Tensor[] x, Tensor out_grad, int axis)
+  output : Tensor[](x_grad)
+  invoke : stack_grad_impl(x, out_grad, axis)
+  no_need_buffer : x
+
 - backward_api : strided_slice_grad
   forward : strided_slice (Tensor x, int[] axes, IntArray starts, IntArray ends, IntArray strides) -> Tensor(out)
   args : (Tensor x, Tensor out_grad, int[] axes, IntArray starts, IntArray ends, IntArray strides)
......