diff --git a/paddle/fluid/operators/slice_xpu_op.cc b/paddle/fluid/operators/slice_xpu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d6f52c7dc31f8352dc972041b5db645926d4786
--- /dev/null
+++ b/paddle/fluid/operators/slice_xpu_op.cc
@@ -0,0 +1,203 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/fluid/operators/slice_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class SliceXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto in = ctx.Input<Tensor>("Input");
+    auto out = ctx.Output<Tensor>("Out");
+    auto axes = ctx.Attr<std::vector<int>>("axes");
+    auto starts = ctx.Attr<std::vector<int>>("starts");
+    auto ends = ctx.Attr<std::vector<int>>("ends");
+    auto in_dims = in->dims();
+
+    // prepare starts and ends for the XPU kernel
+    int dim_value = 0, start = 0, end = 0;
+    // If a negative value is passed for any of the start or end indices,
+    // it represents the number of elements before the end of that dimension.
+    // If the value passed to start or end is larger than n
+    // (the number of elements in this dimension), it represents n.
+    for (size_t i = 0; i < axes.size(); ++i) {
+      dim_value = in_dims[axes[i]];
+      start = starts[i];
+      end = ends[i];
+      start = start < 0 ? (start + dim_value) : start;
+      end = end < 0 ? (end + dim_value) : end;
+      start = std::max(start, 0);
+      end = std::max(end, 0);
+      end = std::min(end, dim_value);
+      PADDLE_ENFORCE_GT(end, start,
+                        platform::errors::InvalidArgument(
+                            "end should be greater than start"));
+      starts[i] = start;
+      ends[i] = end;
+    }
+    size_t shape_size = in_dims.size();
+    // The slice XPU kernel requires the lengths of `starts` and `ends` to
+    // equal the rank of the input tensor, so when shape_size > axes.size()
+    // the `starts_extension` and `ends_extension` vectors are necessary.
+    std::vector<int> starts_extension(shape_size, 0);
+    std::vector<int> ends_extension(shape_size, 0);
+    if (shape_size > axes.size()) {
+      for (size_t i = 0; i < shape_size; ++i) {
+        ends_extension[i] = in_dims[i];
+      }
+      for (size_t i = 0; i < axes.size(); ++i) {
+        starts_extension[axes[i]] = starts[i];
+        ends_extension[axes[i]] = ends[i];
+      }
+    } else {
+      starts_extension = std::move(starts);
+      ends_extension = std::move(ends);
+    }
+
+    // prepare shape for the XPU kernel
+    std::vector<int> shape(shape_size, 0);
+    for (size_t i = 0; i < shape_size; ++i) {
+      shape[i] = in_dims[i];
+    }
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto* in_data = in->data<T>();
+    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int r = xpu::slice_forward(dev_ctx.x_context(), shape.data(),
+                               starts_extension.data(), ends_extension.data(),
+                               shape_size, in_data, out_data);
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                      platform::errors::External("XPU slice kernel error!"));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SliceGradXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_in = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    d_in->mutable_data<T>(ctx.GetPlace());
+
+    auto in_dims = d_in->dims();
+    auto axes = ctx.Attr<std::vector<int>>("axes");
+    auto starts = ctx.Attr<std::vector<int>>("starts");
+    auto ends = ctx.Attr<std::vector<int>>("ends");
+
+    // prepare starts and ends for the XPU kernel
+    int dim_value = 0, start = 0, end = 0;
+    // If a negative value is passed for any of the start or end indices,
+    // it represents the number of elements before the end of that dimension.
+    // If the value passed to start or end is larger than n
+    // (the number of elements in this dimension), it represents n.
+    for (size_t i = 0; i < axes.size(); ++i) {
+      dim_value = in_dims[axes[i]];
+      start = starts[i];
+      end = ends[i];
+      start = start < 0 ? (start + dim_value) : start;
+      end = end < 0 ? (end + dim_value) : end;
+      start = std::max(start, 0);
+      end = std::max(end, 0);
+      end = std::min(end, dim_value);
+      PADDLE_ENFORCE_GT(end, start,
+                        platform::errors::InvalidArgument(
+                            "end should be greater than start"));
+      starts[i] = start;
+      ends[i] = end;
+    }
+    size_t shape_size = in_dims.size();
+    // The slice XPU kernel requires the lengths of `starts` and `ends` to
+    // equal the rank of the input tensor, so when shape_size > axes.size()
+    // the `starts_extension` and `ends_extension` vectors are necessary.
+    std::vector<int> starts_extension(shape_size, 0);
+    std::vector<int> ends_extension(shape_size, 0);
+    if (shape_size > axes.size()) {
+      for (size_t i = 0; i < shape_size; ++i) {
+        ends_extension[i] = in_dims[i];
+      }
+      for (size_t i = 0; i < axes.size(); ++i) {
+        starts_extension[axes[i]] = starts[i];
+        ends_extension[axes[i]] = ends[i];
+      }
+    }
+    int* starts_device = nullptr;
+    int* ends_device = nullptr;
+    int* starts_host =
+        shape_size > axes.size() ? starts_extension.data() : starts.data();
+    int* ends_host =
+        shape_size > axes.size() ? ends_extension.data() : ends.data();
+    PADDLE_ENFORCE_EQ(
+        xpu_malloc((void**)(&starts_device), shape_size * sizeof(int)),
+        XPU_SUCCESS,
+        platform::errors::External("XPU has not enough memory"));
+    PADDLE_ENFORCE_EQ(
+        xpu_malloc((void**)(&ends_device), shape_size * sizeof(int)),
+        XPU_SUCCESS,
+        platform::errors::External("XPU has not enough memory"));
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
+                 starts_device, platform::CPUPlace(), starts_host,
+                 shape_size * sizeof(int));
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
+                 ends_device, platform::CPUPlace(), ends_host,
+                 shape_size * sizeof(int));
+
+    // prepare shape on XPU
+    std::vector<int> shape(shape_size, 0);
+    for (size_t i = 0; i < shape_size; ++i) {
+      shape[i] = in_dims[i];
+    }
+    int* shape_device = nullptr;
+    PADDLE_ENFORCE_EQ(
+        xpu_malloc((void**)(&shape_device), shape_size * sizeof(int)),
+        XPU_SUCCESS,
+        platform::errors::External("XPU has not enough memory"));
+    memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
+                 shape_device, platform::CPUPlace(), shape.data(),
+                 shape_size * sizeof(int));
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    int r =
+        xpu::slice_backward(dev_ctx.x_context(), shape_device, starts_device,
+                            ends_device, shape_size, d_out->data<T>(),
+                            d_in->data<T>(), d_in->numel(), d_out->numel());
+    PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
+                      platform::errors::External("XPU slice kernel error!"));
+    dev_ctx.Wait();
+    // free device data
+    xpu_free(shape_device);
+    xpu_free(starts_device);
+    xpu_free(ends_device);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(
+    slice, ops::SliceXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    slice_grad,
+    ops::SliceGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+#endif
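Note: both kernels share the same bound normalization and rank-extension logic, so it is worth seeing on a concrete input. Below is a minimal Python sketch of that logic for illustration only; normalize_slice_bounds is a hypothetical helper and is not part of this patch.

# Minimal Python sketch of the start/end normalization and rank extension
# performed by SliceXPUKernel and SliceGradXPUKernel above.
# `normalize_slice_bounds` is a hypothetical helper; not part of this patch.
def normalize_slice_bounds(in_shape, axes, starts, ends):
    shape_size = len(in_shape)
    full_starts = [0] * shape_size
    full_ends = list(in_shape)  # untouched axes keep the whole range [0, dim)
    for axis, start, end in zip(axes, starts, ends):
        dim_value = in_shape[axis]
        # negative indices count from the end of the dimension
        start = start + dim_value if start < 0 else start
        end = end + dim_value if end < 0 else end
        # clamp into [0, dim_value]
        start = max(start, 0)
        end = min(max(end, 0), dim_value)
        assert end > start, "end should be greater than start"
        full_starts[axis] = start
        full_ends[axis] = end
    return full_starts, full_ends

# The attributes of TestCase1 below, applied to a [3, 4, 5, 6] input:
print(normalize_slice_bounds([3, 4, 5, 6], [0, 1, 2], [-3, 0, 2], [3, 100, -1]))
# ([0, 0, 2, 0], [3, 4, 4, 6])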
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..44c8821be06bcf1aea3862093fa4cfc2b85776fd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
@@ -0,0 +1,171 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+import paddle
+import paddle.fluid.core as core
+from op_test import OpTest
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+
+
+# Situation 1: starts(list, no tensor), ends(list, no tensor)
+# 1.1 without attr(decrease)
+class TestSliceOp(OpTest):
+    def setUp(self):
+        self.op_type = "slice"
+        self.config()
+        self.inputs = {'Input': self.input}
+        self.outputs = {'Out': self.out}
+        self.attrs = {
+            'axes': self.axes,
+            'starts': self.starts,
+            'ends': self.ends,
+            'infer_flags': self.infer_flags,
+            "use_xpu": True
+        }
+
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [1, 0, 2]
+        self.ends = [3, 3, 4]
+        self.axes = [0, 1, 2]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1:3, 0:3, 2:4, :]
+
+    def test_check_output(self):
+        place = paddle.XPUPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad_normal(self):
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(place, ['Input'], 'Out')
+
+
+class TestCase1(TestSliceOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 2]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-3:3, 0:100, 2:-1, :]
+
+
+class TestCase2(TestSliceOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-3:3, 0:100, :, 2:-1]
+
+
+# 1.2 with attr(decrease)
+class TestSliceOp_decs_dim(OpTest):
+    def setUp(self):
+        self.op_type = "slice"
+        self.config()
+        self.inputs = {'Input': self.input}
+        self.outputs = {'Out': self.out}
+        self.attrs = {
+            'axes': self.axes,
+            'starts': self.starts,
+            'ends': self.ends,
+            'infer_flags': self.infer_flags,
+            'decrease_axis': self.decrease_axis,
+            "use_xpu": True
+        }
+
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [1, 0, 2]
+        self.ends = [2, 3, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1, 0:3, 2:4, :]
+
+    def test_check_output(self):
+        place = paddle.XPUPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad_normal(self):
+        place = paddle.XPUPlace(0)
+        self.check_grad_with_place(place, ['Input'], 'Out')
+
+
+class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [1, 0, 2]
+        self.ends = [2, 1, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0, 1]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1, 0, 2:4, :]
+
+
+class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [-1, 0, 2]
+        self.ends = [1000000, 1, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0, 1]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-1, 0, 2:4, :]
+
+
+class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 7]).astype("float64")
+        self.starts = [0, 1, 2, 3]
+        self.ends = [1, 2, 3, 4]
+        self.axes = [0, 1, 2, 3]
+        self.decrease_axis = [0, 1, 2, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[0, 1, 2, 3:4]
+
+
+class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [-1]
+        self.ends = [1000000]
+        self.axes = [3]
+        self.decrease_axis = [3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[:, :, :, -1]
+
+
+class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [0, 1, 2, 3]
+        self.ends = [1, 2, 3, 4]
+        self.axes = [0, 1, 2, 3]
+        self.decrease_axis = [0, 1, 2, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[0, 1, 2, 3:4]
+
+
+if __name__ == '__main__':
+    unittest.main()
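Note: beyond the OpTest cases above, the new kernels can also be exercised end to end through the public slice API. The following is a minimal sketch, assuming a Paddle 2.x build with XPU support (paddle.is_compiled_with_xpu() returns True) and at least one XPU device; it is not part of this patch.

# Minimal end-to-end sketch (assumes an XPU-enabled Paddle build); not part of this patch.
import numpy as np
import paddle

paddle.set_device("xpu:0")  # route slice / slice_grad to the XPU kernels registered above

x_np = np.random.random([3, 4, 5, 6]).astype("float32")  # XPU kernels are registered for float
x = paddle.to_tensor(x_np, stop_gradient=False)

# Same attributes as TestSliceOp: axes=[0, 1, 2], starts=[1, 0, 2], ends=[3, 3, 4]
y = paddle.slice(x, axes=[0, 1, 2], starts=[1, 0, 2], ends=[3, 3, 4])
np.testing.assert_allclose(y.numpy(), x_np[1:3, 0:3, 2:4, :], rtol=1e-5)

# The backward pass goes through SliceGradXPUKernel; the gradient has the input's shape.
y.sum().backward()
assert x.gradient().shape == x_np.shape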