diff --git a/paddle/fluid/operators/xpu/slice_xpu_op.cc b/paddle/fluid/operators/xpu/slice_xpu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..013692601b7a2391304ddad51346a34677b58ecc
--- /dev/null
+++ b/paddle/fluid/operators/xpu/slice_xpu_op.cc
@@ -0,0 +1,163 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/slice_op.h"
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+namespace {
+
+// Builds the per-dimension arguments expected by the XPU slice kernels.
+//
+// The kernels need `shape`, `starts` and `ends` to hold one entry for every
+// dimension of the input, while the op attributes only describe the sliced
+// `axes`. Dimensions that are not listed in `axes` keep their full range
+// [0, dim). Entries are scattered through `axes`, so the result does not
+// depend on the order in which the axes are listed.
+//
+// A negative start/end counts back from the end of its dimension, and an
+// end larger than the dimension size is clamped to that size.
+void ExpandSliceBounds(const framework::DDim& in_dims,
+                       const std::vector<int>& axes,
+                       const std::vector<int>& starts,
+                       const std::vector<int>& ends, std::vector<int>* shape,
+                       std::vector<int>* full_starts,
+                       std::vector<int>* full_ends) {
+  const int rank = in_dims.size();
+  shape->resize(rank);
+  for (int i = 0; i < rank; ++i) {
+    (*shape)[i] = static_cast<int>(in_dims[i]);
+  }
+  full_starts->assign(rank, 0);
+  *full_ends = *shape;
+
+  for (size_t i = 0; i < axes.size(); ++i) {
+    const int axis = axes[i];
+    PADDLE_ENFORCE(axis >= 0 && axis < rank,
+                   "axis should be in range [0, rank of input)");
+    const int dim_value = (*shape)[axis];
+    int start = starts[i] < 0 ? (starts[i] + dim_value) : starts[i];
+    int end = ends[i] < 0 ? (ends[i] + dim_value) : ends[i];
+    start = std::max(start, 0);
+    end = std::min(std::max(end, 0), dim_value);
+    PADDLE_ENFORCE_GT(end, start, "end should greater than start");
+    (*full_starts)[axis] = start;
+    (*full_ends)[axis] = end;
+  }
+}
+
+// Deleter that hands memory obtained from xpu_malloc back to xpu_free.
+struct XPUFreeDeleter {
+  void operator()(int* ptr) const { xpu_free(ptr); }
+};
+using XPUIntBuffer = std::unique_ptr<int, XPUFreeDeleter>;
+
+// Allocates a device buffer and copies `host` into it. Ownership is tied to
+// the returned handle, so the buffer is also released when a later check
+// fails and leaves the kernel early.
+XPUIntBuffer CopyToXPU(const platform::Place& place,
+                       const std::vector<int>& host) {
+  const size_t bytes = host.size() * sizeof(int);
+  int* device = nullptr;
+  PADDLE_ENFORCE(
+      xpu_malloc(reinterpret_cast<void**>(&device), bytes) == XPU_SUCCESS,
+      "XPU malloc failed!");
+  XPUIntBuffer buffer(device);
+  memory::Copy(boost::get<platform::XPUPlace>(place), device,
+               platform::CPUPlace(), host.data(), bytes);
+  return buffer;
+}
+
+}  // namespace
+
+// Forward slice kernel for XPU: copies the region of `Input` selected by the
+// `axes`/`starts`/`ends` attributes into `Out`.
+template <typename DeviceContext, typename T>
+class SliceXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("Input");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    std::vector<int> shape, starts, ends;
+    ExpandSliceBounds(in->dims(), ctx.Attr<std::vector<int>>("axes"),
+                      ctx.Attr<std::vector<int>>("starts"),
+                      ctx.Attr<std::vector<int>>("ends"), &shape, &starts,
+                      &ends);
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto* in_data = in->data<T>();
+    auto* out_data = out->mutable_data<T>(ctx.GetPlace());
+
+    // The index arrays are passed to slice_forward straight from host vectors.
+    int r = xpu::slice_forward(dev_ctx.x_context(), shape.data(),
+                               starts.data(), ends.data(), shape.size(),
+                               in_data, out_data);
+    PADDLE_ENFORCE(r == xpu::Error_t::SUCCESS, "XPU kernel error!");
+  }
+};
+
+// Backward slice kernel for XPU: computes the gradient of `Input` from the
+// gradient of `Out`.
+template <typename DeviceContext, typename T>
+class SliceGradXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_in = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    d_in->mutable_data<T>(ctx.GetPlace());
+
+    std::vector<int> shape, starts, ends;
+    ExpandSliceBounds(d_in->dims(), ctx.Attr<std::vector<int>>("axes"),
+                      ctx.Attr<std::vector<int>>("starts"),
+                      ctx.Attr<std::vector<int>>("ends"), &shape, &starts,
+                      &ends);
+
+    // slice_backward is given device copies of the index arrays.
+    XPUIntBuffer starts_device = CopyToXPU(ctx.GetPlace(), starts);
+    XPUIntBuffer ends_device = CopyToXPU(ctx.GetPlace(), ends);
+    XPUIntBuffer shape_device = CopyToXPU(ctx.GetPlace(), shape);
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    int r = xpu::slice_backward(dev_ctx.x_context(), shape_device.get(),
+                                starts_device.get(), ends_device.get(),
+                                shape.size(), d_out->data<T>(),
+                                d_in->data<T>(), d_in->numel(),
+                                d_out->numel());
+    PADDLE_ENFORCE(r == xpu::Error_t::SUCCESS, "XPU kernel error!");
+    // Wait for the kernel before the device buffers are freed at scope exit.
+    dev_ctx.Wait();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_XPU_KERNEL(
+    slice, ops::SliceXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    slice_grad,
+    ops::SliceGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+#endif
diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py
index fdcd2d350a6fac115086b5677a972f8b1145ff95..997f71cc61fff6adf7c7969f688d61432f4d7965 100644
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
@@ -677,6 +677,121 @@ class TestImperativeCUDAPinnedInput(unittest.TestCase):
                 zero_copy=False)
             sliced = var[:, 10:, :var.shape[1]]
             self.assertEqual(sliced.shape, [2, 70, 80])
+# for xpu
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXpuSliceOp(TestSliceOp):
+    def test_check_output(self):
+        place = core.XPUPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad_normal(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['Input'], 'Out', max_relative_error=0.006)
+
+
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXpuCase1(TestXpuSliceOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 2]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-3:3, 0:100, 2:-1, :]
+
+
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXpuCase2(TestXpuSliceOp):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [-3, 0, 2]
+        self.ends = [3, 100, -1]
+        self.axes = [0, 1, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-3:3, 0:100, :, 2:-1]
+
+
+# 1.2 with attr(decrease)
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXpuSliceOp_decs_dim(TestSliceOp_decs_dim):
+    def test_check_output(self):
+        place = core.XPUPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad_normal(self):
+        place = core.XPUPlace(0)
+        self.check_grad_with_place(
+            place, ['Input'], 'Out', max_relative_error=0.006)
+
+
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXpuSliceOp_decs_dim_2(TestXpuSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [1, 0, 2]
+        self.ends = [2, 1, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0, 1]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[1, 0, 2:4, :]
+
+
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXpuSliceOp_decs_dim_3(TestXpuSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [-1, 0, 2]
+        self.ends = [1000000, 1, 4]
+        self.axes = [0, 1, 2]
+        self.decrease_axis = [0, 1]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[-1, 0, 2:4, :]
+
+
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXpuSliceOp_decs_dim_4(TestXpuSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 7]).astype("float64")
+        self.starts = [0, 1, 2, 3]
+        self.ends = [1, 2, 3, 4]
+        self.axes = [0, 1, 2, 3]
+        self.decrease_axis = [0, 1, 2, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[0, 1, 2, 3:4]
+
+
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXpuSliceOp_decs_dim_5(TestXpuSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [-1]
+        self.ends = [1000000]
+        self.axes = [3]
+        self.decrease_axis = [3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[:, :, :, -1]
+
+
+@unittest.skipIf(not core.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestXpuSliceOp_decs_dim_6(TestXpuSliceOp_decs_dim):
+    def config(self):
+        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.starts = [0, 1, 2, 3]
+        self.ends = [1, 2, 3, 4]
+        self.axes = [0, 1, 2, 3]
+        self.decrease_axis = [0, 1, 2, 3]
+        self.infer_flags = [1, 1, 1]
+        self.out = self.input[0, 1, 2, 3:4]
 
 
 if __name__ == '__main__':