From d3d1a6b6e0ac11f9b2facfa8fdd45a07b2097459 Mon Sep 17 00:00:00 2001
From: taixiurong
Date: Fri, 20 Nov 2020 13:10:09 +0800
Subject: [PATCH] add kunlun kernel: slice, slice_grad, top_k, cast. *test=kunlun (#28542)

* 1.add xpu slice op 2. add xpu top_k op 3.modify xpu cast to new api

* 1.add xpu slice op 2. add xpu top_k op 3.modify xpu cast to new api
---
 cmake/external/xpu.cmake                     |  2 +-
 paddle/fluid/operators/cast_op_xpu.cc        | 16 ++--
 .../{slice_xpu_op.cc => slice_op_xpu.cc}     | 34 ++++----
 paddle/fluid/operators/top_k_op_xpu.cc       | 82 +++++++++++++++++++
 .../tests/unittests/xpu/test_slice_op_xpu.py | 47 +++++++----
 .../tests/unittests/xpu/test_top_k_op_xpu.py | 77 +++++++++++++++++
 6 files changed, 219 insertions(+), 39 deletions(-)
 rename paddle/fluid/operators/{slice_xpu_op.cc => slice_op_xpu.cc} (88%)
 create mode 100644 paddle/fluid/operators/top_k_op_xpu.cc
 create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py

diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake
index eb00b82220..8d3fee915c 100644
--- a/cmake/external/xpu.cmake
+++ b/cmake/external/xpu.cmake
@@ -4,7 +4,7 @@ endif()
 
 INCLUDE(ExternalProject)
 SET(XPU_PROJECT "extern_xpu")
-SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_09_22_api_2020_11_05.tar.gz" CACHE STRING "" FORCE)
+SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_11_10.tar.gz" CACHE STRING "" FORCE)
 SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
 SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
 SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc
index 56160bd297..a2791cb262 100644
--- a/paddle/fluid/operators/cast_op_xpu.cc
+++ b/paddle/fluid/operators/cast_op_xpu.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-#include "paddle/fluid/operators/cast_op.h"
 #include
+
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/cast_op.h"
 #include "paddle/fluid/platform/float16.h"
+#include "xpu/refactor/math.h"
 
 namespace paddle {
 namespace operators {
@@ -37,14 +39,16 @@ class CastXPUKernel : public framework::OpKernel {
     int r = -1;
     if (out_type == framework::proto::VarType::FP32) {
       auto* out_data = out->mutable_data(context.GetPlace());
-      r = xpu::cast(dev_ctx.x_context(), in_data, out_data, numel);
+      r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data,
+                       numel);
     } else if (out_type == framework::proto::VarType::INT32) {
       auto* out_data = out->mutable_data(context.GetPlace());
-      r = xpu::cast(dev_ctx.x_context(), in_data, out_data, numel);
+      r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data,
+                       numel);
     } else if (out_type == framework::proto::VarType::INT64) {
       auto* out_data = out->mutable_data(context.GetPlace());
-      r = xpu::cast(dev_ctx.x_context(), in_data, out_data,
-                    numel);
+      r = xpu::cast_v2(dev_ctx.x_context(), in_data, out_data,
+                       numel);
     } else {
       PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d",
                                                  in_type, out_type));
@@ -63,7 +67,7 @@ class CastXPUKernel : public framework::OpKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OP_XPU_KERNEL(
-    cast, ops::CastXPUKernel,
+    cast, ops::CastXPUKernel,
     ops::CastXPUKernel,
     ops::CastXPUKernel);
 #endif
diff --git a/paddle/fluid/operators/slice_xpu_op.cc b/paddle/fluid/operators/slice_op_xpu.cc
similarity index 88%
rename from paddle/fluid/operators/slice_xpu_op.cc
rename to paddle/fluid/operators/slice_op_xpu.cc
index 3d6f52c7dc..5f98efe8e9 100644
--- a/paddle/fluid/operators/slice_xpu_op.cc
+++ b/paddle/fluid/operators/slice_op_xpu.cc
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifdef PADDLE_WITH_XPU
-
+#include "paddle/fluid/operators/slice_op.h"
 #include
 #include
 #include
 #include
-#include "paddle/fluid/operators/slice_op.h"
+#include "xpu/refactor/math.h"
 
 namespace paddle {
 namespace operators {
@@ -85,10 +85,8 @@ class SliceXPUKernel : public framework::OpKernel {
     auto& dev_ctx = ctx.template device_context();
     auto* in_data = in->data();
     auto* out_data = out->mutable_data(ctx.GetPlace());
-
-    int r = xpu::slice_forward(dev_ctx.x_context(), shape.data(),
-                               starts_extension.data(), ends_extension.data(),
-                               shape_size, in_data, out_data);
+    int r = xpu::slice(dev_ctx.x_context(), in_data, out_data, shape,
+                       starts_extension, ends_extension);
     PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
                       platform::errors::External("XPU slice kernel error!"));
   }
@@ -149,12 +147,14 @@ class SliceGradXPUKernel : public framework::OpKernel {
         shape_size > axes.size() ? starts_extension.data() : starts.data();
     int* ends_host = shape_size > axes.size() ?
                          ends_extension.data() : ends.data();
-    PADDLE_ENFORCE_EQ(
-        xpu_malloc((void**)(&starts_device), shape_size * sizeof(int)),
-        XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
-    PADDLE_ENFORCE_EQ(
-        xpu_malloc((void**)(&ends_device), shape_size * sizeof(int)),
-        XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
+    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&starts_device),
+                                 shape_size * sizeof(int)),
+                      XPU_SUCCESS,
+                      platform::errors::External("XPU has no enough memory"));
+    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&ends_device),
+                                 shape_size * sizeof(int)),
+                      XPU_SUCCESS,
+                      platform::errors::External("XPU has no enough memory"));
     memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
                  starts_device, platform::CPUPlace(), starts_host,
                  shape_size * sizeof(int));
@@ -168,9 +168,10 @@ class SliceGradXPUKernel : public framework::OpKernel {
       shape[i] = in_dims[i];
     }
     int* shape_device = nullptr;
-    PADDLE_ENFORCE_EQ(
-        xpu_malloc((void**)(&shape_device), shape_size * sizeof(int)),
-        XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
+    PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&shape_device),
+                                 shape_size * sizeof(int)),
+                      XPU_SUCCESS,
+                      platform::errors::External("XPU has no enough memory"));
     memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
                  shape_device, platform::CPUPlace(), shape.data(),
                  shape_size * sizeof(int));
@@ -196,7 +197,8 @@ class SliceGradXPUKernel : public framework::OpKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OP_XPU_KERNEL(
-    slice, ops::SliceXPUKernel);
+    slice, ops::SliceXPUKernel,
+    ops::SliceXPUKernel);
 REGISTER_OP_XPU_KERNEL(
     slice_grad,
     ops::SliceGradXPUKernel);
diff --git a/paddle/fluid/operators/top_k_op_xpu.cc b/paddle/fluid/operators/top_k_op_xpu.cc
new file mode 100644
index 0000000000..5e89e38c7d
--- /dev/null
+++ b/paddle/fluid/operators/top_k_op_xpu.cc
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include
+
+#include "paddle/fluid/operators/top_k_op.h"
+#include "xpu/refactor/math.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template
+class TopkXPUKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // Get the top k elements of each row of input tensor
+    auto* input = ctx.Input("X");
+    auto* output = ctx.Output("Out");
+    auto* indices = ctx.Output("Indices");
+
+    size_t k = static_cast(ctx.Attr("k"));
+    auto* k_t = ctx.Input("K");
+    if (k_t) {
+      k = k_t->data()[0];
+      framework::DDim output_dims = output->dims();
+      output_dims[output_dims.size() - 1] = k;
+      output->Resize(output_dims);
+      indices->Resize(output_dims);
+    }
+
+    T* output_data = output->mutable_data(ctx.GetPlace());
+    int64_t* indices_data = indices->mutable_data(ctx.GetPlace());
+    Tensor indices_32_data_tensor;
+    int32_t* indices_int_data = indices_32_data_tensor.mutable_data(
+        ctx.GetPlace(), indices->numel());
+    // reshape input to a flattern matrix(like flat_inner_dims)
+    framework::DDim inputdims = input->dims();
+    const size_t row = framework::product(
+        framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
+    const size_t col = inputdims[inputdims.size() - 1];
+    auto& dev_ctx = ctx.template device_context();
+
+    int ret = xpu::sorted_topk(dev_ctx.x_context(), input->data(),
+                               output_data, indices_int_data, row, col, k);
+    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                      platform::errors::External(
+                          "XPU API return wrong value[%d] in call kernel name "
+                          "[%s], please check "
+                          "where Baidu Kunlun Card is properly installed.",
+                          ret, "sorted_topk"));
+    ret = xpu::cast_v2(dev_ctx.x_context(),
+                       (const int32_t*)indices_int_data,
+                       indices_data, indices->numel());
+    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
+                      platform::errors::External(
+                          "XPU API return wrong value[%d] in call kernel name "
+                          "[%s], please check "
+                          "where Baidu Kunlun Card is properly installed.",
+                          ret, "cast_v2"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(top_k, ops::TopkXPUKernel);
+#endif
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
index 44c8821be0..8f3578b526 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_slice_op_xpu.py
@@ -12,21 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from __future__ import print_function
-
-import unittest
+import paddle
 import numpy as np
 import sys
+import unittest
 sys.path.append("..")
-import paddle
-import paddle.fluid.core as core
 from op_test import OpTest
-import paddle.fluid as fluid
-import paddle.fluid.layers as layers
+
+paddle.enable_static()
 
 
 # Situation 1: starts(list, no tensor), ends(list, no tensor)
 # 1.1 without attr(decrease)
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp(OpTest):
     def setUp(self):
         self.op_type = "slice"
@@ -42,7 +41,7 @@ class TestSliceOp(OpTest):
         }
 
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [1, 0, 2]
         self.ends = [3, 3, 4]
         self.axes = [0, 1, 2]
@@ -58,9 +57,11 @@ class TestSliceOp(OpTest):
             self.check_grad_with_place(place, ['Input'], 'Out')
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestCase1(TestSliceOp):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-3, 0, 2]
         self.ends = [3, 100, -1]
         self.axes = [0, 1, 2]
@@ -68,9 +69,11 @@ class TestCase1(TestSliceOp):
         self.out = self.input[-3:3, 0:100, 2:-1, :]
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestCase2(TestSliceOp):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-3, 0, 2]
         self.ends = [3, 100, -1]
         self.axes = [0, 1, 3]
@@ -79,6 +82,8 @@ class TestCase2(TestSliceOp):
 
 
 # 1.2 with attr(decrease)
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp_decs_dim(OpTest):
     def setUp(self):
         self.op_type = "slice"
@@ -95,7 +100,7 @@ class TestSliceOp_decs_dim(OpTest):
         }
 
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [1, 0, 2]
         self.ends = [2, 3, 4]
         self.axes = [0, 1, 2]
@@ -112,9 +117,11 @@ class TestSliceOp_decs_dim(OpTest):
             self.check_grad_with_place(place, ['Input'], 'Out')
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [1, 0, 2]
         self.ends = [2, 1, 4]
         self.axes = [0, 1, 2]
@@ -123,9 +130,11 @@ class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
         self.out = self.input[1, 0, 2:4, :]
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-1, 0, 2]
         self.ends = [1000000, 1, 4]
         self.axes = [0, 1, 2]
@@ -134,9 +143,11 @@ class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
         self.out = self.input[-1, 0, 2:4, :]
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 7]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 7]).astype("float32")
         self.starts = [0, 1, 2, 3]
         self.ends = [1, 2, 3, 4]
         self.axes = [0, 1, 2, 3]
@@ -145,9 +156,11 @@ class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
         self.out = self.input[0, 1, 2, 3:4]
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [-1]
         self.ends = [1000000]
         self.axes = [3]
@@ -156,9 +169,11 @@ class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
         self.out = self.input[:, :, :, -1]
 
 
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
 class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
     def config(self):
-        self.input = np.random.random([3, 4, 5, 6]).astype("float64")
+        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
         self.starts = [0, 1, 2, 3]
         self.ends = [1, 2, 3, 4]
         self.axes = [0, 1, 2, 3]
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
new file mode 100644
index 0000000000..c4418bd55c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_top_k_op_xpu.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from paddle.fluid.op import Operator
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+import paddle
+from op_test import OpTest
+
+paddle.enable_static()
+
+
+@unittest.skipIf(not paddle.is_compiled_with_xpu(),
+                 "core is not compiled with XPU")
+class TestTopkOp(OpTest):
+    def setUp(self):
+        self.variable_k = False
+        self.use_xpu = True
+        self.set_args()
+        self.op_type = "top_k"
+        self.dtype = np.float32
+        self.init_dtype()
+
+        k = self.top_k
+        input = np.random.random((self.row, k)).astype(self.dtype)
+        output = np.ndarray((self.row, k))
+        indices = np.ndarray((self.row, k)).astype("int64")
+        self.inputs = {'X': input}
+
+        if self.variable_k:
+            self.inputs['K'] = np.array([k]).astype("int32")
+        else:
+            self.attrs = {'k': k}
+
+        for rowid in range(self.row):
+            row = input[rowid]
+            output[rowid] = np.sort(row)[::-1][:k]
+            indices[rowid] = row.argsort()[::-1][:k]
+
+        self.outputs = {'Out': output, 'Indices': indices}
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def set_args(self):
+        self.row = 100
+        self.top_k = 1
+
+    def test_check_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(place)
+
+    def test_check_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_grad_with_place(place, ['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
--
GitLab
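For reference, the expected outputs that the new XPU unit tests compare against are built with plain NumPy, and the short sketch below mirrors that logic for the slice and top_k kernels. It is an illustration only, not part of the patch; the helper names ref_slice and ref_top_k are hypothetical.

# Minimal NumPy sketch of the reference semantics used by the tests above.
import numpy as np


def ref_slice(x, axes, starts, ends):
    # Slice only the given axes; all other dimensions are taken in full,
    # which is how the slice tests index self.input to compute self.out.
    index = [slice(None)] * x.ndim
    for axis, start, end in zip(axes, starts, ends):
        index[axis] = slice(start, end)
    return x[tuple(index)]


def ref_top_k(x, k):
    # Row-wise descending top-k values and their int64 indices, matching the
    # per-row np.sort / argsort loop in TestTopkOp.setUp.
    values = np.sort(x, axis=-1)[:, ::-1][:, :k]
    indices = np.argsort(x, axis=-1)[:, ::-1][:, :k].astype("int64")
    return values, indices


x = np.random.random([3, 4, 5, 6]).astype("float32")
out = ref_slice(x, axes=[0, 1, 2], starts=[1, 0, 2], ends=[3, 3, 4])
assert out.shape == (2, 3, 2, 6)

m = np.random.random((100, 5)).astype("float32")
values, indices = ref_top_k(m, k=1)
assert values.shape == (100, 1) and indices.dtype == np.int64

On the kernel side, xpu::sorted_topk produces int32 indices, so TopkXPUKernel stages them in a temporary tensor and widens them to int64 with xpu::cast_v2 before writing the operator's Indices output.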