Unverified · Commit d3d1a6b6 · authored by taixiurong, committed by GitHub

add kunlun kernel: slice, slice_grad, top_k, cast. *test=kunlun (#28542)

* 1. add xpu slice op; 2. add xpu top_k op; 3. migrate xpu cast to the new api
Parent 9362d85e
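For context, the three operators touched by this commit are exercised from Python roughly as below. This is a minimal sketch, assuming a Paddle build with XPU support (paddle.is_compiled_with_xpu() returns True) and one attached Kunlun card; the shapes and variable names are illustrative only, not part of the change.

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
# Build a small static program that hits the slice, top_k, and cast XPU kernels.
x = fluid.data(name="x", shape=[3, 4, 5, 6], dtype="float32")
sliced = fluid.layers.slice(x, axes=[0, 1, 2], starts=[1, 0, 2], ends=[3, 3, 4])
values, indices = fluid.layers.topk(x, k=2)        # top_k op; indices are int64
as_int = fluid.layers.cast(sliced, dtype="int32")  # cast op, now via xpu::cast_v2
exe = fluid.Executor(paddle.XPUPlace(0))
outs = exe.run(feed={"x": np.random.random([3, 4, 5, 6]).astype("float32")},
               fetch_list=[sliced, values, indices, as_int])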
......@@ -4,7 +4,7 @@ endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_09_22_api_2020_11_05.tar.gz" CACHE STRING "" FORCE)
SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_11_10.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
......
......@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/cast_op.h"
#include <memory>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/cast_op.h"
#include "paddle/fluid/platform/float16.h"
#include "xpu/refactor/math.h"
namespace paddle {
namespace operators {
......@@ -37,13 +39,15 @@ class CastXPUKernel : public framework::OpKernel<InT> {
int r = -1;
if (out_type == framework::proto::VarType::FP32) {
auto* out_data = out->mutable_data<float>(context.GetPlace());
r = xpu::cast<InT, float>(dev_ctx.x_context(), in_data, out_data, numel);
r = xpu::cast_v2<InT, float>(dev_ctx.x_context(), in_data, out_data,
numel);
} else if (out_type == framework::proto::VarType::INT32) {
auto* out_data = out->mutable_data<int>(context.GetPlace());
r = xpu::cast<InT, int>(dev_ctx.x_context(), in_data, out_data, numel);
r = xpu::cast_v2<InT, int32_t>(dev_ctx.x_context(), in_data, out_data,
numel);
} else if (out_type == framework::proto::VarType::INT64) {
auto* out_data = out->mutable_data<int64_t>(context.GetPlace());
r = xpu::cast<InT, int64_t>(dev_ctx.x_context(), in_data, out_data,
r = xpu::cast_v2<InT, int64_t>(dev_ctx.x_context(), in_data, out_data,
numel);
} else {
PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d",
......@@ -63,7 +67,7 @@ class CastXPUKernel : public framework::OpKernel<InT> {
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
cast, ops::CastXPUKernel<paddle::platform::XPUDeviceContext, int>,
cast, ops::CastXPUKernel<paddle::platform::XPUDeviceContext, int32_t>,
ops::CastXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::CastXPUKernel<paddle::platform::XPUDeviceContext, int64_t>);
#endif
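The cast kernel keeps its output-type dispatch (FP32 / INT32 / INT64) and only swaps xpu::cast for xpu::cast_v2, with input kernels registered for int32, float, and int64. A hedged sketch that touches each output branch (again assuming an XPU build; values illustrative):

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
x = fluid.data(name="x", shape=[4], dtype="int32")
to_fp32 = fluid.layers.cast(x, "float32")     # int32 input -> FP32 branch
to_i64 = fluid.layers.cast(x, "int64")        # int32 input -> INT64 branch
to_i32 = fluid.layers.cast(to_fp32, "int32")  # fp32 input -> INT32 branch
exe = fluid.Executor(paddle.XPUPlace(0))
outs = exe.run(feed={"x": np.arange(4, dtype="int32")},
               fetch_list=[to_fp32, to_i64, to_i32])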
......@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/operators/slice_op.h"
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "paddle/fluid/operators/slice_op.h"
#include "xpu/refactor/math.h"
namespace paddle {
namespace operators {
......@@ -85,10 +85,8 @@ class SliceXPUKernel : public framework::OpKernel<T> {
auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto* in_data = in->data<T>();
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
int r = xpu::slice_forward(dev_ctx.x_context(), shape.data(),
starts_extension.data(), ends_extension.data(),
shape_size, in_data, out_data);
int r = xpu::slice<T>(dev_ctx.x_context(), in_data, out_data, shape,
starts_extension, ends_extension);
PADDLE_ENFORCE_EQ(r, XPU_SUCCESS,
platform::errors::External("XPU slice kernel error!"));
}
......@@ -149,12 +147,14 @@ class SliceGradXPUKernel : public framework::OpKernel<T> {
shape_size > axes.size() ? starts_extension.data() : starts.data();
int* ends_host =
shape_size > axes.size() ? ends_extension.data() : ends.data();
PADDLE_ENFORCE_EQ(
xpu_malloc((void**)(&starts_device), shape_size * sizeof(int)),
XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
PADDLE_ENFORCE_EQ(
xpu_malloc((void**)(&ends_device), shape_size * sizeof(int)),
XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void**>(&starts_device),
shape_size * sizeof(int)),
XPU_SUCCESS,
platform::errors::External("XPU has no enough memory"));
PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void**>(&ends_device),
shape_size * sizeof(int)),
XPU_SUCCESS,
platform::errors::External("XPU has no enough memory"));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
starts_device, platform::CPUPlace(), starts_host,
shape_size * sizeof(int));
......@@ -168,9 +168,10 @@ class SliceGradXPUKernel : public framework::OpKernel<T> {
shape[i] = in_dims[i];
}
int* shape_device = nullptr;
PADDLE_ENFORCE_EQ(
xpu_malloc((void**)(&shape_device), shape_size * sizeof(int)),
XPU_SUCCESS, platform::errors::External("XPU has no enough memory"));
PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast<void**>(&shape_device),
shape_size * sizeof(int)),
XPU_SUCCESS,
platform::errors::External("XPU has no enough memory"));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()),
shape_device, platform::CPUPlace(), shape.data(),
shape_size * sizeof(int));
......@@ -196,7 +197,8 @@ class SliceGradXPUKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(
slice, ops::SliceXPUKernel<paddle::platform::XPUDeviceContext, float>);
slice, ops::SliceXPUKernel<paddle::platform::XPUDeviceContext, float>,
ops::SliceXPUKernel<paddle::platform::XPUDeviceContext, int>);
REGISTER_OP_XPU_KERNEL(
slice_grad,
ops::SliceGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
......
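Note that the forward slice kernel is now registered for int as well as float, while slice_grad remains float-only. A hedged sketch of slicing an integer tensor on XPU — forward only, since no integer grad kernel exists; shapes are illustrative:

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
x = fluid.data(name="x", shape=[3, 4, 5, 6], dtype="int32")
# Hits the newly registered int32 forward kernel; no backward pass is requested.
y = fluid.layers.slice(x, axes=[0, 1, 2], starts=[1, 0, 2], ends=[3, 3, 4])
exe = fluid.Executor(paddle.XPUPlace(0))
out, = exe.run(feed={"x": np.arange(360, dtype="int32").reshape(3, 4, 5, 6)},
               fetch_list=[y])
assert out.shape == (2, 3, 2, 6)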
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_XPU
#include <memory>
#include "paddle/fluid/operators/top_k_op.h"
#include "xpu/refactor/math.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class TopkXPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
// Get the top k elements of each row of input tensor
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
auto* indices = ctx.Output<Tensor>("Indices");
size_t k = static_cast<int>(ctx.Attr<int>("k"));
auto* k_t = ctx.Input<Tensor>("K");
if (k_t) {
k = k_t->data<int>()[0];
framework::DDim output_dims = output->dims();
output_dims[output_dims.size() - 1] = k;
output->Resize(output_dims);
indices->Resize(output_dims);
}
T* output_data = output->mutable_data<T>(ctx.GetPlace());
int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
Tensor indices_32_data_tensor;
int32_t* indices_int_data = indices_32_data_tensor.mutable_data<int32_t>(
ctx.GetPlace(), indices->numel());
// reshape input to a flattened matrix (like flat_inner_dims)
framework::DDim inputdims = input->dims();
const size_t row = framework::product(
framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
const size_t col = inputdims[inputdims.size() - 1];
auto& dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
int ret = xpu::sorted_topk<T>(dev_ctx.x_context(), input->data<T>(),
output_data, indices_int_data, row, col, k);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API returned wrong value [%d] when calling kernel "
"[%s]; please check "
"whether the Baidu Kunlun card is properly installed.",
ret, "sorted_topk"));
ret = xpu::cast_v2<int32_t, int64_t>(dev_ctx.x_context(),
(const int32_t*)indices_int_data,
indices_data, indices->numel());
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API returned wrong value [%d] when calling kernel "
"[%s]; please check "
"whether the Baidu Kunlun card is properly installed.",
ret, "cast_v2"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(top_k, ops::TopkXPUKernel<float>);
#endif
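The kernel sorts each row, keeps the top k, writes int32 indices into a scratch tensor, and then widens them via cast_v2 to the int64 indices the op definition requires. A hedged op-level sketch with a numpy reference mirroring the unit test below (assumes an XPU build; shapes illustrative):

import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()
x = fluid.data(name="x", shape=[100, 5], dtype="float32")
values, indices = fluid.layers.topk(x, k=3)  # indices come back as int64
exe = fluid.Executor(paddle.XPUPlace(0))
data = np.random.random((100, 5)).astype("float32")
v, idx = exe.run(feed={"x": data}, fetch_list=[values, indices])
# Reference: per-row descending sort, keep the first k, as the test below does.
assert np.allclose(v, np.sort(data, axis=-1)[:, ::-1][:, :3])
assert idx.dtype == np.int64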
......@@ -12,21 +12,20 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle
import numpy as np
import sys
import unittest
sys.path.append("..")
import paddle
import paddle.fluid.core as core
from op_test import OpTest
import paddle.fluid as fluid
import paddle.fluid.layers as layers
paddle.enable_static()
# Situation 1: starts(list, no tensor), ends(list, no tensor)
# 1.1 without attr(decrease)
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestSliceOp(OpTest):
def setUp(self):
self.op_type = "slice"
......@@ -42,7 +41,7 @@ class TestSliceOp(OpTest):
}
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float64")
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [1, 0, 2]
self.ends = [3, 3, 4]
self.axes = [0, 1, 2]
......@@ -58,9 +57,11 @@ class TestSliceOp(OpTest):
self.check_grad_with_place(place, ['Input'], 'Out')
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestCase1(TestSliceOp):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float64")
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-3, 0, 2]
self.ends = [3, 100, -1]
self.axes = [0, 1, 2]
......@@ -68,9 +69,11 @@ class TestCase1(TestSliceOp):
self.out = self.input[-3:3, 0:100, 2:-1, :]
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestCase2(TestSliceOp):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float64")
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-3, 0, 2]
self.ends = [3, 100, -1]
self.axes = [0, 1, 3]
......@@ -79,6 +82,8 @@ class TestCase2(TestSliceOp):
# 1.2 with attr(decrease)
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestSliceOp_decs_dim(OpTest):
def setUp(self):
self.op_type = "slice"
......@@ -95,7 +100,7 @@ class TestSliceOp_decs_dim(OpTest):
}
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float64")
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [1, 0, 2]
self.ends = [2, 3, 4]
self.axes = [0, 1, 2]
......@@ -112,9 +117,11 @@ class TestSliceOp_decs_dim(OpTest):
self.check_grad_with_place(place, ['Input'], 'Out')
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float64")
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [1, 0, 2]
self.ends = [2, 1, 4]
self.axes = [0, 1, 2]
......@@ -123,9 +130,11 @@ class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim):
self.out = self.input[1, 0, 2:4, :]
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float64")
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-1, 0, 2]
self.ends = [1000000, 1, 4]
self.axes = [0, 1, 2]
......@@ -134,9 +143,11 @@ class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim):
self.out = self.input[-1, 0, 2:4, :]
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 7]).astype("float64")
self.input = np.random.random([3, 4, 5, 7]).astype("float32")
self.starts = [0, 1, 2, 3]
self.ends = [1, 2, 3, 4]
self.axes = [0, 1, 2, 3]
......@@ -145,9 +156,11 @@ class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim):
self.out = self.input[0, 1, 2, 3:4]
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float64")
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [-1]
self.ends = [1000000]
self.axes = [3]
......@@ -156,9 +169,11 @@ class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim):
self.out = self.input[:, :, :, -1]
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim):
def config(self):
self.input = np.random.random([3, 4, 5, 6]).astype("float64")
self.input = np.random.random([3, 4, 5, 6]).astype("float32")
self.starts = [0, 1, 2, 3]
self.ends = [1, 2, 3, 4]
self.axes = [0, 1, 2, 3]
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import sys
sys.path.append("..")
from paddle.fluid.op import Operator
import paddle.fluid.core as core
import paddle.fluid as fluid
import paddle
from op_test import OpTest
paddle.enable_static()
@unittest.skipIf(not paddle.is_compiled_with_xpu(),
"core is not compiled with XPU")
class TestTopkOp(OpTest):
def setUp(self):
self.variable_k = False
self.use_xpu = True
self.set_args()
self.op_type = "top_k"
self.dtype = np.float32
self.init_dtype()
k = self.top_k
input = np.random.random((self.row, k)).astype(self.dtype)
output = np.ndarray((self.row, k))
indices = np.ndarray((self.row, k)).astype("int64")
self.inputs = {'X': input}
if self.variable_k:
self.inputs['K'] = np.array([k]).astype("int32")
else:
self.attrs = {'k': k}
for rowid in range(self.row):
row = input[rowid]
output[rowid] = np.sort(row)[::-1][:k]
indices[rowid] = row.argsort()[::-1][:k]
self.outputs = {'Out': output, 'Indices': indices}
def init_dtype(self):
self.dtype = np.float32
def set_args(self):
self.row = 100
self.top_k = 1
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
def test_check_grad(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad_with_place(place, ['X'], 'Out')
if __name__ == "__main__":
unittest.main()