Unverified commit 689de12c authored by houj04, committed by GitHub

[XPU] add fp16 support for top_k_v2, squeeze2 and argsort. (#50614)

Parent: 1c8e15c9
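
What the change enables at the Python level, as a minimal sketch (assumes a Paddle build with XPU support and an attached XPU device; shapes and values are illustrative only):

import numpy as np
import paddle

paddle.set_device('xpu')
x = paddle.to_tensor(np.random.rand(10, 20).astype('float16'))
values, indices = paddle.topk(x, k=3, axis=1)            # top_k_v2 now has an fp16 XPU kernel
order = paddle.argsort(x, axis=1, descending=True)       # argsort now has an fp16 XPU kernel
y = paddle.squeeze(paddle.unsqueeze(x, axis=0), axis=0)  # squeeze2 also gains an fp16 kernel
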
......@@ -40,6 +40,7 @@ XPUOpMap& get_kl2_ops() {
{"argsort",
XPUKernelSet({phi::DataType::INT32,
phi::DataType::INT64,
phi::DataType::FLOAT16,
phi::DataType::FLOAT32})},
{"assign",
XPUKernelSet({phi::DataType::FLOAT32,
......@@ -598,6 +599,7 @@ XPUOpMap& get_kl2_ops() {
phi::DataType::BOOL,
phi::DataType::INT8,
phi::DataType::UINT8,
phi::DataType::FLOAT16,
phi::DataType::FLOAT32})},
{"squeeze",
XPUKernelSet({phi::DataType::FLOAT64,
......@@ -665,7 +667,8 @@ XPUOpMap& get_kl2_ops() {
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"truncated_gaussian_random", XPUKernelSet({phi::DataType::FLOAT32})},
{"top_k", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"top_k_v2", XPUKernelSet({phi::DataType::FLOAT32})},
{"top_k_v2",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"update_loss_scaling",
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
{"unbind", XPUKernelSet({phi::DataType::FLOAT32})},
......
......@@ -207,34 +207,45 @@ void ArgsortKernel(const Context& dev_ctx,
}
}
using XPUType = typename XPUTypeTrait<T>::Type;
if (int64_need_cast) {
XPUArgsort<T, true, true>()(dev_ctx.x_context(),
input_data,
output_data,
indices_data,
data_shape,
permute_vec,
descending);
XPUArgsort<XPUType, true, true>()(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(input_data),
reinterpret_cast<XPUType*>(output_data),
indices_data,
data_shape,
permute_vec,
descending);
} else if (index_need_cast) {
XPUArgsort<T, false, true>()(dev_ctx.x_context(),
input_data,
output_data,
indices_data,
data_shape,
permute_vec,
descending);
XPUArgsort<XPUType, false, true>()(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(input_data),
reinterpret_cast<XPUType*>(output_data),
indices_data,
data_shape,
permute_vec,
descending);
} else {
XPUArgsort<T, false, false>()(dev_ctx.x_context(),
input_data,
output_data,
indices_data,
data_shape,
permute_vec,
descending);
XPUArgsort<XPUType, false, false>()(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(input_data),
reinterpret_cast<XPUType*>(output_data),
indices_data,
data_shape,
permute_vec,
descending);
}
}
} // namespace phi
PD_REGISTER_KERNEL(
argsort, XPU, ALL_LAYOUT, phi::ArgsortKernel, float, int, int64_t) {}
PD_REGISTER_KERNEL(argsort,
XPU,
ALL_LAYOUT,
phi::ArgsortKernel,
float,
int,
int64_t,
phi::dtype::float16) {}
......@@ -28,6 +28,8 @@ void TopkKernel(const Context& dev_ctx,
bool sorted,
DenseTensor* out,
DenseTensor* indices) {
using XPUType = typename XPUTypeTrait<T>::Type;
const auto& in_dims = x.dims();
const T* in_data = x.data<T>();
int64_t* indices_data = dev_ctx.template Alloc<int64_t>(indices);
......@@ -59,13 +61,13 @@ void TopkKernel(const Context& dev_ctx,
const size_t row =
phi::product(phi::slice_ddim(in_dims, 0, in_dims.size() - 1));
const size_t col = in_dims[in_dims.size() - 1];
int r = xpu::sorted_topk<T>(dev_ctx.x_context(),
in_data,
output_data,
indices_int_data,
row,
col,
k);
int r = xpu::sorted_topk<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(in_data),
reinterpret_cast<XPUType*>(output_data),
indices_int_data,
row,
col,
k);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");
r = xpu::cast<int32_t, int64_t>(dev_ctx.x_context(),
......@@ -97,11 +99,14 @@ void TopkKernel(const Context& dev_ctx,
}
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
T* trans_in_data = RAII_GUARD.alloc_l3_or_gm<T>(x.numel());
XPUType* trans_in_data = RAII_GUARD.alloc_l3_or_gm<XPUType>(x.numel());
// Transpose and save the intermediate output to trans_in
int r = xpu::transpose<T>(
dev_ctx.x_context(), in_data, trans_in_data, x_shape_host, trans_axes);
int r = xpu::transpose<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(in_data),
trans_in_data,
x_shape_host,
trans_axes);
PADDLE_ENFORCE_EQ(r,
xpu::Error_t::SUCCESS,
errors::External("XPU API 1st Transpose kernel"
......@@ -109,7 +114,7 @@ void TopkKernel(const Context& dev_ctx,
r,
XPUAPIErrorMsg[r]));
T* trans_out_data = RAII_GUARD.alloc_l3_or_gm<T>(out->numel());
XPUType* trans_out_data = RAII_GUARD.alloc_l3_or_gm<XPUType>(out->numel());
int64_t* trans_idx_data = RAII_GUARD.alloc_l3_or_gm<int64_t>(out->numel());
int32_t* trans_idx_int32_data =
RAII_GUARD.alloc_l3_or_gm<int32_t>(out->numel());
......@@ -118,13 +123,14 @@ void TopkKernel(const Context& dev_ctx,
const size_t col = trans_dims[trans_dims.size() - 1];
// Do top k on transposed input
r = xpu::sorted_topk<T>(dev_ctx.x_context(),
trans_in_data,
trans_out_data,
trans_idx_int32_data,
row,
col,
k);
r = xpu::sorted_topk<XPUType>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(trans_in_data),
reinterpret_cast<XPUType*>(trans_out_data),
trans_idx_int32_data,
row,
col,
k);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "sorted_topk");
r = xpu::cast<int32_t, int64_t>(dev_ctx.x_context(),
......@@ -146,11 +152,12 @@ void TopkKernel(const Context& dev_ctx,
for (size_t i = 0; i < trans_back_axes.size(); ++i) {
trans_out_shape_host[i] = trans_out_dims[i];
}
r = xpu::transpose<T>(dev_ctx.x_context(),
trans_out_data,
output_data,
trans_out_shape_host,
trans_back_axes);
r = xpu::transpose<XPUType>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(trans_out_data),
reinterpret_cast<XPUType*>(output_data),
trans_out_shape_host,
trans_back_axes);
PADDLE_ENFORCE_EQ(r,
xpu::Error_t::SUCCESS,
errors::External("XPU API 2nd Transpose kernel"
......@@ -173,4 +180,5 @@ void TopkKernel(const Context& dev_ctx,
} // namespace phi
PD_REGISTER_KERNEL(topk, XPU, ALL_LAYOUT, phi::TopkKernel, float) {}
PD_REGISTER_KERNEL(
topk, XPU, ALL_LAYOUT, phi::TopkKernel, float, phi::dtype::float16) {}
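
For reference, the non-trailing-axis path in the kernel above (transpose the target axis to the last position, run sorted_topk on contiguous rows, then transpose back) can be mirrored in a few lines of numpy; this is an illustrative sketch with a hypothetical helper name, not the kernel code itself:

import numpy as np

def topk_via_transpose(x, k, axis):
    # Assumes a non-negative axis, as already normalized by the kernel.
    # Move `axis` to the end so top-k operates on contiguous rows.
    perm = list(range(x.ndim))
    perm.append(perm.pop(axis))
    xt = np.transpose(x, perm)
    idx = np.argsort(-xt, axis=-1)[..., :k]       # indices of the k largest entries
    vals = np.take_along_axis(xt, idx, axis=-1)
    inv = np.argsort(perm)                         # inverse permutation
    return np.transpose(vals, inv), np.transpose(idx, inv)
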
......@@ -185,7 +185,9 @@ class XPUTestArgsortOp_LargeN(XPUOpTestWrapper):
support_types = get_xpu_op_support_types('argsort')
for stype in support_types:
create_test_class(globals(), XPUTestArgsortOp, stype)
create_test_class(globals(), XPUTestArgsortOp_LargeN, stype)
if stype != "float16":
# Skip the fp16 test on the large input: fp16's low precision produces many duplicate keys,
# and the sort is not stable, so the resulting indices can legitimately differ from the
# reference (see the tie illustration after this file).
create_test_class(globals(), XPUTestArgsortOp_LargeN, stype)
if __name__ == '__main__':
unittest.main()
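
The ties that motivate the skip above are easy to reproduce: casting many uniform draws to fp16 collapses them onto a small set of representable values, and an unstable sort may order equal keys differently from the numpy reference. A quick illustration (values are illustrative):

import numpy as np

x = np.random.uniform(-10, 10, 16384).astype(np.float16)
print(x.size, np.unique(x).size)  # far fewer unique values than elements, i.e. many tied keys
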
......@@ -30,6 +30,20 @@ import paddle
paddle.enable_static()
def random_unique_float(shape, dtype):
# draw 10x more random values than needed, deduplicate, and keep the first `numel` unique values
numel = np.prod(shape)
arr = np.random.uniform(-10.0, 10.0, numel * 10).astype(dtype)
arr = np.unique(arr)
assert (
arr.shape[0] >= numel
), "failed to create enough unique values: %d vs %d" % (arr.shape[0], numel)
arr = arr[:numel]
np.random.shuffle(arr)
arr = arr.reshape(shape)
return arr
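
random_unique_float draws 10x as many samples as requested and keeps only the unique values. That is comfortable for float32/float64, but float16 has only a few tens of thousands of representable values in (-10.0, 10.0), so the large fp16 shapes below are shrunk to keep the assertion satisfiable. A quick way to count those values (a standalone sketch, not part of the test):

import numpy as np

# Enumerate every float16 bit pattern and count the finite values inside (-10, 10).
all_f16 = np.arange(2**16, dtype=np.uint32).astype(np.uint16).view(np.float16)
finite = all_f16[np.isfinite(all_f16)]
in_range = finite[(finite > -10.0) & (finite < 10.0)]
print(np.unique(in_range).size)  # roughly 3.5e4, an upper bound on what np.unique can return
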
def numpy_topk(x, k=1, axis=-1, largest=True):
if axis < 0:
axis = len(x.shape) + axis
......@@ -52,16 +66,14 @@ class XPUTestTopKV2Op(XPUOpTestWrapper):
self.use_dynamic_create_class = False
class TestTopkOp(XPUOpTest):
def init_args(self):
self.k = 3
self.axis = 1
self.largest = True
self.input_data = np.random.rand(10, 20).astype(self.dtype)
def setUp(self):
self.place = paddle.XPUPlace(0)
self.op_type = "top_k_v2"
self.init_args()
self.dtype = self.in_type
self.init_args()
self.input_data = random_unique_float(
self.input_data_shape, self.dtype
)
self.inputs = {'X': self.input_data}
self.attrs = {
'k': self.k,
......@@ -74,98 +86,112 @@ class XPUTestTopKV2Op(XPUOpTestWrapper):
self.outputs = {'Out': output, 'Indices': indices}
def test_check_output(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_output_with_place(place)
self.check_output_with_place(self.place)
def test_check_grad(self):
if paddle.is_compiled_with_xpu():
place = paddle.XPUPlace(0)
self.check_grad(set(['X']), 'Out')
self.check_grad_with_place(self.place, ['X'], 'Out')
def init_args(self):
self.k = 3
self.axis = 1
self.largest = True
self.input_data_shape = (10, 20)
class TestTopkOp1(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 1
self.largest = True
self.input_data = np.random.rand(100, 155).astype(self.dtype)
# fp16 has too few distinct values in (-10.0, 10.0) for a shape this large (see the count
# sketch after random_unique_float above), so random_unique_float would fail; use a smaller
# shape for fp16.
if self.dtype == np.float16:
self.input_data_shape = (100, 55)
else:
self.input_data_shape = (100, 155)
class TestTopkOp2(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 1
self.largest = True
self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
self.input_data_shape = (10, 10, 5)
class TestTopkOp3(TestTopkOp):
def init_args(self):
self.k = 5
self.axis = 1
self.largest = True
self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
self.input_data_shape = (10, 10, 5)
class TestTopkOp4(TestTopkOp):
def init_args(self):
self.k = 1
self.axis = 1
self.largest = True
self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
self.input_data_shape = (10, 10, 5)
class TestTopkOp5(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 2
self.largest = True
self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
self.input_data_shape = (10, 10, 5)
class TestTopkOp6(TestTopkOp):
def init_args(self):
self.k = 5
self.axis = 1
self.largest = True
self.input_data = np.random.rand(8, 32, 64).astype(self.dtype)
# As above: use a smaller shape for fp16 so random_unique_float can supply enough unique values.
if self.dtype == np.float16:
self.input_data_shape = (8, 32, 32)
else:
self.input_data_shape = (8, 32, 64)
class TestTopkOp7(TestTopkOp):
def init_args(self):
self.k = 10
self.axis = 2
self.largest = True
self.input_data = np.random.rand(8, 5, 10, 16).astype(self.dtype)
self.input_data_shape = (8, 5, 10, 16)
class TestTopkOp8(TestTopkOp):
def init_args(self):
self.k = 1
self.axis = 1
self.largest = True
self.input_data = np.random.rand(8, 32, 64).astype(self.dtype)
# As above: use a smaller shape for fp16 so random_unique_float can supply enough unique values.
if self.dtype == np.float16:
self.input_data_shape = (8, 32, 32)
else:
self.input_data_shape = (8, 32, 64)
class TestTopkOp9(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 1
self.largest = True
self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
self.input_data_shape = (10, 10, 5)
class TestTopkOp10(TestTopkOp):
def init_args(self):
self.k = 3
self.axis = 1
self.largest = True
self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
self.input_data_shape = (10, 10, 5)
class TestTopkOp11(TestTopkOp):
def init_args(self):
self.k = 5
self.axis = 1
self.largest = True
self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
self.input_data_shape = (10, 10, 5)
class TestTopkOp12(TestTopkOp):
def init_args(self):
self.k = 1
self.axis = 1
self.largest = True
self.input_data = np.random.rand(10, 10, 5).astype(self.dtype)
self.input_data_shape = (10, 10, 5)
support_types = get_xpu_op_support_types('top_k_v2')
......