From fee4316d0631cc1ae89628d4864575efdebe331d Mon Sep 17 00:00:00 2001
From: tanzhipeng <51696454+tiancaitzp@users.noreply.github.com>
Date: Mon, 7 Feb 2022 19:47:52 +0800
Subject: [PATCH] add sequence_conv op in xpu place (#39025)

---
Reviewer notes (everything between the "---" line and the first "diff --git"
line is ignored by `git am`):

  * Line breaks and indentation were rebuilt from a whitespace-collapsed copy
    of this mail. The number of added lines per file matches the hunk headers
    and the diffstat (288 / 4 / 277), but exact leading whitespace -- in
    particular that of the unchanged context lines in the xpu2_op_list.h
    hunk -- is a best-effort reconstruction. Confirm before applying.
  * The collapsed copy had also lost every angle-bracketed C++ template
    argument list. They are restored here following the usual Paddle kernel
    conventions (OpKernel<T>, Input<LoDTensor>, Attr<int>, data<T>, ...).
    Confirm against commit fee4316d.
  * No logic was changed. Two things in SequenceConvGradXPUKernel look worth
    a follow-up: the status returned by the xpu::constant calls that zero
    in_g and filter_g is discarded (the first xpu::constant call is checked),
    and xpu_wait is only reached inside the `if (filter_g)` branch.

 .../sequence_ops/sequence_conv_op_xpu.cc      | 288 ++++++++++++++++++
 .../fluid/platform/device/xpu/xpu2_op_list.h  |   4 +
 .../xpu/test_sequence_conv_op_xpu.py          | 277 +++++++++++++++++
 3 files changed, 569 insertions(+)
 create mode 100644 paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc
 create mode 100755 python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py

diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc
new file mode 100644
index 0000000000..6c33ff5204
--- /dev/null
+++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op_xpu.cc
@@ -0,0 +1,288 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/operators/sequence_ops/sequence_conv_op.h"
+#include "paddle/fluid/platform/device/device_wrapper.h"
+
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class SequenceConvXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    auto filter = *context.Input<Tensor>("Filter");
+
+    out->mutable_data<T>(context.GetPlace());
+
+    int context_start = context.Attr<int>("contextStart");
+    int context_length = context.Attr<int>("contextLength");
+    int context_stride = context.Attr<int>("contextStride");
+    bool padding_trainable = context.Attr<bool>("paddingTrainable");
+
+    PADDLE_ENFORCE_EQ(
+        in->lod().empty(), false,
+        platform::errors::InvalidArgument("Input(X) Tensor of SequenceConvOp "
+                                          "does not contain LoD information."));
+    PADDLE_ENFORCE_EQ(
+        in->lod().size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Only support input sequence with lod level equal to 1 at "
+            "present. But received: lod level %u.",
+            in->lod().size()));
+
+    PADDLE_ENFORCE_EQ(
+        padding_trainable, false,
+        platform::errors::InvalidArgument("Only support padding_trainable "
+                                          "equal false."));
+
+    int up_pad = std::max(0, -context_start);
+    int down_pad = std::max(0, context_start + context_length - 1);
+    PADDLE_ENFORCE_EQ(up_pad, 2, platform::errors::InvalidArgument(
+                                     "Only support up_pad equal 2."));
+    PADDLE_ENFORCE_EQ(down_pad, 2, platform::errors::InvalidArgument(
+                                       "Only support down_pad equal 2."));
+
+    auto xpu_context =
+        context.template device_context<DeviceContext>().x_context();
+    auto sequence_width = static_cast<int64_t>(in->dims()[1]);
+    framework::DDim col_shape = {in->dims()[0],
+                                 context_length * sequence_width};
+    xpu::ctx_guard RAII_GUARD(xpu_context);
+    int col_numel = col_shape[0] * col_shape[1];
+    T* col_data = RAII_GUARD.alloc_l3_or_gm<T>(col_numel);
+    PADDLE_ENFORCE_NOT_NULL(
+        col_data, paddle::platform::errors::Fatal("XPU memory is not enough"));
+
+    auto lod_level_0 = in->lod()[0];
+    int lod_size = lod_level_0.size();
+    // If batch size set to 256, the lod is {0, batch[0] - 0,
+    // batch[1] - batch [0], ..., batch[255] - batch[254]},
+    // so the lod_size will be 257.
+    PADDLE_ENFORCE_LE(lod_size, 257, platform::errors::InvalidArgument(
+                                         "Only support batch size <= 256."));
+
+    std::vector<int> cpu_lodx(lod_size);
+    for (int i = 0; i < lod_size; i++) {
+      cpu_lodx[i] = lod_level_0[i];
+    }
+    xpu::VectorParam<int> lodx = {cpu_lodx.data(),
+                                  static_cast<int>(cpu_lodx.size()), nullptr};
+
+    int r = xpu::sequence_context_projection<T, int>(
+        xpu_context, in->data<T>(), col_data, nullptr, lodx, sequence_width,
+        context_start, context_length, context_stride, {2, 2});
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "sequence_context_projection");
+
+    bool trans_a = false;
+    bool trans_b = false;
+    int m = col_shape[0];
+    int k = col_shape[1];
+    int k1 = filter.dims()[0];
+    int n = filter.dims()[1];
+    PADDLE_ENFORCE_EQ(k, k1,
+                      platform::errors::InvalidArgument(
+                          "The shape of FC in SequenceConvOp is invalid."
+                          "The k of matrix A is %d, k1 of matrix B is %d."
+                          "But expect k == k1",
+                          k, k1));
+    int lda = (!trans_a) ? k : m;
+    int ldb = (!trans_b) ? n : k;
+    int ldc = n;
+    T alpha = static_cast<T>(1.0);
+    T beta = static_cast<T>(0.0);
+    const T* data_a = col_data;
+    const T* data_b = filter.data<T>();
+    T* data_c = out->data<T>();
+
+    r = xpu::fc_fusion<T, T, T, int32_t>(
+        xpu_context, data_a, data_b, data_c, m, n, k, trans_a, trans_b, nullptr,
+        nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr,
+        xpu::Activation_t::LINEAR);
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_fusion");
+    if (xpu_context->xpu_stream != nullptr) {
+      xpu_wait(xpu_context->xpu_stream);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SequenceConvGradXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
+    auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* filter_g = context.Output<Tensor>(framework::GradVarName("Filter"));
+    auto* in = context.Input<LoDTensor>("X");
+    auto* filter = context.Input<Tensor>("Filter");
+
+    int context_start = context.Attr<int>("contextStart");
+    int context_length = context.Attr<int>("contextLength");
+    int context_stride = context.Attr<int>("contextStride");
+    bool padding_trainable = context.Attr<bool>("paddingTrainable");
+
+    PADDLE_ENFORCE_EQ(
+        in->lod().empty(), false,
+        platform::errors::InvalidArgument("Input(X) Tensor of SequenceConvOp "
+                                          "does not contain LoD information."));
+    PADDLE_ENFORCE_EQ(
+        in->lod().size(), 1UL,
+        platform::errors::InvalidArgument(
+            "Only support input sequence with lod level equal to 1 at "
+            "present. But received: lod level %u.",
+            in->lod().size()));
+
+    PADDLE_ENFORCE_EQ(
+        padding_trainable, false,
+        platform::errors::InvalidArgument("Only support padding_trainable "
+                                          "equal false."));
+
+    int up_pad = std::max(0, -context_start);
+    int down_pad = std::max(0, context_start + context_length - 1);
+    PADDLE_ENFORCE_EQ(up_pad, 2, platform::errors::InvalidArgument(
+                                     "Only support up_pad equal 2."));
+    PADDLE_ENFORCE_EQ(down_pad, 2, platform::errors::InvalidArgument(
+                                       "Only support down_pad equal 2."));
+
+    auto lod_level_0 = in->lod()[0];
+    int lod_size = lod_level_0.size();
+    PADDLE_ENFORCE_LE(lod_size, 257, platform::errors::InvalidArgument(
+                                         "Only support batch size <= 256."));
+
+    std::vector<int> cpu_lodx(lod_size);
+    for (int i = 0; i < lod_size; i++) {
+      cpu_lodx[i] = lod_level_0[i];
+    }
+    xpu::VectorParam<int> lodx = {cpu_lodx.data(),
+                                  static_cast<int>(cpu_lodx.size()), nullptr};
+
+    auto xpu_context =
+        context.template device_context<DeviceContext>().x_context();
+    auto sequence_width = static_cast<int64_t>(in->dims()[1]);
+    framework::DDim col_shape = {in->dims()[0],
+                                 context_length * sequence_width};
+    xpu::ctx_guard RAII_GUARD(xpu_context);
+    int col_numel = col_shape[0] * col_shape[1];
+    T* col_data = RAII_GUARD.alloc_l3_or_gm<T>(col_numel);
+    PADDLE_ENFORCE_NOT_NULL(
+        col_data, paddle::platform::errors::Fatal("XPU memory is not enough"));
+
+    if (in_g || filter_g) {
+      int r = xpu::constant<T>(xpu_context, col_data, col_numel, T(0));
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
+
+      bool trans_a = false;
+      bool trans_b = true;
+      int m = out_g->dims()[0];
+      int k = out_g->dims()[1];
+      int n = filter->dims()[0];
+      int k1 = filter->dims()[1];
+      PADDLE_ENFORCE_EQ(k, k1,
+                        platform::errors::InvalidArgument(
+                            "The shape of FC in SequenceConvGradOp is invalid."
+                            "The k of matrix A is %d, k1 of matrix B is %d."
+                            "But expect k == k1",
+                            k, k1));
+      int lda = (!trans_a) ? k : m;
+      int ldb = (!trans_b) ? n : k;
+      int ldc = n;
+      T alpha = static_cast<T>(1.0);
+      T beta = static_cast<T>(0.0);
+      const T* data_a = out_g->data<T>();
+      const T* data_b = filter->data<T>();
+      T* data_c = col_data;
+
+      r = xpu::fc_fusion<T, T, T, int32_t>(
+          xpu_context, data_a, data_b, data_c, m, n, k, trans_a, trans_b,
+          nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr,
+          xpu::Activation_t::LINEAR);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_fusion");
+    }
+
+    if (in_g) {
+      PADDLE_ENFORCE_LT(sequence_width, 512,
+                        platform::errors::InvalidArgument(
+                            "Only support sequence_width < 512."));
+
+      in_g->mutable_data<T>(context.GetPlace());
+      in_g->set_lod(in->lod());
+      xpu::constant<T>(xpu_context, in_g->data<T>(), in_g->numel(), T(0));
+
+      int r = xpu::sequence_context_projection_grad<T, int>(
+          xpu_context, in_g->data<T>(), col_data, nullptr, lodx, sequence_width,
+          context_start, context_length, context_stride, {2, 2});
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "sequence_context_projection_grad");
+    }
+
+    if (filter_g) {
+      filter_g->mutable_data<T>(context.GetPlace());
+      xpu::constant<T>(xpu_context, filter_g->data<T>(), filter_g->numel(),
+                       T(0));
+
+      int r = xpu::sequence_context_projection<T, int>(
+          xpu_context, in->data<T>(), col_data, nullptr, lodx, sequence_width,
+          context_start, context_length, context_stride, {2, 2});
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "sequence_context_projection");
+
+      bool trans_a = true;
+      bool trans_b = false;
+      int k = col_shape[0];
+      int m = col_shape[1];
+      int k1 = out_g->dims()[0];
+      int n = out_g->dims()[1];
+      PADDLE_ENFORCE_EQ(k, k1,
+                        platform::errors::InvalidArgument(
+                            "The shape of FC in SequenceConvGradOp is invalid."
+                            "The k of matrix A is %d, k1 of matrix B is %d."
+                            "But expect k == k1",
+                            k, k1));
+      int lda = (!trans_a) ? k : m;
+      int ldb = (!trans_b) ? n : k;
+      int ldc = n;
+      T alpha = static_cast<T>(1.0);
+      T beta = static_cast<T>(0.0);
+      const T* data_a = col_data;
+      const T* data_b = out_g->data<T>();
+      T* data_c = filter_g->data<T>();
+
+      r = xpu::fc_fusion<T, T, T, int32_t>(
+          xpu_context, data_a, data_b, data_c, m, n, k, trans_a, trans_b,
+          nullptr, nullptr, nullptr, lda, ldb, ldc, alpha, beta, nullptr,
+          xpu::Activation_t::LINEAR);
+      PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_fusion");
+      if (xpu_context->xpu_stream != nullptr) {
+        xpu_wait(xpu_context->xpu_stream);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    sequence_conv,
+    ops::SequenceConvXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+REGISTER_OP_XPU_KERNEL(
+    sequence_conv_grad,
+    ops::SequenceConvGradXPUKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif
diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
index cb2b57474d..d73d6f0b81 100644
--- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h
+++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h
@@ -383,6 +383,10 @@ XPUOpMap& get_kl2_ops() {
                                   pOpKernelType(vartype::FP32, XPUPlace())})},
 
     // AddMore
+    {"sequence_conv",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
+    {"sequence_conv_grad",
+     XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
   };
 
   return s_xpu2_kernels;
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py
new file mode 100755
index 0000000000..2ad79dd0cc
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_sequence_conv_op_xpu.py
@@ -0,0 +1,277 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import paddle
+import random
+import sys
+sys.path.append("../")
+from op_test_xpu import XPUOpTest
+
+paddle.enable_static()
+np.set_printoptions(threshold=np.inf)
+
+
+def seqconv(x,
+            lod,
+            filter,
+            context_length,
+            context_start,
+            padding_trainable=False,
+            padding_data=None):
+    [T, M] = x.shape
+    col = np.zeros((T, context_length * M)).astype('float32')
+    offset = [0]
+    for seq_len in lod[0]:
+        offset.append(offset[-1] + seq_len)
+    begin_pad = np.max([0, -context_start])
+    for i in range(len(offset) - 1):
+        for j in range(context_length):
+            in_begin = offset[i] + context_start + j
+            in_end = offset[i + 1] + context_start + j
+            out_begin = offset[i]
+            out_end = offset[i + 1]
+            if in_begin < offset[i]:
+                pad_size = np.min(
+                    [offset[i] - in_begin, offset[i + 1] - offset[i]])
+                if padding_trainable:
+                    sub_w = padding_data[j:j + pad_size, :]
+                    col[offset[i]:offset[i] + pad_size, j * M:(j + 1) *
+                        M] = sub_w
+                out_begin = offset[i] + pad_size
+                in_begin = offset[i]
+
+            if in_end > offset[i + 1]:
+                pad_size = np.min(
+                    [in_end - offset[i + 1], offset[i + 1] - offset[i]])
+                if padding_trainable:
+                    sub_w = padding_data[begin_pad + context_start + j -
+                                         pad_size:begin_pad + context_start
+                                         + j, :]
+                    col[offset[i + 1] - pad_size:offset[i + 1], j * M:(j + 1) *
+                        M] = sub_w
+                in_end = offset[i + 1]
+                out_end = offset[i + 1] - pad_size
+            if in_end <= in_begin:
+                continue
+            in_sub = x[in_begin:in_end, :]
+            col[out_begin:out_end, j * M:(j + 1) * M] += in_sub
+    return np.dot(col, filter)
+
+
+class TestSeqProject(XPUOpTest):
+    def setUp(self):
+        self.init_test_case()
+        self.op_type = 'sequence_conv'
+        self.use_xpu = True
+
+        if self.context_length == 1 \
+                and self.context_start == 0 \
+                and self.padding_trainable:
+            print("If context_start is 0 " \
+                  "and context_length is 1," \
+                  " padding_trainable should be false.")
+            return
+
+        # one level, batch size
+        x = np.random.uniform(-6.10907e-05, 0.000104218,
+                              [self.input_size[0],
+                               self.input_size[1]]).astype('float32')
+        w = np.random.uniform(-3.17068e-05, 0.000159822, [
+            self.context_length * self.input_size[1], self.output_represention
+        ]).astype('float32')
+
+        begin_pad = np.max([0, -self.context_start])
+        end_pad = np.max([0, self.context_start + self.context_length - 1])
+        total_pad = begin_pad + end_pad
+        padding_data = np.random.uniform(
+            0, 0, [total_pad, self.input_size[1]]).astype('float32')
+        self.pad_data = padding_data
+        self.inputs = {
+            'X': (x, self.lod),
+            'Filter': w,
+        }
+        self.inputs_val = ['X', 'Filter']
+        self.inputs_val_no_x = ['Filter']
+        self.inputs_val_no_f = ['X']
+
+        if total_pad != 0:
+            self.inputs['PaddingData'] = padding_data
+            self.inputs_val = ['X', 'PaddingData', 'Filter']
+            self.inputs_val_no_x = ['PaddingData', 'Filter']
+            self.inputs_val_no_f = ['PaddingData', 'X']
+
+        self.attrs = {
+            'contextStart': self.context_start,
+            'contextLength': self.context_length,
+            'paddingTrainable': self.padding_trainable,
+            'contextStride': self.context_stride
+        }
+        out = seqconv(x, self.lod, w, self.context_length, self.context_start,
+                      self.padding_trainable, self.pad_data)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        place = paddle.XPUPlace(0)
+        self.check_output_with_place(place)
+
+    def test_check_grad_input(self):
+        self.check_grad(['X'], 'Out', no_grad_set=set(self.inputs_val_no_x))
+
+    def test_check_grad_padding_data(self):
+        if self.padding_trainable:
+            self.check_grad(
+                ['PaddingData'], 'Out', no_grad_set=set(['X', 'Filter']))
+
+    def test_check_grad_Filter(self):
+        self.check_grad(
+            ['Filter'], 'Out', no_grad_set=set(self.inputs_val_no_f))
+
+    def test_check_grad_input_filter(self):
+        if self.padding_trainable:
+            self.check_grad(
+                ['X', 'Filter'], 'Out', no_grad_set=set(['PaddingData']))
+
+    def test_check_grad_padding_input(self):
+        if self.padding_trainable:
+            self.check_grad(
+                self.inputs_val_no_f, 'Out', no_grad_set=set(['Filter']))
+
+    def test_check_grad_padding_filter(self):
+        if self.padding_trainable:
+            self.check_grad(self.inputs_val_no_x, 'Out', no_grad_set=set(['X']))
+
+    def init_test_case(self):
+        self.input_row = 7
+        self.input_col = 25
+        self.context_start = -2
+        self.context_length = 5
+        self.padding_trainable = False
+        self.context_stride = 1
+
+        self.input_size = [self.input_row, self.input_col]
+        offset_lod = [[0, 1, self.input_row]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
+        self.output_represention = 8  # output feature size
+
+
+class TestSeqProjectCase1(TestSeqProject):
+    def init_test_case(self):
+        self.input_row = 11
+        self.context_start = -2
+        self.context_length = 5
+        self.padding_trainable = False
+        self.context_stride = 1
+
+        self.input_size = [self.input_row, 50]
+        offset_lod = [[0, 4, 5, 8, self.input_row]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
+        self.output_represention = 8  # output feature size
+
+
+class TestSeqProjectCase2Len0(TestSeqProject):
+    def init_test_case(self):
+        self.input_row = 11
+        self.context_start = -2
+        self.context_length = 5
+        self.padding_trainable = False
+        self.context_stride = 1
+
+        self.input_size = [self.input_row, 50]
+        offset_lod = [[0, 0, 4, 5, 5, 8, self.input_row, self.input_row]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
+        self.output_represention = 8  # output feature size
+
+
+class TestSeqProjectCase3(TestSeqProject):
+    def init_test_case(self):
+        self.input_row = 25
+        self.context_start = -2
+        self.context_length = 5
+        self.padding_trainable = False
+        self.context_stride = 1
+
+        self.input_size = [self.input_row, 25]
+        idx = list(range(self.input_size[0]))
+        del idx[0]
+        offset_lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
+                      [self.input_size[0]]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
+        self.output_represention = 8  # output feature size
+
+
+class TestSeqProjectCase4(TestSeqProject):
+    def init_test_case(self):
+        self.input_row = 7835
+        self.input_col = 128
+        self.context_start = -2
+        self.context_length = 5
+        self.padding_trainable = False
+        self.context_stride = 1
+
+        self.input_size = [self.input_row, self.input_col]
+        offset_lod = [[
+            0, 1, 2, 3, 131, 241, 242, 263, 264, 265, 266, 267, 268, 387, 515,
+            516, 644, 645, 772, 794, 922, 923, 924, 944, 945, 1073, 1074, 1202,
+            1330, 1458, 1556, 1557, 1558, 1686, 1748, 1876, 1912, 1913, 1914,
+            2032, 2066, 2194, 2308, 2309, 2347, 2475, 2476, 2477, 2478, 2606,
+            2607, 2735, 2736, 2737, 2738, 2838, 2966, 2967, 2968, 2969, 3097,
+            3225, 3353, 3481, 3482, 3520, 3642, 3643, 3754, 3882, 3883, 4010,
+            4011, 4012, 4140, 4219, 4228, 4356, 4357, 4415, 4475, 4476, 4604,
+            4605, 4606, 4694, 4695, 4808, 4936, 4961, 4962, 5004, 5132, 5260,
+            5312, 5440, 5441, 5569, 5570, 5675, 5676, 5750, 5810, 5811, 5939,
+            6021, 6149, 6277, 6278, 6364, 6425, 6519, 6647, 6648, 6739, 6867,
+            6995, 6996, 7120, 7223, 7244, 7367, 7407, 7408, 7467, 7595, 7699,
+            7827, 7835
+        ]]
+        self.lod = [[]]
+        # convert from offset-based lod to length-based lod
+        for i in range(len(offset_lod[0]) - 1):
+            self.lod[0].append(offset_lod[0][i + 1] - offset_lod[0][i])
+        self.output_represention = 8  # output feature size
+
+
+class TestSeqConvApi(unittest.TestCase):
+    def test_api(self):
+        import paddle.fluid as fluid
+
+        x = fluid.layers.data('x', shape=[32], lod_level=1)
+        y = fluid.layers.sequence_conv(
+            input=x, num_filters=2, filter_size=3, padding_start=None)
+
+        place = fluid.CPUPlace()
+        x_tensor = fluid.create_lod_tensor(
+            np.random.rand(10, 32).astype("float32"), [[2, 3, 1, 4]], place)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        ret = exe.run(feed={'x': x_tensor}, fetch_list=[y], return_numpy=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab