From b6f8ccd278ea3ec170e7f7fd14f740bc2b9d5153 Mon Sep 17 00:00:00 2001
From: shanliang1992
Date: Fri, 23 Apr 2021 13:58:18 +0800
Subject: [PATCH] add lstm support on xpu test=kunlun (#32436)

---
 paddle/fluid/operators/rnn_op_xpu.cc          | 314 ++++++++++++++++++
 .../fluid/tests/unittests/op_test_xpu.py      |   2 +-
 .../tests/unittests/xpu/test_rnn_op_xpu.py    | 173 ++++++++++
 3 files changed, 488 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/operators/rnn_op_xpu.cc
 create mode 100755 python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py

diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc
new file mode 100644
index 0000000000..fb82d18e62
--- /dev/null
+++ b/paddle/fluid/operators/rnn_op_xpu.cc
@@ -0,0 +1,314 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/utils.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/xpu_header.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+using TensorList = std::vector<framework::Tensor>;
+
+template <typename TensorType, typename T>
+void reset_parameter_vector(const std::vector<TensorType>& raw_params_vec,
+                            const int& num_layers, const bool& is_bidirec,
+                            std::vector<std::vector<T*>>* params_vec) {
+  // the raw parameter sequence is [FWhi, FWhh, BWhi, BWhh] * num_layers
+  // + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameters to
+  // ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers
+  const int& direction_num = is_bidirec ? 2 : 1;
+  const int& layer_weight_size = 4 * direction_num;
+  const int& all_weight_size = num_layers * layer_weight_size;
+  const int& bias_start_idx = all_weight_size / 2;
+  for (int i = 0; i < num_layers; i++) {
+    params_vec->at(i).resize(layer_weight_size);
+    for (int j = 0; j < layer_weight_size; j++) {
+      int k = j % 4;
+      const int& section = j / 4;
+      int tensor_idx = i * 2 * direction_num + section * 2 + k % 2;
+      if (k >= 2) {
+        tensor_idx += bias_start_idx;
+      }
+      using remove_cv_t = typename std::remove_cv<T>::type;
+      params_vec->at(i)[j] =
+          raw_params_vec[tensor_idx]->template data<remove_cv_t>();
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class RnnXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("Input");
+    auto pre_state = ctx.MultiInput<Tensor>("PreState");
+    auto weight_list = ctx.MultiInput<framework::Tensor>("WeightList");
+    auto state = ctx.MultiOutput<Tensor>("State");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* reserve_data = ctx.Output<Tensor>("Reserve");
+    const int& num_layers = ctx.Attr<int>("num_layers");
+    const bool& is_bidirec = ctx.Attr<bool>("is_bidirec");
+    const int& hidden_size = ctx.Attr<int>("hidden_size");
+    const std::string& mode = ctx.Attr<std::string>("mode");
+
+    bool has_seq_length = ctx.HasInput("SequenceLength");
+    const Tensor* sequence_length = nullptr;
+    if (has_seq_length) {
+      sequence_length = ctx.Input<Tensor>("SequenceLength");
+    }
+
+    PADDLE_ENFORCE_EQ(
+        mode, "LSTM",
+        platform::errors::InvalidArgument(
+            "XPU only supports LSTM mode now, current mode is %s", mode));
+
+    PADDLE_ENFORCE_EQ(is_bidirec, false,
+                      platform::errors::InvalidArgument(
+                          "XPU only supports unidirectional LSTM now"));
+
+    PADDLE_ENFORCE_EQ(
+        num_layers, 1,
+        platform::errors::InvalidArgument(
+            "XPU only supports 1-layer LSTM now, current layer num is %s",
+            num_layers));
+
+    auto init_h = pre_state[0];
+    auto init_c = pre_state[1];
+    auto last_h = state[0];
+    auto last_c = state[1];
+
+    // check shape
+    int seq_len = input->dims()[0];
+    int batch_size = input->dims()[1];
+    int input_dim = input->dims()[2];
+
+    PADDLE_ENFORCE_EQ(
+        init_h->dims()[0], num_layers,
+        platform::errors::InvalidArgument("The num_layers of the RNN layer"
+                                          " must be the same as the first dim"
+                                          " of init hidden, but received"
+                                          " num_layers:%d, dim:%d",
+                                          num_layers, init_h->dims()[0]));
+
+    PADDLE_ENFORCE_EQ(
+        init_c->dims()[0], num_layers,
+        platform::errors::InvalidArgument(
+            "The num_layers of the RNN layer must"
+            " be the same as the first dim of the init cell state, but"
+            " received num_layers:%d, dim:%d",
+            num_layers, init_c->dims()[0]));
+
+    std::vector<std::vector<const T*>> parameter_lists;
+    parameter_lists.resize(num_layers);
+    reset_parameter_vector(weight_list, num_layers, is_bidirec,
+                           &parameter_lists);
+
+    // init the output and allocate the memory
+    output->mutable_data<T>(ctx.GetPlace());
+    last_h->mutable_data<T>(ctx.GetPlace());
+    last_c->mutable_data<T>(ctx.GetPlace());
+    reserve_data->Resize({seq_len * batch_size * hidden_size * 5});
+    reserve_data->mutable_data<T>(ctx.GetPlace());
+
+    // get ptr from tensor
+    auto x = input->data<T>();
+    auto h_0 = init_h->data<T>();
+    auto c_0 = init_c->data<T>();
+    auto w_x = parameter_lists[0][0];
+    auto w_h = parameter_lists[0][1];
+    auto b_x = parameter_lists[0][2];
+    auto b_h = parameter_lists[0][3];
+    auto y = output->data<T>();
+    auto last_h_ptr = last_h->data<T>();
+    auto last_c_ptr = last_c->data<T>();
+    auto i_f_g_o = reserve_data->data<T>();
+    auto c = i_f_g_o + seq_len * batch_size * hidden_size * 4;
+
+    std::vector<int> seq_len_tensor(batch_size, seq_len);
+    if (has_seq_length) {
+      seq_len_tensor = operators::GetDataFromTensor<int>(sequence_length);
+    }
+
+    // run kernel
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    int r = xpu::lstm_train<T, T, int16_t>(
+        dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0,
+        (const T*)w_x, (const T*)w_h, (const T*)b_x, (const T*)b_h,
+        reinterpret_cast<T*>(y), reinterpret_cast<T*>(last_h_ptr),
+        reinterpret_cast<T*>(last_c_ptr), batch_size, input_dim, hidden_size,
+        seq_len, seq_len_tensor, nullptr, nullptr, nullptr, nullptr,
+        reinterpret_cast<T*>(i_f_g_o), reinterpret_cast<T*>(c));
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::External("RnnXPU(lstm) returned wrong "
+                                                 "value[%d %s]",
+                                                 r, XPUAPIErrorMsg[r]));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class RnnXPUGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // get the tensor pointers for the input
+    auto* input = ctx.Input<Tensor>("Input");
+    auto pre_state = ctx.MultiInput<Tensor>("PreState");
+    auto weight_list = ctx.MultiInput<framework::Tensor>("WeightList");
+    auto* output = ctx.Input<Tensor>("Out");
+    auto* reserve_data = ctx.Input<Tensor>("Reserve");
+    const int& num_layers = ctx.Attr<int>("num_layers");
+    const bool& is_bidirec = ctx.Attr<bool>("is_bidirec");
+    const int& hidden_size = ctx.Attr<int>("hidden_size");
+    const std::string& mode = ctx.Attr<std::string>("mode");
+
+    bool has_seq_length = ctx.HasInput("SequenceLength");
+    const Tensor* sequence_length = nullptr;
+    if (has_seq_length) {
+      sequence_length = ctx.Input<Tensor>("SequenceLength");
+    }
+
+    PADDLE_ENFORCE_EQ(
+        mode, "LSTM",
+        platform::errors::InvalidArgument(
+            "XPU only supports LSTM mode now, current mode is %s", mode));
+
+    PADDLE_ENFORCE_EQ(is_bidirec, false,
+                      platform::errors::InvalidArgument(
+                          "XPU only supports unidirectional LSTM now"));
+
+    PADDLE_ENFORCE_EQ(
+        num_layers, 1,
+        platform::errors::InvalidArgument(
+            "XPU only supports 1-layer LSTM now, current layer num is %s",
+            num_layers));
+
+    auto init_h = pre_state[0];
+    auto init_c = pre_state[1];
+
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto state_grad = ctx.MultiInput<Tensor>(framework::GradVarName("State"));
+    auto last_h_grad = state_grad[0];
+    auto last_c_grad = state_grad[1];
+
+    // get the tensor pointers for the output
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    auto weight_grad_list = ctx.MultiOutput<framework::Tensor>(
+        framework::GradVarName("WeightList"));
+    auto pre_state_grad =
+        ctx.MultiOutput<Tensor>(framework::GradVarName("PreState"));
+    Tensor* init_h_grad = nullptr;
+    Tensor* init_c_grad = nullptr;
+    if (pre_state_grad.size() > 0) {  // has gradient
+      init_h_grad = pre_state_grad[0];
+      init_c_grad = pre_state_grad[1];
+    }
+
+    // check shape
+    int seq_len = input->dims()[0];
+    int batch_size = input->dims()[1];
+    int input_dim = input->dims()[2];
+
+    PADDLE_ENFORCE_EQ(
+        init_h->dims()[0], num_layers,
+        platform::errors::InvalidArgument("The num_layers of the RNN layer"
+                                          " must be the same as the first dim"
+                                          " of init hidden, but received"
+                                          " num_layers:%d, dim:%d",
+                                          num_layers, init_h->dims()[0]));
+
+    PADDLE_ENFORCE_EQ(
+        init_c->dims()[0], num_layers,
+        platform::errors::InvalidArgument(
+            "The num_layers of the RNN layer must"
+            " be the same as the first dim of the init cell state, but"
+            " received num_layers:%d, dim:%d",
+            num_layers, init_c->dims()[0]));
+
+    std::vector<std::vector<const T*>> parameter_lists;
+    parameter_lists.resize(num_layers);
+    reset_parameter_vector(weight_list, num_layers, is_bidirec,
+                           &parameter_lists);
+
+    for (unsigned int i = 0; i < weight_grad_list.size(); ++i) {
+      weight_grad_list[i]->mutable_data<T>(ctx.GetPlace());
+    }
+    std::vector<std::vector<T*>> parameter_lists_grad;
+    parameter_lists_grad.resize(num_layers);
+    reset_parameter_vector(weight_grad_list, num_layers, is_bidirec,
+                           &parameter_lists_grad);
+
+    // allocate the memory and initialize the input_grad
+    input_grad->mutable_data<T>(input->dims(), ctx.GetPlace());
+    if (init_h_grad) {
+      init_h_grad->mutable_data<T>(init_h->dims(), ctx.GetPlace());
+    }
+    if (init_c_grad) {
+      init_c_grad->mutable_data<T>(init_c->dims(), ctx.GetPlace());
+    }
+
+    // get ptr from tensor
+    auto x = input->data<T>();
+    auto h_0 = init_h->data<T>();
+    auto c_0 = init_c->data<T>();
+    auto w_x = parameter_lists[0][0];
+    auto w_h = parameter_lists[0][1];
+    auto y = output->data<T>();
+    auto y_grad = output_grad->data<T>();
+    auto last_h_grad_ptr = last_h_grad->data<T>();
+    auto last_c_grad_ptr = last_c_grad->data<T>();
+    auto x_grad = input_grad->data<T>();
+    auto h_0_grad = init_h_grad ? init_h_grad->data<T>() : nullptr;
+    auto c_0_grad = init_c_grad ? init_c_grad->data<T>() : nullptr;
+    auto w_x_grad = parameter_lists_grad[0][0];
+    auto w_h_grad = parameter_lists_grad[0][1];
+    auto b_x_grad = parameter_lists_grad[0][2];
+    auto b_h_grad = parameter_lists_grad[0][3];
+    auto i_f_g_o = reserve_data->data<T>();
+    auto c = i_f_g_o + seq_len * batch_size * hidden_size * 4;
+
+    std::vector<int> seq_len_tensor(batch_size, seq_len);
+    if (has_seq_length) {
+      seq_len_tensor = operators::GetDataFromTensor<int>(sequence_length);
+    }
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    int r = xpu::lstm_grad<T, T, int16_t>(
+        dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0,
+        (const T*)w_x, (const T*)w_h, (const T*)y, (const T*)y_grad,
+        (const T*)last_h_grad_ptr, (const T*)last_c_grad_ptr,
+        reinterpret_cast<T*>(x_grad), reinterpret_cast<T*>(h_0_grad),
+        reinterpret_cast<T*>(c_0_grad), w_x_grad, w_h_grad, b_x_grad, b_h_grad,
+        batch_size, input_dim, hidden_size, seq_len, seq_len_tensor, nullptr,
+        nullptr, nullptr, nullptr, i_f_g_o, c);
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External("RnnXPUGrad(lstm) returned wrong "
+                                   "value[%d %s]",
+                                   r, XPUAPIErrorMsg[r]));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    rnn, ops::RnnXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    rnn_grad, ops::RnnXPUGradKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif  // PADDLE_WITH_XPU
diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py
index 37b446174d..133367a5f3 100644
--- a/python/paddle/fluid/tests/unittests/op_test_xpu.py
+++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py
@@ -296,7 +296,7 @@ class XPUOpTest(OpTest):
                 no_grad_set=no_grad_set)
             self._assert_is_close(a1, a2, inputs_to_check, 0.00000001,
                                   "Gradient Check On two xpu")
-            self._assert_is_close(a1, a3, inputs_to_check, 0.001,
+            self._assert_is_close(a1, a3, inputs_to_check, max_relative_error,
                                   "Gradient Check On cpu & xpu")
 
     def get_grad_with_place(self,
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py
new file mode 100755
index 0000000000..a27d806319
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import math
+import paddle.fluid.core as core
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import random
+import sys
+
+sys.path.append("..")
+from op_test_xpu import XPUOpTest
+sys.path.append("../rnn")
+from rnn_numpy import SimpleRNN, LSTM, GRU
+from convert import get_params_for_net
+
+random.seed(2)
+np.set_printoptions(threshold=np.inf)
+paddle.enable_static()
+
+
+class TestRNNOp(XPUOpTest):
+    def init_size(self):
+        self.seq_length = 1
+        self.batch_size = 1
+        self.input_size = 5
+        self.hidden_size = 16
+
+    def get_weight_names(self):
+        weight_names = []
+        for i in range(self.num_layers):
+            for j in range(0, 2 * self.direction_num):
+                weight_names.append("{}.weight_{}".format(i, j))
+        for i in range(self.num_layers):
+            for j in range(0, 2 * self.direction_num):
+                weight_names.append("{}.bias_{}".format(i, j))
+        return weight_names
+
+    def setUp(self):
+        self.init_size()
+        self.op_type = "rnn"
+        self.dtype = np.float32
+        self.sequence_length = np.ones(
+            (self.batch_size, ), dtype=np.int32) * self.seq_length
+        self.num_layers = 1
+        self.is_bidirec = False
+        self.mode = "LSTM"
+        self.is_test = False
+        self.dropout = 0.0
+        self.set_attrs()
+
+        self.direction_num = 2 if self.is_bidirec else 1
+        direction = "bidirectional" if self.is_bidirec else "forward"
+
+        input = np.random.uniform(
+            low=-0.1,
+            high=0.1,
+            size=(self.seq_length, self.batch_size,
+                  self.input_size)).astype(self.dtype)
+
+        rnn1 = LSTM(
+            self.input_size,
+            self.hidden_size,
+            num_layers=self.num_layers,
+            time_major=True,
+            direction=direction,
+            dropout=self.dropout,
+            dtype="float32")
+
+        flat_w = get_params_for_net(rnn1)
+        output, (last_hidden, last_cell) = rnn1(
+            input, sequence_length=self.sequence_length)
+
+        init_h = np.zeros(
+            (self.num_layers * self.direction_num, self.batch_size,
+             self.hidden_size)).astype(self.dtype)
+        init_c = np.zeros(
+            (self.num_layers * self.direction_num, self.batch_size,
+             self.hidden_size)).astype(self.dtype)
+        state_out = np.ndarray((300)).astype("uint8")
+
+        self.inputs = {
+            'Input': input,
+            'WeightList': flat_w,
+            'PreState': [('init_h', init_h), ('init_c', init_c)],
+            'SequenceLength': self.sequence_length
+        }
+        if self.sequence_length is None:
+            self.inputs = {
+                'Input': input,
+                'WeightList': flat_w,
+                'PreState': [('init_h', init_h), ('init_c', init_c)],
+            }
+        self.attrs = {
+            'dropout_prob': self.dropout,
+            'is_bidirec': self.is_bidirec,
+            'input_size': self.input_size,
+            'hidden_size': self.hidden_size,
+            'num_layers': self.num_layers,
+            'mode': self.mode,
+            'is_test': self.is_test
+        }
+        self.outputs = {
+            'Out': output,
+            "State": [('last_hidden', last_hidden), ('last_cell', last_cell)],
+            'Reserve': np.ndarray((400)).astype("uint8"),
+            'DropoutState': state_out
+        }
+
+    def test_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(
+                place, atol=0.01, no_check_set=['Reserve', 'DropoutState'])
+
+    def set_attrs(self):
+        pass
+
+    def test_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            if not self.is_test:
+                var_name_list = self.get_weight_names()
+                grad_check_list = ['Input', 'init_h', 'init_c']
+                grad_check_list.extend(var_name_list)
+                self.check_grad_with_place(
+                    place,
+                    set(grad_check_list), ['Out', 'last_hidden', 'last_cell'],
+                    max_relative_error=0.1)
+
+
+class TestRNNOpCase0(TestRNNOp):
+    def init_size(self):
+        self.seq_length = 2
+        self.batch_size = 4
+        self.input_size = 10
+        self.hidden_size = 32
+
+
+class TestRNNOpCase1(TestRNNOp):
+    def init_size(self):
+        self.seq_length = 5
+        self.batch_size = 16
+        self.input_size = 30
+        self.hidden_size = 64
+
+
+class TestRNNOpCase2(TestRNNOp):
+    def init_size(self):
+        self.seq_length = 10
+        self.batch_size = 64
+        self.input_size = 50
+        self.hidden_size = 64
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab
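
Note: the WeightList reordering that reset_parameter_vector performs can be reproduced in a few lines of plain Python. The sketch below is for verification only and is not part of the patch; it mirrors the kernel's index arithmetic exactly. The FWhi/BBhh labels follow the comment in rnn_op_xpu.cc (F/B = forward/backward direction, W/B = weight/bias, hi/hh = input-to-hidden / hidden-to-hidden). The bidirectional case is shown because in the unidirectional case the order is unchanged.

def reorder_weight_list(raw, num_layers, is_bidirec):
    # Mirrors reset_parameter_vector in rnn_op_xpu.cc: weights come first,
    # biases start at the halfway point of the flattened list.
    direction_num = 2 if is_bidirec else 1
    layer_weight_size = 4 * direction_num
    bias_start_idx = num_layers * layer_weight_size // 2
    reordered = []
    for i in range(num_layers):
        layer = []
        for j in range(layer_weight_size):
            k = j % 4
            section = j // 4
            tensor_idx = i * 2 * direction_num + section * 2 + k % 2
            if k >= 2:  # entries 2 and 3 of each group are biases
                tensor_idx += bias_start_idx
            layer.append(raw[tensor_idx])
        reordered.append(layer)
    return reordered

raw = ["FWhi", "FWhh", "BWhi", "BWhh", "FBhi", "FBhh", "BBhi", "BBhh"]
print(reorder_weight_list(raw, num_layers=1, is_bidirec=True))
# -> [['FWhi', 'FWhh', 'FBhi', 'FBhh', 'BWhi', 'BWhh', 'BBhi', 'BBhh']]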
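
For context, a hedged usage sketch of what this patch enables: a single-layer, unidirectional LSTM running on a Kunlun device, which should be routed to the new rnn XPU kernel. It assumes Paddle was built with PADDLE_WITH_XPU and an XPU device is visible; the shapes are illustrative only.

import paddle

paddle.set_device("xpu")  # assumes an XPU build and an attached Kunlun device

# The only configuration the new kernel accepts:
# mode == "LSTM", num_layers == 1, is_bidirec == false.
rnn = paddle.nn.LSTM(input_size=10, hidden_size=32, num_layers=1,
                     direction="forward")

x = paddle.randn([4, 2, 10])   # [batch_size, seq_len, input_size]
h0 = paddle.zeros([1, 4, 32])  # [num_layers, batch_size, hidden_size]
c0 = paddle.zeros([1, 4, 32])

y, (h, c) = rnn(x, (h0, c0))
print(y.shape)                 # [4, 2, 32]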