From b6f8ccd278ea3ec170e7f7fd14f740bc2b9d5153 Mon Sep 17 00:00:00 2001
From: shanliang1992
Date: Fri, 23 Apr 2021 13:58:18 +0800
Subject: [PATCH] add lstm support on xpu test=kunlun (#32436)

---
 paddle/fluid/operators/rnn_op_xpu.cc          | 314 ++++++++++++++++++
 .../fluid/tests/unittests/op_test_xpu.py      |   2 +-
 .../tests/unittests/xpu/test_rnn_op_xpu.py    | 173 ++++++++++
 3 files changed, 488 insertions(+), 1 deletion(-)
 create mode 100644 paddle/fluid/operators/rnn_op_xpu.cc
 create mode 100755 python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py

diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc
new file mode 100644
index 0000000000..fb82d18e62
--- /dev/null
+++ b/paddle/fluid/operators/rnn_op_xpu.cc
@@ -0,0 +1,314 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef PADDLE_WITH_XPU
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/utils.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/xpu_header.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+using TensorList = std::vector<framework::Tensor>;
+
+template <typename TensorType, typename T>
+void reset_parameter_vector(const std::vector<TensorType>& raw_params_vec,
+                            const int& num_layers, const bool& is_bidirec,
+                            std::vector<std::vector<T*>>* params_vec) {
+  // the raw parameter sequence is [FWhi, FWhh, BWhi, BWhh] * num_layers
+  // + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameters to
+  // ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers
+  const int& direction_num = is_bidirec ? 2 : 1;
+  const int& layer_weight_size = 4 * direction_num;
+  const int& all_weight_size = num_layers * layer_weight_size;
+  const int& bias_start_idx = all_weight_size / 2;
+  for (int i = 0; i < num_layers; i++) {
+    params_vec->at(i).resize(layer_weight_size);
+    for (int j = 0; j < layer_weight_size; j++) {
+      int k = j % 4;
+      const int& section = j / 4;
+      int tensor_idx = i * 2 * direction_num + section * 2 + k % 2;
+      if (k >= 2) {
+        tensor_idx += bias_start_idx;
+      }
+      using remove_cv_t = typename std::remove_cv<T>::type;
+      params_vec->at(i)[j] =
+          raw_params_vec[tensor_idx]->template data<remove_cv_t>();
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class RnnXPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<Tensor>("Input");
+    auto pre_state = ctx.MultiInput<Tensor>("PreState");
+    auto weight_list = ctx.MultiInput<framework::Tensor>("WeightList");
+    auto state = ctx.MultiOutput<Tensor>("State");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* reserve_data = ctx.Output<Tensor>("Reserve");
+    const int& num_layers = ctx.Attr<int>("num_layers");
+    const bool& is_bidirec = ctx.Attr<bool>("is_bidirec");
+    const int& hidden_size = ctx.Attr<int>("hidden_size");
+    const std::string& mode = ctx.Attr<std::string>("mode");
+
+    bool has_seq_length = ctx.HasInput("SequenceLength");
+    const Tensor* sequence_length = nullptr;
+    if (has_seq_length) {
+      sequence_length = ctx.Input<Tensor>("SequenceLength");
+    }
+
+    PADDLE_ENFORCE_EQ(
+        mode, "LSTM",
+        platform::errors::InvalidArgument(
+            "XPU only supports LSTM mode now, current mode is %s", mode));
+
+    PADDLE_ENFORCE_EQ(is_bidirec, false,
+                      platform::errors::InvalidArgument(
+                          "XPU only supports unidirectional LSTM now"));
+
+    PADDLE_ENFORCE_EQ(
+        num_layers, 1,
+        platform::errors::InvalidArgument(
+            "XPU only supports 1-layer LSTM now, current layer num is %s",
+            num_layers));
+
+    auto init_h = pre_state[0];
+    auto init_c = pre_state[1];
+    auto last_h = state[0];
+    auto last_c = state[1];
+
+    // check shape
+    int seq_len = input->dims()[0];
+    int batch_size = input->dims()[1];
+    int input_dim = input->dims()[2];
+
+    PADDLE_ENFORCE_EQ(
+        init_h->dims()[0], num_layers,
+        platform::errors::InvalidArgument("The num_layers of the RNN layer"
+                                          " must be the same as the first dim"
+                                          " of init hidden, but received"
+                                          " num_layers:%d, dim:%d",
+                                          num_layers, init_h->dims()[0]));
+
+    PADDLE_ENFORCE_EQ(
+        init_c->dims()[0], num_layers,
+        platform::errors::InvalidArgument(
+            "The num_layers of the RNN layer must"
+            " be the same as the first dim of the init cell state, but"
+            " received num_layers:%d, dim:%d",
+            num_layers, init_c->dims()[0]));
+
+    std::vector<std::vector<const T*>> parameter_lists;
+    parameter_lists.resize(num_layers);
+    reset_parameter_vector(weight_list, num_layers, is_bidirec,
+                           &parameter_lists);
+
+    // init the output and allocate the memory
+    output->mutable_data<T>(ctx.GetPlace());
+    last_h->mutable_data<T>(ctx.GetPlace());
+    last_c->mutable_data<T>(ctx.GetPlace());
+    reserve_data->Resize({seq_len * batch_size * hidden_size * 5});
+    reserve_data->mutable_data<T>(ctx.GetPlace());
+
+    // get ptr from tensor
+    auto x = input->data<T>();
+    auto h_0 = init_h->data<T>();
+    auto c_0 = init_c->data<T>();
+    auto w_x = parameter_lists[0][0];
+    auto w_h = parameter_lists[0][1];
+    auto b_x = parameter_lists[0][2];
+    auto b_h = parameter_lists[0][3];
+    auto y = output->data<T>();
+    auto last_h_ptr = last_h->data<T>();
+    auto last_c_ptr = last_c->data<T>();
+    auto i_f_g_o = reserve_data->data<T>();
+    auto c = i_f_g_o + seq_len * batch_size * hidden_size * 4;
+
+    std::vector<int> seq_len_tensor(batch_size, seq_len);
+    if (has_seq_length) {
+      seq_len_tensor = operators::GetDataFromTensor<int>(sequence_length);
+    }
+
+    // run kernel
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    int r = xpu::lstm_train<T, T, int16_t>(
+        dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0,
+        (const T*)w_x, (const T*)w_h, (const T*)b_x, (const T*)b_h,
+        reinterpret_cast<T*>(y), reinterpret_cast<T*>(last_h_ptr),
+        reinterpret_cast<T*>(last_c_ptr), batch_size, input_dim, hidden_size,
+        seq_len, seq_len_tensor, nullptr, nullptr, nullptr, nullptr,
+        reinterpret_cast<T*>(i_f_g_o), reinterpret_cast<T*>(c));
+    PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS,
+                      platform::errors::External("RnnXPU(lstm) returned wrong "
+                                                 "value[%d %s]",
+                                                 r, XPUAPIErrorMsg[r]));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class RnnXPUGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // get the tensor pointers for the input
+    auto* input = ctx.Input<Tensor>("Input");
+    auto pre_state = ctx.MultiInput<Tensor>("PreState");
+    auto weight_list = ctx.MultiInput<framework::Tensor>("WeightList");
+    auto* output = ctx.Input<Tensor>("Out");
+    auto* reserve_data = ctx.Input<Tensor>("Reserve");
+    const int& num_layers = ctx.Attr<int>("num_layers");
+    const bool& is_bidirec = ctx.Attr<bool>("is_bidirec");
+    const int& hidden_size = ctx.Attr<int>("hidden_size");
+    const std::string& mode = ctx.Attr<std::string>("mode");
+
+    bool has_seq_length = ctx.HasInput("SequenceLength");
+    const Tensor* sequence_length = nullptr;
+    if (has_seq_length) {
+      sequence_length = ctx.Input<Tensor>("SequenceLength");
+    }
+
+    PADDLE_ENFORCE_EQ(
+        mode, "LSTM",
+        platform::errors::InvalidArgument(
+            "XPU only supports LSTM mode now, current mode is %s", mode));
+
+    PADDLE_ENFORCE_EQ(is_bidirec, false,
+                      platform::errors::InvalidArgument(
+                          "XPU only supports unidirectional LSTM now"));
+
+    PADDLE_ENFORCE_EQ(
+        num_layers, 1,
+        platform::errors::InvalidArgument(
+            "XPU only supports 1-layer LSTM now, current layer num is %s",
+            num_layers));
+
+    auto init_h = pre_state[0];
+    auto init_c = pre_state[1];
+
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto state_grad = ctx.MultiInput<Tensor>(framework::GradVarName("State"));
+    auto last_h_grad = state_grad[0];
+    auto last_c_grad = state_grad[1];
+
+    // get the tensor pointers for the output
+    auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    auto weight_grad_list = ctx.MultiOutput<framework::Tensor>(
+        framework::GradVarName("WeightList"));
+    auto pre_state_grad =
+        ctx.MultiOutput<Tensor>(framework::GradVarName("PreState"));
+    Tensor* init_h_grad = nullptr;
+    Tensor* init_c_grad = nullptr;
+    if (pre_state_grad.size() > 0) {  // has gradient
+      init_h_grad = pre_state_grad[0];
+      init_c_grad = pre_state_grad[1];
+    }
+
+    // check shape
+    int seq_len = input->dims()[0];
+    int batch_size = input->dims()[1];
+    int input_dim = input->dims()[2];
+
+    PADDLE_ENFORCE_EQ(
+        init_h->dims()[0], num_layers,
+        platform::errors::InvalidArgument("The num_layers of the RNN layer"
+                                          " must be the same as the first dim"
+                                          " of init hidden, but received"
+                                          " num_layers:%d, dim:%d",
+                                          num_layers, init_h->dims()[0]));
+
+    PADDLE_ENFORCE_EQ(
+        init_c->dims()[0], num_layers,
+        platform::errors::InvalidArgument(
+            "The num_layers of the RNN layer must"
+            " be the same as the first dim of the init cell state, but"
+            " received num_layers:%d, dim:%d",
+            num_layers, init_c->dims()[0]));
+
+    std::vector<std::vector<const T*>> parameter_lists;
+    parameter_lists.resize(num_layers);
+    reset_parameter_vector(weight_list, num_layers, is_bidirec,
+                           &parameter_lists);
+
+    for (unsigned int i = 0; i < weight_grad_list.size(); ++i) {
+      weight_grad_list[i]->mutable_data<T>(ctx.GetPlace());
+    }
+    std::vector<std::vector<T*>> parameter_lists_grad;
+    parameter_lists_grad.resize(num_layers);
+    reset_parameter_vector(weight_grad_list, num_layers, is_bidirec,
+                           &parameter_lists_grad);
+
+    // allocate the memory and initialize the input_grad
+    input_grad->mutable_data<T>(input->dims(), ctx.GetPlace());
+    if (init_h_grad) {
+      init_h_grad->mutable_data<T>(init_h->dims(), ctx.GetPlace());
+    }
+    if (init_c_grad) {
+      init_c_grad->mutable_data<T>(init_c->dims(), ctx.GetPlace());
+    }
+
+    // get ptr from tensor
+    auto x = input->data<T>();
+    auto h_0 = init_h->data<T>();
+    auto c_0 = init_c->data<T>();
+    auto w_x = parameter_lists[0][0];
+    auto w_h = parameter_lists[0][1];
+    auto y = output->data<T>();
+    auto y_grad = output_grad->data<T>();
+    auto last_h_grad_ptr = last_h_grad->data<T>();
+    auto last_c_grad_ptr = last_c_grad->data<T>();
+    auto x_grad = input_grad->data<T>();
+    auto h_0_grad = init_h_grad ? init_h_grad->data<T>() : nullptr;
+    auto c_0_grad = init_c_grad ? init_c_grad->data<T>() : nullptr;
+    auto w_x_grad = parameter_lists_grad[0][0];
+    auto w_h_grad = parameter_lists_grad[0][1];
+    auto b_x_grad = parameter_lists_grad[0][2];
+    auto b_h_grad = parameter_lists_grad[0][3];
+    auto i_f_g_o = reserve_data->data<T>();
+    auto c = i_f_g_o + seq_len * batch_size * hidden_size * 4;
+
+    std::vector<int> seq_len_tensor(batch_size, seq_len);
+    if (has_seq_length) {
+      seq_len_tensor = operators::GetDataFromTensor<int>(sequence_length);
+    }
+
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    int r = xpu::lstm_grad<T, T, int16_t>(
+        dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0,
+        (const T*)w_x, (const T*)w_h, (const T*)y, (const T*)y_grad,
+        (const T*)last_h_grad_ptr, (const T*)last_c_grad_ptr,
+        reinterpret_cast<T*>(x_grad), reinterpret_cast<T*>(h_0_grad),
+        reinterpret_cast<T*>(c_0_grad), w_x_grad, w_h_grad, b_x_grad, b_h_grad,
+        batch_size, input_dim, hidden_size, seq_len, seq_len_tensor, nullptr,
+        nullptr, nullptr, nullptr, i_f_g_o, c);
+    PADDLE_ENFORCE_EQ(
+        r, xpu::Error_t::SUCCESS,
+        platform::errors::External("RnnXPUGrad(lstm) returned wrong "
+                                   "value[%d %s]",
+                                   r, XPUAPIErrorMsg[r]));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_XPU_KERNEL(
+    rnn, ops::RnnXPUKernel<paddle::platform::XPUDeviceContext, float>);
+REGISTER_OP_XPU_KERNEL(
+    rnn_grad, ops::RnnXPUGradKernel<paddle::platform::XPUDeviceContext, float>);
+
+#endif  // PADDLE_WITH_XPU
diff --git a/python/paddle/fluid/tests/unittests/op_test_xpu.py b/python/paddle/fluid/tests/unittests/op_test_xpu.py
index 37b446174d..133367a5f3 100644
--- a/python/paddle/fluid/tests/unittests/op_test_xpu.py
+++ b/python/paddle/fluid/tests/unittests/op_test_xpu.py
@@ -296,7 +296,7 @@ class XPUOpTest(OpTest):
                 no_grad_set=no_grad_set)
             self._assert_is_close(a1, a2, inputs_to_check, 0.00000001,
                                   "Gradient Check On two xpu")
-            self._assert_is_close(a1, a3, inputs_to_check, 0.001,
+            self._assert_is_close(a1, a3, inputs_to_check, max_relative_error,
                                   "Gradient Check On cpu & xpu")
 
     def get_grad_with_place(self,
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py
new file mode 100755
index 0000000000..a27d806319
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/xpu/test_rnn_op_xpu.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import math
+import paddle.fluid.core as core
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.layers as layers
+import random
+import sys
+
+sys.path.append("..")
+from op_test_xpu import XPUOpTest
+sys.path.append("../rnn")
+from rnn_numpy import SimpleRNN, LSTM, GRU
+from convert import get_params_for_net
+
+random.seed(2)
+np.set_printoptions(threshold=np.inf)
+paddle.enable_static()
+
+
+class TestRNNOp(XPUOpTest):
+    def init_size(self):
+        self.seq_length = 1
+        self.batch_size = 1
+        self.input_size = 5
+        self.hidden_size = 16
+
+    def get_weight_names(self):
+        weight_names = []
+        for i in range(self.num_layers):
+            for j in range(0, 2 * self.direction_num):
+                weight_names.append("{}.weight_{}".format(i, j))
+        for i in range(self.num_layers):
+            for j in range(0, 2 * self.direction_num):
+                weight_names.append("{}.bias_{}".format(i, j))
+        return weight_names
+
+    def setUp(self):
+        self.init_size()
+        self.op_type = "rnn"
+        self.dtype = np.float32
+        self.sequence_length = np.ones(
+            (self.batch_size, ), dtype=np.int32) * self.seq_length
+        self.num_layers = 1
+        self.is_bidirec = False
+        self.mode = "LSTM"
+        self.is_test = False
+        self.dropout = 0.0
+        self.set_attrs()
+
+        self.direction_num = 2 if self.is_bidirec else 1
+        direction = "bidirectional" if self.is_bidirec else "forward"
+
+        input = np.random.uniform(
+            low=-0.1,
+            high=0.1,
+            size=(self.seq_length, self.batch_size,
+                  self.input_size)).astype(self.dtype)
+
+        rnn1 = LSTM(
+            self.input_size,
+            self.hidden_size,
+            num_layers=self.num_layers,
+            time_major=True,
+            direction=direction,
+            dropout=self.dropout,
+            dtype="float32")
+
+        flat_w = get_params_for_net(rnn1)
+        output, (last_hidden, last_cell) = rnn1(
+            input, sequence_length=self.sequence_length)
+
+        init_h = np.zeros(
+            (self.num_layers * self.direction_num, self.batch_size,
+             self.hidden_size)).astype(self.dtype)
+        init_c = np.zeros(
+            (self.num_layers * self.direction_num, self.batch_size,
+             self.hidden_size)).astype(self.dtype)
+        state_out = np.ndarray((300)).astype("uint8")
+
+        self.inputs = {
+            'Input': input,
+            'WeightList': flat_w,
+            'PreState': [('init_h', init_h), ('init_c', init_c)],
+            'SequenceLength': self.sequence_length
+        }
+        if self.sequence_length is None:
+            self.inputs = {
+                'Input': input,
+                'WeightList': flat_w,
+                'PreState': [('init_h', init_h), ('init_c', init_c)],
+            }
+        self.attrs = {
+            'dropout_prob': self.dropout,
+            'is_bidirec': self.is_bidirec,
+            'input_size': self.input_size,
+            'hidden_size': self.hidden_size,
+            'num_layers': self.num_layers,
+            'mode': self.mode,
+            'is_test': self.is_test
+        }
+        self.outputs = {
+            'Out': output,
+            "State": [('last_hidden', last_hidden), ('last_cell', last_cell)],
+            'Reserve': np.ndarray((400)).astype("uint8"),
+            'DropoutState': state_out
+        }
+
+    def test_output(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            self.check_output_with_place(
+                place, atol=0.01, no_check_set=['Reserve', 'DropoutState'])
+
+    def set_attrs(self):
+        pass
+
+    def test_grad(self):
+        if paddle.is_compiled_with_xpu():
+            place = paddle.XPUPlace(0)
+            if not self.is_test:
+                var_name_list = self.get_weight_names()
+                grad_check_list = ['Input', 'init_h', 'init_c']
+                grad_check_list.extend(var_name_list)
+                self.check_grad_with_place(
+                    place,
+                    set(grad_check_list), ['Out', 'last_hidden', 'last_cell'],
+                    max_relative_error=0.1)
+
+
+class TestRNNOpCase0(TestRNNOp):
+    def init_size(self):
+        self.seq_length = 2
+        self.batch_size = 4
+        self.input_size = 10
+        self.hidden_size = 32
+
+
+class TestRNNOpCase1(TestRNNOp):
+    def init_size(self):
+        self.seq_length = 5
+        self.batch_size = 16
+        self.input_size = 30
+        self.hidden_size = 64
+
+
+class TestRNNOpCase2(TestRNNOp):
+    def init_size(self):
+        self.seq_length = 10
+        self.batch_size = 64
+        self.input_size = 50
+        self.hidden_size = 64
+
+
+if __name__ == '__main__':
+    unittest.main()
-- 
GitLab
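
Note: the WeightList reordering that reset_parameter_vector performs can be reproduced in a few lines of plain Python. The sketch below is for verification only and is not part of the patch; it mirrors the kernel's index arithmetic exactly. The FWhi/BBhh labels follow the comment in rnn_op_xpu.cc (F/B = forward/backward direction, W/B = weight/bias, hi/hh = input-to-hidden / hidden-to-hidden). The bidirectional case is shown because in the unidirectional case the order is unchanged.

def reorder_weight_list(raw, num_layers, is_bidirec):
    # Mirrors reset_parameter_vector in rnn_op_xpu.cc: weights come first,
    # biases start at the halfway point of the flattened list.
    direction_num = 2 if is_bidirec else 1
    layer_weight_size = 4 * direction_num
    bias_start_idx = num_layers * layer_weight_size // 2
    reordered = []
    for i in range(num_layers):
        layer = []
        for j in range(layer_weight_size):
            k = j % 4
            section = j // 4
            tensor_idx = i * 2 * direction_num + section * 2 + k % 2
            if k >= 2:  # entries 2 and 3 of each group are biases
                tensor_idx += bias_start_idx
            layer.append(raw[tensor_idx])
        reordered.append(layer)
    return reordered

raw = ["FWhi", "FWhh", "BWhi", "BWhh", "FBhi", "FBhh", "BBhi", "BBhh"]
print(reorder_weight_list(raw, num_layers=1, is_bidirec=True))
# -> [['FWhi', 'FWhh', 'FBhi', 'FBhh', 'BWhi', 'BWhh', 'BBhi', 'BBhh']]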
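
For context, a hedged usage sketch of what this patch enables: a single-layer, unidirectional LSTM running on a Kunlun device, which should be routed to the new rnn XPU kernel. It assumes Paddle was built with PADDLE_WITH_XPU and an XPU device is visible; the shapes are illustrative only.

import paddle

paddle.set_device("xpu")  # assumes an XPU build and an attached Kunlun device

# The only configuration the new kernel accepts:
# mode == "LSTM", num_layers == 1, is_bidirec == false.
rnn = paddle.nn.LSTM(input_size=10, hidden_size=32, num_layers=1,
                     direction="forward")

x = paddle.randn([4, 2, 10])   # [batch_size, seq_len, input_size]
h0 = paddle.zeros([1, 4, 32])  # [num_layers, batch_size, hidden_size]
c0 = paddle.zeros([1, 4, 32])

y, (h, c) = rnn(x, (h0, c0))
print(y.shape)                 # [4, 2, 32]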