From 49a8d86d1983e8e976c1114402edbe8cde642bb3 Mon Sep 17 00:00:00 2001
From: hjchen2
Date: Sat, 26 Jan 2019 19:18:11 +0800
Subject: [PATCH] Add gru, conv_bn_relu and dwconv_bn_relu unit tests, fix bugs

---
 .../central-arm-func/conv_bn_relu_arm_func.h |   1 +
 .../dwconv_bn_relu_arm_func.h                |   1 +
 src/operators/math/depthwise_conv3x3.cpp     |   7 +-
 test/CMakeLists.txt                          |   5 +
 test/operators/test_conv_bn_relu_op.cpp      | 172 ++++++++++++++++++
 test/operators/test_dwconv_bn_relu_op.cpp    | 145 +++++++++++++++
 test/operators/test_gru_op.cpp               |  87 ++++++++-
 7 files changed, 407 insertions(+), 11 deletions(-)
 create mode 100644 test/operators/test_conv_bn_relu_op.cpp
 create mode 100644 test/operators/test_dwconv_bn_relu_op.cpp

diff --git a/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
index 6e8aec99e5..7eeb7f7667 100644
--- a/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_bn_relu_arm_func.h
@@ -32,6 +32,7 @@ void ConvBNReluBasic(const FusionConvBNReluParam<CPU> &param) {
   Tensor new_scale = *param.NewScale();
   Tensor *output = param.Output();
+  output->mutable_data<float>();
 
   int groups = param.Groups();
   std::vector<int> strides = param.Strides();
diff --git a/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
index cef297daad..e0299d00ae 100644
--- a/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
+++ b/src/operators/kernel/central-arm-func/dwconv_bn_relu_arm_func.h
@@ -32,6 +32,7 @@ void DWConvBNReluBasic(const FusionDWConvBNReluParam<CPU> &param) {
   Tensor new_scale = *param.NewScale();
   Tensor *output = param.Output();
+  output->mutable_data<float>();
 
   int groups = param.Groups();
   std::vector<int> strides = param.Strides();
diff --git a/src/operators/math/depthwise_conv3x3.cpp b/src/operators/math/depthwise_conv3x3.cpp
index ab47126329..8220e20429 100644
--- a/src/operators/math/depthwise_conv3x3.cpp
+++ b/src/operators/math/depthwise_conv3x3.cpp
@@ -564,7 +564,7 @@ void DepthwiseConvAddBNRelu3x3s1p1(const framework::Tensor *input,
 #if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
-  float *output_data = output->data<float>();
+  float *output_data = output->mutable_data<float>();
   const float *newscale_data = new_scale->data<float>();
   const float *newbias_data = new_bias->data<float>();
@@ -1309,7 +1309,7 @@ void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
 #if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
-  float *output_data = output->data<float>();
+  float *output_data = output->mutable_data<float>();
   const float *bias_data;
   if (if_bias) {
     bias_data = bias->data<float>();
   }
@@ -1729,7 +1729,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
 
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
-  float *output_data = output->data<float>();
+  float *output_data = output->mutable_data<float>();
   const float *newscale_data = new_scale->data<float>();
   const float *newbias_data = new_bias->data<float>();
@@ -1978,6 +1978,7 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
   const int output_width = static_cast<int>(output->dims()[3]);
   const int inhxw = input_height * input_width;
   const int outhxw = output_height * output_width;
+  output->mutable_data<float>();
 
   float32x4_t zero = vdupq_n_f32(0.0);
   for (int b = 0; b < batch_size; b++) {
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index aa47f270a1..8b52faf184 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -437,4 +437,9 @@ if (NOT FOUND_MATCH)
     ADD_EXECUTABLE(test-logical-xor-op operators/test_logical_xor_op.cpp test_helper.h test_include.h)
     target_link_libraries(test-logical-xor-op paddle-mobile)
 
+    ADD_EXECUTABLE(test-conv-bn-relu-op operators/test_conv_bn_relu_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-conv-bn-relu-op paddle-mobile)
+
+    ADD_EXECUTABLE(test-dwconv-bn-relu-op operators/test_dwconv_bn_relu_op.cpp test_helper.h test_include.h)
+    target_link_libraries(test-dwconv-bn-relu-op paddle-mobile)
 endif ()
diff --git a/test/operators/test_conv_bn_relu_op.cpp b/test/operators/test_conv_bn_relu_op.cpp
new file mode 100644
index 0000000000..6a09d838e0
--- /dev/null
+++ b/test/operators/test_conv_bn_relu_op.cpp
@@ -0,0 +1,172 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_helper.h"
+#include "../test_include.h"
+#include "operators/fusion_conv_bn_relu_op.h"
+
+namespace paddle_mobile {
+
+// Benchmarks the fused conv+bn+relu operator: builds it with random inputs,
+// runs it repeatedly, and logs the average latency per run.
+template <int Kernel, int Pad, int Stride>
+int TestConvBnReluOp(int in_channels, int in_height, int in_width,
+                     int out_channels, int groups, std::string opname) {
+  int kernel_h = Kernel;
+  int kernel_w = Kernel;
+  int pad_h = Pad;
+  int pad_w = Pad;
+  int stride_h = Stride;
+  int stride_w = Stride;
+  int dilation_h = 1;
+  int dilation_w = 1;
+
+  int batch_size = 1;
+  int input_c = in_channels;
+  int input_h = in_height;
+  int input_w = in_width;
+  int output_c = out_channels;
+  framework::DDim input_shape =
+      framework::make_ddim({batch_size, input_c, input_h, input_w});
+  framework::DDim filter_shape =
+      framework::make_ddim({output_c, input_c / groups, kernel_h, kernel_w});
+  framework::DDim shape = framework::make_ddim({output_c});
+
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared<framework::Scope>();
+  inputs["Input"] = std::vector<std::string>({"input"});
+  inputs["Filter"] = std::vector<std::string>({"filter"});
+  outputs["Out"] = std::vector<std::string>({"output"});
+  inputs["Mean"] = std::vector<std::string>({"input_mean"});
+  inputs["Variance"] = std::vector<std::string>({"input_variance"});
+  inputs["Scale"] = std::vector<std::string>({"input_scale"});
+  inputs["Bias"] = std::vector<std::string>({"input_bias"});
+
+  auto input_var = scope.get()->Var("input");
+  auto input = input_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(input, input_shape, -20.0, 20.0);
+
+  auto filter_var = scope.get()->Var("filter");
+  auto filter = filter_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(filter, filter_shape, -20, 20);
+
+  auto input_mean_var = scope.get()->Var("input_mean");
+  auto input_mean = input_mean_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(input_mean, shape, -10.0, 10.0);
+  auto vari_var = scope.get()->Var("input_variance");
+  auto vari = vari_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(vari, shape, -10.0, 10.0);
+  auto scale_var = scope.get()->Var("input_scale");
+  auto scale = scale_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(scale, shape, -10.0, 10.0);
+  auto input_bias_var = scope.get()->Var("input_bias");
+  auto input_bias = input_bias_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(input_bias, shape, -10.0, 10.0);
+
+  auto output_var = scope.get()->Var("output");
+  framework::AttributeMap attrs;
+  attrs["strides"].Set<std::vector<int>>(std::vector<int>({stride_h, stride_w}));
+  attrs["paddings"].Set<std::vector<int>>(std::vector<int>({pad_h, pad_w}));
+  attrs["dilations"].Set<std::vector<int>>(
+      std::vector<int>({dilation_h, dilation_w}));
+  attrs["groups"].Set<int>(groups);
+  attrs["epsilon"].Set<float>(1e-6);
+  attrs["momentum"].Set<float>(0.f);
+
+  auto *op = new operators::FusionConvBNReluOp<CPU, float>(
+      "fusion_conv_bn_relu", inputs, outputs, attrs, scope);
+  op->InferShape();
+  op->Init();
+  // warm up
+  for (int i = 0; i < 10; ++i) {
+    op->Run();
+  }
+  auto time1 = time();
+  for (int i = 0; i < 10; ++i) {
+    op->Run();
+  }
+  auto time2 = time();
+  std::ofstream out_file("./out_conv.txt", std::ios::app);
+  out_file << opname << " cost :" << time_diff(time1, time2) / 10.0 << "ms"
+           << std::endl;
+  out_file.close();
+
+  delete op;
+  return 0;
+}
+
+}  // namespace paddle_mobile
+
+int main(int argc, char *argv[]) {
+  // kernel = 3, pad = 1, stride = 2
+  paddle_mobile::TestConvBnReluOp<3, 1, 2>(3, 48, 48, 16, 1, "conv_bn_relu");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(16, 24, 24, 8, 1,
+                                           "depthwise_seperable");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(8, 24, 24, 24, 1,
+                                           "MBConv_3x3_conv1");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(24, 24, 24, 8, 1, "MBConv_3x3_pw1");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(8, 24, 24, 24, 1,
+                                           "MBConv_3x3_conv2");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(24, 24, 24, 8, 1, "MBConv_3x3_pw2");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(8, 24, 24, 24, 1,
+                                           "MBConv_3x3_conv3");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(24, 12, 12, 16, 1, "MBConv_3x3_pw3");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(16, 12, 12, 48, 1,
+                                           "MBConv_5x5_stage1_conv1");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(48, 12, 12, 16, 1,
+                                           "MBConv_5x5_stage1_pw1");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(16, 12, 12, 48, 1,
+                                           "MBConv_5x5_stage1_conv2");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(48, 12, 12, 16, 1,
+                                           "MBConv_5x5_stage1_pw2");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(16, 12, 12, 48, 1,
+                                           "MBConv_5x5_stage1_conv3");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(48, 6, 6, 32, 1,
+                                           "MBConv_5x5_stage1_pw3");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(32, 6, 6, 192, 1,
+                                           "MBConv_5x5_stage2_conv1");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(192, 6, 6, 32, 1,
+                                           "MBConv_5x5_stage2_pw1");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(32, 6, 6, 192, 1,
+                                           "MBConv_5x5_stage2_conv2");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(192, 6, 6, 32, 1,
+                                           "MBConv_5x5_stage2_pw2");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(32, 6, 6, 192, 1,
+                                           "MBConv_5x5_stage2_conv3");
+  // kernel = 1, pad = 0, stride = 1
+  paddle_mobile::TestConvBnReluOp<1, 0, 1>(192, 6, 6, 64, 1,
+                                           "MBConv_5x5_stage2_pw3");
+
+  return 0;
+}
diff --git a/test/operators/test_dwconv_bn_relu_op.cpp b/test/operators/test_dwconv_bn_relu_op.cpp
new file mode 100644
index 0000000000..7fcf10d903
--- /dev/null
+++ b/test/operators/test_dwconv_bn_relu_op.cpp
@@ -0,0 +1,145 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_helper.h"
+#include "../test_include.h"
+#include "operators/fusion_dwconv_bn_relu_op.h"
+
+namespace paddle_mobile {
+
+// Benchmarks the fused depthwise conv+bn+relu operator and logs the average
+// latency per run.
+template <int Kernel, int Pad, int Stride>
+int TestDWConvAddBnReluOp(int in_channels, int in_height, int in_width,
+                          int out_channels, int groups, std::string opname) {
+  int kernel_h = Kernel;
+  int kernel_w = Kernel;
+  int pad_h = Pad;
+  int pad_w = Pad;
+  int stride_h = Stride;
+  int stride_w = Stride;
+  int dilation_h = 1;
+  int dilation_w = 1;
+
+  int batch_size = 1;
+  int input_c = in_channels;
+  int input_h = in_height;
+  int input_w = in_width;
+  int output_c = out_channels;
+  framework::DDim input_shape =
+      framework::make_ddim({batch_size, input_c, input_h, input_w});
+  framework::DDim filter_shape =
+      framework::make_ddim({output_c, input_c / groups, kernel_h, kernel_w});
+  framework::DDim shape = framework::make_ddim({output_c});
+
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared<framework::Scope>();
+  inputs["Input"] = std::vector<std::string>({"input"});
+  inputs["Filter"] = std::vector<std::string>({"filter"});
+  inputs["Mean"] = std::vector<std::string>({"mean"});
+  inputs["Variance"] = std::vector<std::string>({"variance"});
+  inputs["Scale"] = std::vector<std::string>({"scale"});
+  inputs["Bias"] = std::vector<std::string>({"bias"});
+  outputs["Out"] = std::vector<std::string>({"output"});
+
+  auto input_var = scope.get()->Var("input");
+  auto input = input_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(input, input_shape, -20.0, 20.0);
+
+  auto filter_var = scope.get()->Var("filter");
+  auto filter = filter_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(filter, filter_shape, -20, 20);
+
+  auto mean_var = scope.get()->Var("mean");
+  auto mean = mean_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(mean, shape, -10.0, 10.0);
+
+  auto vari_var = scope.get()->Var("variance");
+  auto vari = vari_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(vari, shape, -10.0, 10.0);
+
+  auto scale_var = scope.get()->Var("scale");
+  auto scale = scale_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(scale, shape, -10.0, 10.0);
+
+  auto bias_var = scope.get()->Var("bias");
+  auto bias = bias_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<float>(bias, shape, -10.0, 10.0);
+
+  auto output_var = scope.get()->Var("output");
+  framework::AttributeMap attrs;
+  attrs["strides"].Set<std::vector<int>>(std::vector<int>({stride_h, stride_w}));
+  attrs["paddings"].Set<std::vector<int>>(std::vector<int>({pad_h, pad_w}));
+  attrs["dilations"].Set<std::vector<int>>(
+      std::vector<int>({dilation_h, dilation_w}));
+  attrs["groups"].Set<int>(groups);
+  attrs["epsilon"].Set<float>(1e-6);
+  attrs["momentum"].Set<float>(0.f);
+
+  auto *op = new operators::FusionDWConvBNReluOp<CPU, float>(
+      "fusion_dwconv_bn_relu", inputs, outputs, attrs, scope);
+  op->InferShape();
+  op->Init();
+  // warm up
+  for (int i = 0; i < 10; ++i) {
+    op->Run();
+  }
+  auto time1 = time();
+  for (int i = 0; i < 10; ++i) {
+    op->Run();
+  }
+  auto time2 = time();
+  std::ofstream out_file("./out_dwconv.txt", std::ios::app);
+  out_file << opname << " cost :" << time_diff(time1, time2) / 10.0 << "ms"
+           << std::endl;
+  out_file.close();
+
+  delete op;
+  return 0;
+}
+
+}  // namespace paddle_mobile
+
+int main(int argc, char *argv[]) {
+  // kernel = 3, pad = 1, stride = 1
+  paddle_mobile::TestDWConvAddBnReluOp<3, 1, 1>(16, 24, 24, 16, 16,
+                                                "depthwise_seperable");
+  // kernel = 3, pad = 1, stride = 1
+  paddle_mobile::TestDWConvAddBnReluOp<3, 1, 1>(24, 24, 24, 24, 24,
+                                                "MBConv_3x3_dw1");
+  // kernel = 3, pad = 1, stride = 1
+  paddle_mobile::TestDWConvAddBnReluOp<3, 1, 1>(24, 24, 24, 24, 24,
+                                                "MBConv_3x3_dw2");
+  // kernel = 3, pad = 1, stride = 2
+  paddle_mobile::TestDWConvAddBnReluOp<3, 1, 2>(24, 24, 24, 24, 24,
+                                                "MBConv_3x3_dw3");
+  // kernel = 5, pad = 2, stride = 1
+  paddle_mobile::TestDWConvAddBnReluOp<5, 2, 1>(48, 12, 12, 48, 48,
+                                                "MBConv_5x5_stage1_dw1");
+  // kernel = 5, pad = 2, stride = 1
+  paddle_mobile::TestDWConvAddBnReluOp<5, 2, 1>(48, 12, 12, 48, 48,
+                                                "MBConv_5x5_stage1_dw2");
+  // kernel = 5, pad = 2, stride = 2
+  paddle_mobile::TestDWConvAddBnReluOp<5, 2, 2>(48, 12, 12, 48, 48,
+                                                "MBConv_5x5_stage1_dw3");
+  // kernel = 5, pad = 2, stride = 1
+  paddle_mobile::TestDWConvAddBnReluOp<5, 2, 1>(192, 6, 6, 192, 192,
+                                                "MBConv_5x5_stage2_dw1");
+  // kernel = 5, pad = 2, stride = 1
+  paddle_mobile::TestDWConvAddBnReluOp<5, 2, 1>(192, 6, 6, 192, 192,
+                                                "MBConv_5x5_stage2_dw2");
+  // kernel = 5, pad = 2, stride = 1
+  paddle_mobile::TestDWConvAddBnReluOp<5, 2, 1>(192, 6, 6, 192, 192,
+                                                "MBConv_5x5_stage2_dw3");
+
+  return 0;
+}
diff --git a/test/operators/test_gru_op.cpp b/test/operators/test_gru_op.cpp
index f2ce833661..b11ec4f5f7 100644
--- a/test/operators/test_gru_op.cpp
+++ b/test/operators/test_gru_op.cpp
@@ -12,18 +12,89 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "../test_helper.h"
 #include "../test_include.h"
 #include "operators/gru_op.h"
 
-int main() {
-  paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
-  auto program = loader.Load(g_nlp);
-  PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr,
-                        "program file read fail");
+namespace paddle_mobile {
 
-  Executor4Test<paddle_mobile::CPU, paddle_mobile::operators::GruOp<paddle_mobile::CPU, float>>
-      executor(program, "gru");
+// Benchmarks the GRU operator on random LoD input and logs the average
+// latency per run.
+template <typename T>
+int TestGruOp(int in_channels, int out_channels, std::string opname) {
+  int input_c = in_channels;
+  int output_c = out_channels;
+  paddle_mobile::framework::LoD lod{{0, input_c}};
+  int batch_size = lod.size();
+  framework::DDim input_shape = framework::make_ddim({input_c, output_c * 3});
+  framework::DDim weight_shape =
+      framework::make_ddim({output_c, output_c * 3});
+  framework::DDim h0_shape = framework::make_ddim({batch_size, output_c});
+  framework::DDim bias_shape =
+      framework::make_ddim({batch_size, output_c * 3});
+
+  VariableNameMap inputs;
+  VariableNameMap outputs;
+  auto scope = std::make_shared<framework::Scope>();
+  inputs["Input"] = std::vector<std::string>({"input"});
+  inputs["Weight"] = std::vector<std::string>({"weight"});
+  inputs["H0"] = std::vector<std::string>({"h0"});
+  inputs["Bias"] = std::vector<std::string>({"bias"});
+
+  outputs["BatchGate"] = std::vector<std::string>({"output_batch_gate"});
+  outputs["BatchResetHiddenPrev"] =
+      std::vector<std::string>({"output_batch_reset_hidden_prev"});
+  outputs["BatchHidden"] = std::vector<std::string>({"output_batch_hidden"});
+  outputs["Hidden"] = std::vector<std::string>({"output_hidden"});
+
+  auto input_var = scope.get()->Var("input");
+  auto input = input_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<T>(input, input_shape, -127, 127);
+  input->set_lod(lod);
+
+  auto weight_var = scope.get()->Var("weight");
+  auto weight = weight_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<T>(weight, weight_shape, -127, 127);
+
+  auto h0_var = scope.get()->Var("h0");
+  auto h0 = h0_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<T>(h0, h0_shape, -127, 127);
+
+  auto bias_var = scope.get()->Var("bias");
+  auto bias = bias_var->template GetMutable<framework::LoDTensor>();
+  SetupTensor<T>(bias, bias_shape, -127, 127);
+
+  auto batch_gate_var = scope.get()->Var("output_batch_gate");
+  auto batch_reset_hidden_prev_var =
+      scope.get()->Var("output_batch_reset_hidden_prev");
+  auto batch_hidden_var = scope.get()->Var("output_batch_hidden");
+  auto hidden_var = scope.get()->Var("output_hidden");
+
+  framework::AttributeMap attrs;
+  attrs["activation"].SetString(std::string("relu"));
+  attrs["gate_activation"].SetString(std::string("sigmoid"));
+  attrs["is_reverse"].Set<bool>(false);
+
+  auto *op =
+      new operators::GruOp<CPU, float>("gru", inputs, outputs, attrs, scope);
+  op->InferShape();
+  op->Init();
+  // warm up
+  for (int i = 0; i < 10; ++i) {
+    op->Run();
+  }
+  auto time1 = time();
+  for (int i = 0; i < 10; ++i) {
+    op->Run();
+  }
+  auto time2 = time();
+  std::ofstream out_file("./out_gru.txt", std::ios::app);
+  out_file << opname << " cost :" << time_diff(time1, time2) / 10.0 << "ms"
+           << std::endl;
+  out_file.close();
+
+  delete op;
+  return 0;
+}
+
+}  // namespace paddle_mobile
+
+int main(int argc, char *argv[]) {
+  paddle_mobile::TestGruOp<float>(384, 120, "gru_forward");
   return 0;
 }
-- 
GitLab