Unverified commit ae3ebea5, authored by cc and committed by GitHub

Fix gather and concat, add abs op, test=develop (#3395)

Parent 4a7284f9
......@@ -187,6 +187,10 @@ void Run(const std::vector<int64_t>& input_shape,
}
LOG(INFO) << "max_value:" << max_value;
LOG(INFO) << "max_index:" << max_index;
LOG(INFO) << "output data[0:10]:";
for (int i = 0; i < 10; i++) {
LOG(INFO) << out_data[i];
}
}
}
#endif
......@@ -198,32 +202,33 @@ void print_usage() {
std::string help_info =
"Usage: \n"
"./benchmark_bin \n"
" --optimized_model_path (the path of the model that is optimized\n"
" by opt.) type: string \n"
" --model_dir (the path of the model that is not optimized by opt,\n"
" --optimized_model_path (The path of the model that is optimized\n"
" by opt. If the model is optimized, please set the param.) \n"
" type: string \n"
" --model_dir (The path of the model that is not optimized by opt,\n"
" the model and param files is under model_dir.) type: string \n"
" --model_filename (the filename of model file. When the model is\n "
" --model_filename (The filename of model file. When the model is\n "
" combined formate, please set model_file. Otherwise, it is not\n"
" necessary to set it.) type: string \n"
" --param_filename (the filename of param file, set param_file when\n"
" --param_filename (The filename of param file, set param_file when\n"
" the model is combined formate. Otherwise, it is not necessary\n"
" to set it.) type: string \n"
" --input_shape (set input shapes according to the model, separated by\n"
" --input_shape (Tet input shapes according to the model, separated by\n"
" colon and comma, such as 1,3,244,244) type: string\n"
" default: 1,3,224,224 \n"
" --input_img_path (the path of input image, if not set\n"
" --input_img_path (The path of input image, if not set\n"
" input_img_path, the input will be 1.0.) type: string \n "
" --power_mode (arm power mode: 0 for big cluster, 1 for little\n"
" --power_mode (Arm power mode: 0 for big cluster, 1 for little\n"
" cluster, 2 for all cores, 3 for no bind) type: int32 default: 3\n"
" --repeats (repeats times) type: int32 default: 1 \n"
" --result_filename (save the inference time to the file.) type: \n"
" --repeats (Repeats times) type: int32 default: 1 \n"
" --result_filename (Save the inference time to the file.) type: \n"
" string default: result.txt \n"
" --threads (threads num) type: int32 default: 1 \n"
" --warmup (warmup times) type: int32 default: 0 \n"
" --threads (Threads num) type: int32 default: 1 \n"
" --warmup (Warmup times) type: int32 default: 0 \n"
"Note that: \n"
" If load the optimized model, set optimized_model_path, or set\n"
" model_dir, model_filename and param_filename according to the\n"
" model. \n";
" If load the optimized model, set optimized_model_path. Otherwise, \n"
" set model_dir, model_filename and param_filename according to \n"
" the model. \n";
LOG(INFO) << help_info;
}
......
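For reference, a hypothetical invocation of benchmark_bin built from the flags documented above (the model path and values are placeholders, not taken from this patch):

./benchmark_bin \
    --optimized_model_path=/path/to/model_opt.nb \
    --input_shape=1,3,224,224 \
    --power_mode=3 --threads=1 --warmup=10 --repeats=100 \
    --result_filename=result.txt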
......@@ -295,6 +295,10 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kNCHW));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kInt64), DATALAYOUT(kNCHW));
// Analyze whether the model is quantized.
// For quantized model, add place(arm, int8) to inner_places
......
......@@ -744,6 +744,15 @@ void act_reciprocal<float>(const float* din,
}
}
template <>
void act_abs<float>(const float* din, float* dout, int size, int threads) {
for (int i = 0; i < size; ++i) {
dout[0] = (din[0] > 0 ? din[0] : -din[0]);
din++;
dout++;
}
}
#ifdef LITE_WITH_TRAIN
template <>
void act_square_grad(const float* din,
......
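A minimal sketch (not part of this patch) of calling the new act_abs<float> specialization directly; the include path is assumed, and the buffers are illustrative:

#include <vector>
// Assumed location of the act_* declarations touched by this patch.
#include "lite/backends/arm/math/activation.h"

void abs_example() {
  std::vector<float> din = {-1.5f, 0.f, 2.f, -3.f};
  std::vector<float> dout(din.size());
  // The last argument is the thread count; this specialization does not use it.
  paddle::lite::arm::math::act_abs<float>(
      din.data(), dout.data(), static_cast<int>(din.size()), 1);
  // dout now holds {1.5f, 0.f, 2.f, 3.f}.
}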
......@@ -83,6 +83,9 @@ void act_hard_swish(const T* din,
template <typename T>
void act_reciprocal(const T* din, T* dout, int size, int threads);
template <typename T>
void act_abs(const T* din, T* dout, int size, int threads);
#ifdef LITE_WITH_TRAIN
template <typename T>
void act_square_grad(
......
......@@ -16,46 +16,3 @@
#include <algorithm>
#include <limits>
#include <memory>
#include "lite/backends/arm/math/funcs.h"
namespace paddle {
namespace lite {
namespace arm {
namespace math {
void concat_func(const std::vector<lite::Tensor *> &input,
const int axis,
lite::Tensor *output) {
int64_t concat_input_size = 1;
int64_t num_cancats = 1;
auto dim_0 = input[0]->dims();
size_t num = input.size();
for (int i = axis + 1; i < dim_0.size(); i++) {
concat_input_size *= dim_0[i];
}
for (int i = 0; i < axis; i++) {
num_cancats *= dim_0[i];
}
float *dst_ptr = output->mutable_data<float>();
const int out_concat_axis = output->dims()[axis];
int64_t offset_concat_axis = 0;
int64_t out_sum = out_concat_axis * concat_input_size;
for (int n = 0; n < num; n++) {
auto dims = input[n]->dims();
const float *src_ptr = input[n]->data<float>();
int64_t in_concat_axis = dims[axis];
float *dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
int64_t in_sum = in_concat_axis * concat_input_size;
for (int i = 0; i < num_cancats; i++) {
std::memcpy(dout_ptr, src_ptr, sizeof(float) * in_sum);
dout_ptr += out_sum;
src_ptr += in_sum;
}
offset_concat_axis += in_concat_axis;
}
}
} // namespace math
} // namespace arm
} // namespace lite
} // namespace paddle
......@@ -25,9 +25,39 @@ namespace lite {
namespace arm {
namespace math {
void concat_func(const std::vector<lite::Tensor *> &input,
template <typename T>
void concat_func(const std::vector<lite::Tensor*>& input,
const int axis,
lite::Tensor *output);
lite::Tensor* output) {
size_t num = input.size();
auto dim_0 = input[0]->dims();
int64_t concat_input_size = 1;
int64_t num_cancats = 1;
for (int i = axis + 1; i < dim_0.size(); i++) {
concat_input_size *= dim_0[i];
}
for (int i = 0; i < axis; i++) {
num_cancats *= dim_0[i];
}
auto* dst_ptr = output->mutable_data<T>();
const int out_concat_axis = output->dims()[axis];
int64_t offset_concat_axis = 0;
int64_t out_sum = out_concat_axis * concat_input_size;
for (int n = 0; n < num; n++) {
auto dims = input[n]->dims();
auto* src_ptr = input[n]->data<T>();
int64_t in_concat_axis = dims[axis];
auto* dout_ptr = dst_ptr + offset_concat_axis * concat_input_size;
int64_t in_sum = in_concat_axis * concat_input_size;
for (int i = 0; i < num_cancats; i++) {
std::memcpy(dout_ptr, src_ptr, sizeof(T) * in_sum);
dout_ptr += out_sum;
src_ptr += in_sum;
}
offset_concat_axis += in_concat_axis;
}
}
} // namespace math
} // namespace arm
......
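To make the interleaved copy in the templated concat_func easier to follow, here is a standalone sketch of the same pattern on raw float arrays, independent of lite::Tensor (shapes and values are illustrative):

#include <cstring>

// Concatenate two 2x3 blocks along axis 1 into one 2x6 block (row-major).
void concat_axis1_example() {
  const float a[6] = {0, 1, 2, 3, 4, 5};
  const float b[6] = {6, 7, 8, 9, 10, 11};
  float out[12];
  const float* inputs[2] = {a, b};
  const int num_concats = 2;        // product of dims before the axis
  const int concat_input_size = 1;  // product of dims after the axis
  const int in_concat_axis = 3;     // each input's extent on the axis
  const int out_sum = 6 * concat_input_size;  // output step per concat slice
  int offset_concat_axis = 0;
  for (int n = 0; n < 2; n++) {
    const float* src = inputs[n];
    float* dst = out + offset_concat_axis * concat_input_size;
    const int in_sum = in_concat_axis * concat_input_size;
    for (int i = 0; i < num_concats; i++) {
      std::memcpy(dst, src, sizeof(float) * in_sum);
      dst += out_sum;
      src += in_sum;
    }
    offset_concat_axis += in_concat_axis;
  }
  // out is now {0,1,2,6,7,8, 3,4,5,9,10,11}, i.e. rows {0,1,2,6,7,8} and
  // {3,4,5,9,10,11}.
}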
......@@ -207,6 +207,16 @@ void ReciprocalCompute::Run() {
x_data, output_data, x_dims.production(), ctx.threads());
}
void AbsCompute::Run() {
auto& param = this->Param<param_t>();
auto& ctx = this->ctx_->template As<ARMContext>();
auto x_dims = param.X->dims();
auto x_data = param.X->data<float>();
auto output_data = param.Out->mutable_data<float>();
lite::arm::math::act_abs<float>(
x_data, output_data, x_dims.production(), ctx.threads());
}
} // namespace arm
} // namespace kernels
} // namespace lite
......@@ -321,3 +331,8 @@ REGISTER_LITE_KERNEL(reciprocal,
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
REGISTER_LITE_KERNEL(
abs, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::AbsCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
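A linked test or demo would typically pull this kernel in with the USE_LITE_KERNEL macro, mirroring how the concat test at the end of this diff references its kernel (a sketch, not part of the patch):

USE_LITE_KERNEL(abs, kARM, kFloat, kNCHW, def);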
......@@ -166,6 +166,15 @@ class ReciprocalCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
virtual ~ReciprocalCompute() = default;
};
class AbsCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
public:
using param_t = operators::ActivationParam;
void Run() override;
virtual ~AbsCompute() = default;
};
} // namespace arm
} // namespace kernels
} // namespace lite
......
......@@ -34,40 +34,21 @@ std::vector<size_t> stride_numel(const DDim& ddim) {
return strides;
}
void ConcatCompute::Run() {
auto& param = Param<operators::ConcatParam>();
std::vector<lite::Tensor*> inputs = param.x;
auto* out = param.output;
int axis = param.axis;
auto* axis_tensor = param.axis_tensor;
if (axis_tensor != nullptr) {
auto* axis_tensor_data = axis_tensor->data<int>();
axis = axis_tensor_data[0];
}
out->mutable_data<float>();
/// Sometimes direct copies will be faster, this maybe need deeply analysis.
template <typename T>
void ConcatFunc(const std::vector<lite::Tensor*> inputs,
int axis,
lite::Tensor* out) {
// Sometimes a direct copy is faster; this may need deeper analysis.
if (axis == 0 && inputs.size() < 10) {
size_t output_offset = 0;
for (auto* in : inputs) {
auto in_stride = stride_numel(in->dims());
auto out_stride = stride_numel(out->dims());
void* dst = out->mutable_data<float>() + output_offset;
const void* src = in->data<float>();
#if 0
LOG(INFO) << "out_stride.size():" << out_stride.size();
LOG(INFO) << "out_stride[0]" << out_stride[0];
for (int i=0; i < out_stride.size(); ++i) {
LOG(INFO) << "out_stride[" << i << "]:" << out_stride[i];
}
LOG(INFO) << "in_stride.size():" << in_stride.size();
for (int i=0; i < in_stride.size(); ++i) {
LOG(INFO) << "in_stride[" << i << "]:" << in_stride[i];
}
#endif
void* dst = out->mutable_data<T>() + output_offset;
const void* src = in->data<T>();
// src and dst tensor should have the same dims size.
CHECK(in_stride.size() == out_stride.size());
std::memcpy(dst, src, sizeof(float) * in_stride[0]);
std::memcpy(dst, src, sizeof(T) * in_stride[0]);
output_offset += in_stride[0];
}
} else {
......@@ -75,9 +56,37 @@ void ConcatCompute::Run() {
for (int j = 0; j < inputs.size(); ++j) {
inputs_concat[j] = inputs[j];
}
lite::arm::math::concat_func(inputs_concat, axis, out);
lite::arm::math::concat_func<T>(inputs_concat, axis, out);
}
}
void ConcatCompute::Run() {
auto& param = Param<operators::ConcatParam>();
std::vector<lite::Tensor*> inputs = param.x;
CHECK_GE(inputs.size(), 1);
auto* out = param.output;
int axis = param.axis;
auto* axis_tensor = param.axis_tensor;
if (axis_tensor != nullptr) {
auto* axis_tensor_data = axis_tensor->data<int>();
axis = axis_tensor_data[0];
}
switch (inputs.front()->precision()) {
case PRECISION(kFloat):
ConcatFunc<float>(inputs, axis, out);
break;
case PRECISION(kInt32):
ConcatFunc<int32_t>(inputs, axis, out);
break;
case PRECISION(kInt64):
ConcatFunc<int64_t>(inputs, axis, out);
break;
default:
LOG(FATAL) << "Concat does not implement for the "
<< "input type:"
<< static_cast<int>(inputs.front()->precision());
}
return;
}
} // namespace arm
......@@ -86,9 +95,9 @@ void ConcatCompute::Run() {
} // namespace paddle
REGISTER_LITE_KERNEL(
concat, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ConcatCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
concat, kARM, kAny, kNCHW, paddle::lite::kernels::arm::ConcatCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindInput("AxisTensor",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();
......@@ -22,7 +22,7 @@ namespace lite {
namespace kernels {
namespace arm {
class ConcatCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
class ConcatCompute : public KernelLite<TARGET(kARM), PRECISION(kAny)> {
public:
using param_t = operators::ConcatParam;
......
......@@ -95,7 +95,7 @@ void concat_compute_ref(const operators::ConcatParam& param) {
TEST(concat_arm, init) {
ConcatCompute concat;
ASSERT_EQ(concat.precision(), PRECISION(kFloat));
ASSERT_EQ(concat.precision(), PRECISION(kAny));
ASSERT_EQ(concat.target(), TARGET(kARM));
}
......@@ -222,8 +222,7 @@ TEST(concat_arm, compute_input_multi) {
TEST(concat, retrive_op) {
auto concat =
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kFloat)>(
"concat");
KernelRegistry::Global().Create<TARGET(kARM), PRECISION(kAny)>("concat");
ASSERT_FALSE(concat.empty());
ASSERT_TRUE(concat.front());
}
......@@ -233,4 +232,4 @@ TEST(concat, retrive_op) {
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kAny, kNCHW, def);
......@@ -20,24 +20,48 @@ namespace lite {
namespace kernels {
namespace arm {
void GatherCompute::Run() {
auto& param = this->Param<operators::GatherParam>();
auto* p_output = param.Out->mutable_data<float>();
auto index_size = param.Index->dims()[0];
template <typename T>
void GatherFunc(const operators::GatherParam& param) {
auto src_dims = param.X->dims();
const float* p_src = param.X->data<float>();
auto index_size = param.Index->dims()[0];
auto* p_src = param.X->data<T>();
const int* p_index = param.Index->data<int>();
auto* p_output = param.Out->mutable_data<T>();
int slice_size = 1;
for (int i = 1; i < src_dims.size(); ++i) {
for (size_t i = 1; i < src_dims.size(); ++i) {
slice_size *= src_dims[i];
}
for (int i = 0; i < index_size; ++i) {
int index_ = p_index[i];
memcpy(p_output + i * slice_size,
p_src + index_ * slice_size,
slice_size * sizeof(float));
slice_size * sizeof(T));
}
}
void GatherCompute::Run() {
auto& param = this->Param<operators::GatherParam>();
switch (param.X->precision()) {
case PRECISION(kFloat):
GatherFunc<float>(param);
break;
case PRECISION(kInt8):
GatherFunc<int8_t>(param);
break;
case PRECISION(kInt16):
GatherFunc<int16_t>(param);
break;
case PRECISION(kInt32):
GatherFunc<int32_t>(param);
break;
case PRECISION(kInt64):
GatherFunc<int64_t>(param);
break;
default:
LOG(FATAL) << "Gather does not implement for the "
<< "input type:" << static_cast<int>(param.X->precision());
}
}
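The slice copy performed by GatherFunc can be sketched on raw arrays as follows (sizes and values are illustrative, not from the patch):

#include <cstring>

// Pick rows 2 and 0 from a 3x4 row-major source; slice_size is the product
// of all dims after the first, i.e. 4 here.
void gather_example() {
  const float src[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  const int index[2] = {2, 0};
  float out[8];
  const int slice_size = 4;
  for (int i = 0; i < 2; ++i) {
    std::memcpy(out + i * slice_size, src + index[i] * slice_size,
                slice_size * sizeof(float));
  }
  // out is now {8, 9, 10, 11, 0, 1, 2, 3}.
}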
......@@ -48,8 +72,8 @@ void GatherCompute::Run() {
REGISTER_LITE_KERNEL(
gather, kARM, kAny, kNCHW, paddle::lite::kernels::arm::GatherCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.BindInput("Index",
{LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny))})
.Finalize();