Add gather op on x86 platform (#2419)

* add gather op on x86 platform and add its unittests, test=develop

Add gather op on x86 platform (#2419)
* add gather op on x86 platform and add its unittests, test=develop
054c334f · liu zhengxi · GitHub · 5df88f21 · 054c334f · 054c334f
4 changed files
--- a/lite/kernels/x86/CMakeLists.txt
+++ b/lite/kernels/x86/CMakeLists.txt
@@ -29,6 +29,7 @@ add_kernel(sequence_expand_as_compute_x86 X86 basic SRCS sequence_expand_as_comp
 # lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
 # lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
+add_kernel(gather_compute_x86 X86 basic SRCS gather_compute.cc DEPS ${lite_kernel_deps})
 # lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
 # lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
 # lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
@@ -65,6 +66,7 @@ add_kernel(matmul_compute_x86 X86 basic SRCS matmul_compute.cc DEPS ${lite_kerne
 lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
 lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86)
+lite_cc_test(test_gather_compute_x86 SRCS gather_compute_test.cc DEPS gather_compute_x86)
 lite_cc_test(test_slice_compute_x86 SRCS slice_compute_test.cc DEPS slice_compute_x86)
 lite_cc_test(test_squeeze_compute_x86 SRCS squeeze_compute_test.cc DEPS squeeze_compute_x86)
 lite_cc_test(test_fill_constant_batch_size_like_compute_x86 SRCS fill_constant_batch_size_like_compute_test.cc DEPS fill_constant_batch_size_like_compute_x86)

--- a/lite/kernels/x86/gather_compute.cc
+++ b/lite/kernels/x86/gather_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "lite/kernels/x86/gather_compute.h"
+// Kernel aliases: float data gathered with 32-bit or 64-bit integer indices.
+// The first registration below (alias "def") binds Index as an int32 tensor,
+// the second one (alias "int64_in") binds Index as an int64 tensor; X and Out
+// are bound with the default x86 tensor type in both.
+using GatherInt32 = paddle::lite::kernels::x86::GatherCompute<float, int32_t>;
+using GatherInt64 = paddle::lite::kernels::x86::GatherCompute<float, int64_t>;
+REGISTER_LITE_KERNEL(gather, kX86, kFloat, kNCHW, GatherInt32, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Index",
+               {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
+REGISTER_LITE_KERNEL(gather, kX86, kFloat, kNCHW, GatherInt64, int64_in)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Index",
+               {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
--- a/lite/kernels/x86/gather_compute.h
+++ b/lite/kernels/x86/gather_compute.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <cstring>
+#include <vector>
+#include "lite/api/paddle_place.h"
+#include "lite/core/kernel.h"
+#include "lite/core/op_registry.h"
+#include "lite/core/types.h"
+#include "lite/fluid/data_type.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+/**
+ * Gathers slices of a CPU tensor along its first dimension.
+ *
+ * For every i in [0, index->dims()[0]) the slice src[index[i]] (the product
+ * of src's dimensions 1..rank-1 elements of type T) is copied to slot i of
+ * output.
+ *
+ * input[src]: type-T source Tensor with at least one dimension
+ * input[index]: type-IndexT index Tensor, 1-D or 2-D with dims()[1] == 1
+ * output: destination Tensor; its buffer is obtained via mutable_data<T>(),
+ *         so it is presumably resized by the caller beforehand (not checked
+ *         here)
+ *
+ * Fails a CHECK when the index shape is unsupported, when src has no
+ * dimension, or when an index value lies outside [0, src->dims()[0]).
+ */
+template <typename T, typename IndexT = int>
+void CPUGather(const lite::Tensor* src,
+               const lite::Tensor* index,
+               lite::Tensor* output) {
+  // check index of shape 1-D
+  if (index->dims().size() == 2) {
+    CHECK(index->dims()[1] == 1) << "Index(Input)'s dimension[1] should be 1 "
+                                    "when Index(input)'s dimension's size "
+                                    "equal to 2 in Gather(Op).";
+  } else {
+    CHECK(index->dims().size() == 1)
+        << "Index(Input)'s dimension's size() should be 1 or 2 in Gather(Op).";
+  }
+  const int64_t index_size = index->dims()[0];
+  auto src_dims = src->dims();
+  const int src_rank = static_cast<int>(src_dims.size());
+  CHECK(src_rank >= 1)
+      << "X(Input) should have at least 1 dimension in Gather(Op).";
+  // Number of gatherable slices; used to validate every index value.
+  const int64_t row_count = src_dims[0];
+  const T* p_src = src->data<T>();
+  const IndexT* p_index = index->data<IndexT>();
+  T* p_output = output->mutable_data<T>();
+  // Elements per slice, kept in 64 bits so large tensors cannot overflow.
+  int64_t slice_size = 1;
+  for (int i = 1; i < src_rank; ++i) slice_size *= src_dims[i];
+  const size_t slice_bytes = static_cast<size_t>(slice_size) * sizeof(T);
+  for (int64_t i = 0; i < index_size; ++i) {
+    // Keep the full 64-bit value: IndexT may be int64_t and must not be
+    // narrowed to int before it is used as an offset.
+    const int64_t row = static_cast<int64_t>(p_index[i]);
+    CHECK(row >= 0 && row < row_count)
+        << "Index(Input)'s value " << row << " is out of range [0, "
+        << row_count << ") in Gather(Op).";
+    std::memcpy(
+        p_output + i * slice_size, p_src + row * slice_size, slice_bytes);
+  }
+}
+/**
+ * x86 kernel for the gather operator.
+ * T is the element type of X/Out and IndexT the integer type stored in the
+ * Index tensor. The kernel itself is declared with float precision.
+ */
+template <typename T, typename IndexT>
+class GatherCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::GatherParam;
+  virtual ~GatherCompute() = default;
+  /**
+   * Requests Out's buffer, then gathers the slices of X selected by Index
+   * into Out. When X holds no elements the method returns right after the
+   * buffer request, without copying anything.
+   */
+  void Run() override {
+    auto& gather_param = *param_.get_mutable<param_t>();
+    auto src = gather_param.X;
+    auto indices = gather_param.Index;
+    auto dst = gather_param.Out;
+    // The buffer is requested before the empty-input check below.
+    dst->mutable_data<T>();
+    if (src->dims().production() == 0) {
+      return;
+    }
+    // Index values are read directly as IndexT by CPUGather; no conversion
+    // to float takes place.
+    CPUGather<T, IndexT>(src, indices, dst);
+  }
+};
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
--- a/lite/kernels/x86/gather_compute_test.cc
+++ b/lite/kernels/x86/gather_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "lite/kernels/x86/gather_compute.h"
+#include <gtest/gtest.h>
+#include <memory>
+#include <utility>
+#include <vector>
+#include "lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+// The registry must hand back exactly two non-null x86 float kernels for
+// "gather" (one per supported index type).
+TEST(gather_x86, retrive_op) {
+  auto found = KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
+      "gather");
+  ASSERT_FALSE(found.empty());
+  int kernel_count = 0;
+  for (auto& kernel : found) {
+    ++kernel_count;
+    ASSERT_TRUE(kernel);
+  }
+  ASSERT_EQ(kernel_count, 2);
+}
+// A default-constructed int32-index kernel reports float precision and the
+// x86 target.
+TEST(gather_x86, int32_init) {
+  GatherCompute<float, int32_t> kernel;
+  ASSERT_EQ(kernel.precision(), PRECISION(kFloat));
+  ASSERT_EQ(kernel.target(), TARGET(kX86));
+}
+// A default-constructed int64-index kernel reports float precision and the
+// x86 target.
+TEST(gather_x86, int64_init) {
+  GatherCompute<float, int64_t> kernel;
+  ASSERT_EQ(kernel.precision(), PRECISION(kFloat));
+  ASSERT_EQ(kernel.target(), TARGET(kX86));
+}
+// Runs the gather kernel on a 10-element 1-D tensor holding 0..9 with the
+// indices {1, 3, 5} stored as type T, and expects the output {1, 3, 5}.
+template <typename T>
+void test_case_1dims() {
+  lite::Tensor x, index, out;
+  std::vector<int64_t> x_dims{10};
+  std::vector<int64_t> index_dims{3};
+  std::vector<int64_t> out_dims{3};
+  x.Resize(lite::DDim(x_dims));
+  index.Resize(lite::DDim(index_dims));
+  out.Resize(lite::DDim(out_dims));
+  auto x_buf = x.mutable_data<float>();
+  auto index_buf = index.mutable_data<T>();
+  auto out_buf = out.mutable_data<float>();
+  // X[i] = i, so the gathered values equal the indices themselves.
+  for (int64_t pos = 0; pos < x.dims().production(); ++pos) {
+    x_buf[pos] = static_cast<float>(pos);
+  }
+  std::vector<float> picks{1, 3, 5};
+  for (int k = 0; k < index.dims().production(); ++k) {
+    index_buf[k] = static_cast<T>(picks[k]);
+  }
+  // Wire the tensors into the kernel and execute it in an x86 context.
+  operators::GatherParam param;
+  param.X = &x;
+  param.Index = &index;
+  param.Out = &out;
+  std::unique_ptr<KernelContext> ctx(new KernelContext);
+  ctx->As<X86Context>();
+  GatherCompute<float, T> kernel;
+  kernel.SetContext(std::move(ctx));
+  kernel.SetParam(param);
+  kernel.Run();
+  std::vector<float> expected{1, 3, 5};
+  for (int k = 0; k < out.dims().production(); k++) {
+    EXPECT_NEAR(out_buf[k], expected[k], 1e-5);
+  }
+}
+// Runs the gather kernel on a 10x20 tensor holding 0..199 in row-major order
+// with the row indices {1, 3, 5} stored as type T, and expects the 3x20
+// output to contain rows 1, 3 and 5 of the input.
+template <typename T>
+void test_case_2dims() {
+  lite::Tensor x, index, out;
+  std::vector<int64_t> x_dims{10, 20};
+  std::vector<int64_t> index_dims{3};
+  std::vector<int64_t> out_dims{3, 20};
+  x.Resize(lite::DDim(x_dims));
+  index.Resize(lite::DDim(index_dims));
+  out.Resize(lite::DDim(out_dims));
+  auto x_buf = x.mutable_data<float>();
+  auto index_buf = index.mutable_data<T>();
+  auto out_buf = out.mutable_data<float>();
+  for (int64_t pos = 0; pos < x.dims().production(); ++pos) {
+    x_buf[pos] = static_cast<float>(pos);
+  }
+  std::vector<float> picks{1, 3, 5};
+  for (int k = 0; k < index.dims().production(); ++k) {
+    index_buf[k] = static_cast<T>(picks[k]);
+  }
+  // Wire the tensors into the kernel and execute it in an x86 context.
+  operators::GatherParam param;
+  param.X = &x;
+  param.Index = &index;
+  param.Out = &out;
+  std::unique_ptr<KernelContext> ctx(new KernelContext);
+  ctx->As<X86Context>();
+  GatherCompute<float, T> kernel;
+  kernel.SetContext(std::move(ctx));
+  kernel.SetParam(param);
+  kernel.Run();
+  // Row r of the input starts at value r * 20, so output row k must hold
+  // picks[k] * 20 + column for every column.
+  const int row_width = 20;
+  std::vector<float> expected(60);
+  for (int k = 0; k < 3; ++k) {
+    for (int col = 0; col < row_width; ++col) {
+      expected[k * row_width + col] =
+          static_cast<float>(picks[k] * row_width + col);
+    }
+  }
+  for (int k = 0; k < out.dims().production(); k++) {
+    EXPECT_NEAR(out_buf[k], expected[k], 1e-5);
+  }
+}
+// Exercises the 1-D gather case once per supported index type.
+TEST(gather_x86, run_test_1dims) {
+  test_case_1dims<int32_t>();
+  test_case_1dims<int64_t>();
+}
+// Exercises the 2-D gather case once per supported index type.
+TEST(gather_x86, run_test_2dims) {
+  test_case_2dims<int32_t>();
+  test_case_2dims<int64_t>();
+}
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+USE_LITE_KERNEL(gather, kX86, kFloat, kNCHW, def);
+USE_LITE_KERNEL(gather, kX86, kFloat, kNCHW, int64_in);