Commit b0f4eae6 authored by jackzhang235

Merge branch 'Add_GatherOp' into develop

......@@ -14,7 +14,7 @@ jobs:
steps:
- uses: actions/checkout@v2
- name: modify build.sh
run: sed -i 's/DLITE_WITH_PYTHON=ON/DLITE_WITH_PYTHON=OFF/' lite/tools/build_mlu.sh && sed -i 's/WITH_TESTING=OFF/WITH_TESTING=ON/' lite/tools/build_mlu.sh && sed -i 's/PRINT_HW_TIME false/PRINT_HW_TIME true/' lite/kernels/mlu/bridges/graph.h
run: sed -i 's/DLITE_WITH_PYTHON=ON/DLITE_WITH_PYTHON=OFF/' lite/tools/build_mlu.sh && sed -i 's/WITH_TESTING=OFF/WITH_TESTING=ON/' lite/tools/build_mlu.sh && sed -i 's/PRINT_HW_TIME false/PRINT_HW_TIME true/' lite/kernels/mlu/bridges/graph.h && sed -i 's/BUILD_EXTRA=OFF/BUILD_EXTRA=ON/' lite/tools/build_mlu.sh
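# Note: flipping BUILD_EXTRA=ON is what pulls the new gather (and lrn) bridges into the build, since they are guarded by LITE_BUILD_EXTRA in the CMake changes below.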
- name: build
run: ./lite/tools/build_mlu.sh build
- name: test_act_converter_mlu
......@@ -47,6 +47,10 @@ jobs:
run: ./build.lite.mlu/lite/kernels/mlu/bridges/test_argmax_converter_mlu
- name: test_split_converter_mlu
run: ./build.lite.mlu/lite/kernels/mlu/bridges/test_split_converter_mlu
- name: test_lrn_converter_mlu
run: ./build.lite.mlu/lite/kernels/mlu/bridges/test_lrn_converter_mlu
- name: test_gather_converter_mlu
run: ./build.lite.mlu/lite/kernels/mlu/bridges/test_gather_converter_mlu
- name: test_classification
run: |
cd ..
......
......@@ -50,6 +50,8 @@ set(mlu_subgraph_bridges
if (LITE_BUILD_EXTRA)
lite_cc_library(subgraph_bridge_lrn_op_mlu SRCS lrn_op.cc DEPS ${subgraph_bridge_deps_mlu})
list(APPEND mlu_subgraph_bridges subgraph_bridge_lrn_op_mlu)
lite_cc_library(subgraph_bridge_gather_op_mlu SRCS gather_op.cc DEPS ${subgraph_bridge_deps_mlu})
list(APPEND mlu_subgraph_bridges subgraph_bridge_gather_op_mlu)
endif()
lite_cc_library(subgraph_test_helper_mlu SRCS test_helper.cc DEPS ${mlu_subgraph_bridges})
......@@ -71,5 +73,6 @@ lite_cc_test(test_argmax_converter_mlu SRCS argmax_op_test.cc DEPS scope optimiz
lite_cc_test(test_squeeze_converter_mlu SRCS squeeze_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
if (LITE_BUILD_EXTRA)
lite_cc_test(test_lrn_converter_mlu SRCS lrn_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
lite_cc_test(test_gather_converter_mlu SRCS gather_op_test.cc DEPS scope optimizer target_wrapper_host model_parser program ${mlu_subgraph_bridges} subgraph_compute_mlu subgraph_test_helper_mlu)
endif()
message(STATUS "+++++ mlu_subgraph_bridges: ${mlu_subgraph_bridges}")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/mlu/bridges/graph.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[MLU] Converting " + op_type + "...";
auto x_var_name = op_info->Input("X").front();
auto index_var_name = op_info->Input("Index").front();
auto out_var_name = op_info->Output("Out").front();
auto output = scope->FindVar(out_var_name)->GetMutable<Tensor>();
auto output_dims = output->dims().Vectorize();
CHECK(graph->HasNode(x_var_name));
auto x_tensor = graph->GetNode(x_var_name);
auto index_tensor = graph->GetNode(index_var_name);
auto output_tensor = graph->AddNode(
out_var_name, output_dims, CNML_TENSOR, CNML_NCHW, graph->FPType());
cnmlBaseOp_t gather_op;
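// Gather along the first (batch) dimension: Paddle's gather indexes axis 0,
// which this bridge maps to CNML_DIM_N.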
CNML_CALL(cnmlCreateGatherV2Op(&gather_op,
x_tensor->mlu_tensor(),
index_tensor->mlu_tensor(),
output_tensor->mlu_tensor(),
CNML_DIM_N));
graph->FuseOp(gather_op);
CNML_CALL(cnmlDestroyBaseOp(&gather_op));
return SUCCESS;
}
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(gather,
kMLU,
paddle::lite::subgraph::mlu::GatherConverter);
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/gather_op.h"
#include <gtest/gtest.h>
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/mlu/bridges/test_helper.h"
#include "lite/kernels/mlu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace mlu {
template <typename dtype>
void gather_ref(const std::shared_ptr<operators::GatherOp> op) {
Scope* scope = op->scope();
const OpInfo* op_info = op->op_info();
auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
auto index =
scope->FindVar(op_info->Input("Index").front())->GetMutable<Tensor>();
auto out =
scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
auto x_dims = x->dims();
auto index_dims = index->dims();
CHECK(index_dims.size() == 1 ||
(index_dims.size() == 2 && index_dims[1] == 1));
int batch_size = index_dims[0];
DDim out_dims = x_dims;
out_dims[0] = batch_size;
out->Resize(out_dims);
auto x_data = x->data<float>();
auto index_data = index->data<int>();
auto out_data = out->mutable_data<float>();
auto slice_num = x_dims[0];
auto slice_size = x_dims.Slice(1, x_dims.size()).production();
for (int i = 0; i < batch_size; i++) {
auto index_value = index_data[i];
CHECK_LT(index_value, slice_num) << "index < slice_num";
CHECK_GE(index_value, 0) << "index >= 0";
memcpy(out_data + i * slice_size,
x_data + index_value * slice_size,
slice_size * sizeof(float));
}
}
void test_gather() {
// prepare input&output variables
std::string x_var_name = "x";
std::string out_var_name = "out";
std::string out_ref_var_name = "out_ref";
std::string index_var_name = "index";
Scope scope;
auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
auto* index = scope.Var(index_var_name)->GetMutable<Tensor>();
x->Resize({5, 4, 3, 2});
index->Resize({2});
// initialize input&output data
FillTensor<float>(x);
FillTensor<int>(index, 1, 3);
// initialize op desc
cpp::OpDesc opdesc;
opdesc.SetType("gather");
opdesc.SetInput("X", {x_var_name});
opdesc.SetInput("Index", {index_var_name});
opdesc.SetOutput("Out", {out_var_name});
auto op = CreateOp<operators::GatherOp>(opdesc, &scope);
gather_ref<float>(op);
out_ref->CopyDataFrom(*out);
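// The MLU side computes in NHWC layout, so transpose the NCHW input to NHWC
// before launching the op (the result is transposed back below for comparison).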
Tensor input;
input.Resize({5, 4, 3, 2});
transpose<float*>(x->mutable_data<float>(),
input.mutable_data<float>(),
{5, 4, 3, 2},
{0, 2, 3, 1});
x->CopyDataFrom(input);
LaunchOp(op, {x_var_name, index_var_name}, {out_var_name});
// compare results
auto* out_data = out->mutable_data<float>();
auto* out_ref_data = out_ref->mutable_data<float>();
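// Transpose the NHWC output produced on the MLU back to NCHW before comparing
// against the CPU reference.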
Tensor output;
output.Resize(out->dims());
transpose<float*>(out_data,
output.mutable_data<float>(),
{static_cast<int>(out->dims()[0]),
static_cast<int>(out->dims()[2]),
static_cast<int>(out->dims()[3]),
static_cast<int>(out->dims()[1])},
{0, 3, 1, 2});
out_data = output.mutable_data<float>();
for (int i = 0; i < out->dims().production(); i++) {
VLOG(5) << i;
EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4);
}
}
TEST(MLUBridges, gather) { test_gather(); }
} // namespace mlu
} // namespace subgraph
} // namespace lite
} // namespace paddle
USE_SUBGRAPH_BRIDGE(gather, kMLU);
......@@ -38,5 +38,6 @@ USE_SUBGRAPH_BRIDGE(slice, kMLU);
USE_SUBGRAPH_BRIDGE(squeeze, kMLU);
USE_SUBGRAPH_BRIDGE(squeeze2, kMLU);
#ifdef LITE_BUILD_EXTRA
USE_SUBGRAPH_BRIDGE(gather, kMLU);
USE_SUBGRAPH_BRIDGE(lrn, kMLU);
#endif
......@@ -50,23 +50,54 @@ void LaunchOp(const std::shared_ptr<lite::OpLite> op,
// Convert input data var and add it into the MLU IR graph
for (auto& input_name : input_var_names) {
auto input_tensor = scope->FindMutableTensor(input_name);
auto data_type = input_tensor->precision();
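// Map the tensor's PrecisionType to the matching CNML data type;
// unsupported precisions abort via CHECK(0).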
cnmlDataType_t fp_type;
switch (data_type) {
case paddle::lite_api::PrecisionType::kFP16:
fp_type = CNML_DATA_FLOAT16;
break;
case paddle::lite_api::PrecisionType::kFloat:
fp_type = CNML_DATA_FLOAT32;
break;
case paddle::lite_api::PrecisionType::kInt32:
fp_type = CNML_DATA_INT32;
break;
default:
CHECK(0);
}
CHECK(input_tensor);
Tensor temp_input;
temp_input.Resize(input_tensor->dims().Vectorize());
temp_input.CopyDataFrom(*input_tensor);
auto input_node =
graph.AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
graph.FPType(),
reinterpret_cast<void*>(
input_tensor->mutable_data<float>(TARGET(kMLU))));
CHECK(input_node);
CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data<float>(),
temp_input.mutable_data<float>(),
sizeof(float) * input_tensor->dims().production(),
CNRT_MEM_TRANS_DIR_HOST2DEV));
if (fp_type == CNML_DATA_INT32) {
auto input_node =
graph.AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
fp_type,
reinterpret_cast<void*>(
input_tensor->mutable_data<int>(TARGET(kMLU))));
CHECK(input_node);
CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data<int>(),
temp_input.mutable_data<int>(),
sizeof(int) * input_tensor->dims().production(),
CNRT_MEM_TRANS_DIR_HOST2DEV));
} else {
auto input_node =
graph.AddNode(input_name,
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
fp_type,
reinterpret_cast<void*>(
input_tensor->mutable_data<float>(TARGET(kMLU))));
CHECK(input_node);
CNRT_CHECK(cnrtMemcpy(input_tensor->mutable_data<float>(),
temp_input.mutable_data<float>(),
sizeof(float) * input_tensor->dims().production(),
CNRT_MEM_TRANS_DIR_HOST2DEV));
}
}
op->CheckShape();
op->InferShape();
......
......@@ -100,6 +100,21 @@ class SubgraphEngine : public subgraph::Engine {
return true;
}
inline cnmlDataType_t PrecisionToDatatype(PrecisionType data_type) {
switch (data_type) {
case paddle::lite_api::PrecisionType::kFP16:
return CNML_DATA_FLOAT16;
case paddle::lite_api::PrecisionType::kFloat:
return CNML_DATA_FLOAT32;
case paddle::lite_api::PrecisionType::kInt32:
return CNML_DATA_INT32;
case paddle::lite_api::PrecisionType::kInt8:
return CNML_DATA_INT8;
default:
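// Unknown precision: fall back to the engine's default
// (fp_type_ is assumed to hold a PrecisionType).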
return PrecisionToDatatype(fp_type_);
}
}
protected:
int BuildDeviceProgram() override {
int status = 0;
......@@ -113,6 +128,8 @@ class SubgraphEngine : public subgraph::Engine {
status |= subgraph::REBUILD_WHEN_SHAPE_CHANGED;
for (auto& input_name : input_names_) {
auto input_tensor = scope_->FindMutableTensor(input_name);
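// Use each input's own precision rather than the graph-wide FP type,
// so e.g. gather's int32 Index tensor keeps its integer type.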
auto data_type = input_tensor->precision();
cnmlDataType_t fp_type = PrecisionToDatatype(data_type);
origin_itensors_.push_back(input_tensor);
if (GetBoolFromEnv("BATCH_SIZE_CHANGEABLE")) {
auto iv = input_tensor->dims().Vectorize();
......@@ -127,7 +144,7 @@ class SubgraphEngine : public subgraph::Engine {
input_tensor->dims().Vectorize(),
CNML_TENSOR,
CNML_NCHW,
graph->FPType());
fp_type);
CHECK(input_node);
// MLU doesn't support dynamic dimensions/shapes, so need to rebuild
// the program when the shape of any input tensor is changed.
......