未验证 提交 f99e28b4 编写于 作者: Z zhupengyang 提交者: GitHub

[NPU] support expand op (#3594)

上级 e9b94f04
...@@ -41,7 +41,6 @@ add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_ ...@@ -41,7 +41,6 @@ add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_
add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(unsqueeze_compute_arm ARM basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(unsqueeze_compute_arm ARM basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm) add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
...@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_ ...@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps}) add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps}) add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps})
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps}) add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(expand_compute_host Host basic SRCS expand_compute.cc DEPS ${lite_kernel_deps})
add_kernel(shape_compute_host Host extra SRCS shape_compute.cc DEPS ${lite_kernel_deps}) add_kernel(shape_compute_host Host extra SRCS shape_compute.cc DEPS ${lite_kernel_deps})
add_kernel(is_empty_compute_host Host extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps}) add_kernel(is_empty_compute_host Host extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps})
add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps}) add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
......
...@@ -12,24 +12,23 @@ ...@@ -12,24 +12,23 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "lite/kernels/arm/expand_compute.h" #include "lite/kernels/host/expand_compute.h"
#include <vector> #include <vector>
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace host {
void ExpandCompute::Run() { template <typename T, PrecisionType PType>
auto& param = Param<operators::ExpandParam>(); void ExpandCompute<T, PType>::Run() {
auto& param = this->template Param<operators::ExpandParam>();
const auto* x = param.X; const auto* x = param.X;
auto* out = param.Out; auto* out = param.Out;
std::vector<int> expand_times = param.expand_times; std::vector<int> expand_times = param.expand_times;
const float* src = x->data<float>(); const T* src = x->template data<T>();
float* dst = out->mutable_data<float>(); T* dst = out->template mutable_data<T>();
int dims = expand_times.size(); int dims = expand_times.size();
DDim in_shape = x->dims(); DDim in_shape = x->dims();
...@@ -42,7 +41,7 @@ void ExpandCompute::Run() { ...@@ -42,7 +41,7 @@ void ExpandCompute::Run() {
for (int k = 0; k < expand_times[i]; ++k) { for (int k = 0; k < expand_times[i]; ++k) {
memcpy(dst + (j * expand_times[i] + k) * inner_num, memcpy(dst + (j * expand_times[i] + k) * inner_num,
src + j * inner_num, src + j * inner_num,
sizeof(float) * inner_num); sizeof(T) * inner_num);
} }
} }
inner_num *= expand_times[i]; inner_num *= expand_times[i];
...@@ -53,20 +52,27 @@ void ExpandCompute::Run() { ...@@ -53,20 +52,27 @@ void ExpandCompute::Run() {
for (int k = expand_times[i] - 1; k >= 0; --k) { for (int k = expand_times[i] - 1; k >= 0; --k) {
memcpy(dst + (j * expand_times[i] + k) * inner_num, memcpy(dst + (j * expand_times[i] + k) * inner_num,
dst + j * inner_num, dst + j * inner_num,
sizeof(float) * inner_num); sizeof(T) * inner_num);
} }
} }
inner_num *= expand_times[i]; inner_num *= expand_times[i];
} }
} }
} // namespace arm } // namespace host
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
REGISTER_LITE_KERNEL( using expand_float =
expand, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ExpandCompute, def) paddle::lite::kernels::host::ExpandCompute<float, PRECISION(kFloat)>;
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))}) REGISTER_LITE_KERNEL(expand, kHost, kFloat, kAny, expand_float, def)
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))}) .BindInput("X",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.BindOutput("Out",
{LiteType::GetTensorTy(TARGET(kHost),
PRECISION(kFloat),
DATALAYOUT(kAny))})
.Finalize(); .Finalize();
...@@ -19,16 +19,18 @@ ...@@ -19,16 +19,18 @@
namespace paddle { namespace paddle {
namespace lite { namespace lite {
namespace kernels { namespace kernels {
namespace arm { namespace host {
class ExpandCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> { template <typename T, PrecisionType PType>
class ExpandCompute
: public KernelLite<TARGET(kHost), PType, DATALAYOUT(kAny)> {
public: public:
void Run() override; void Run() override;
virtual ~ExpandCompute() = default; virtual ~ExpandCompute() = default;
}; };
} // namespace arm } // namespace host
} // namespace kernels } // namespace kernels
} // namespace lite } // namespace lite
} // namespace paddle } // namespace paddle
...@@ -49,6 +49,7 @@ lite_cc_library(subgraph_bridge_fill_constant_op_npu SRCS fill_constant_op.cc DE ...@@ -49,6 +49,7 @@ lite_cc_library(subgraph_bridge_fill_constant_op_npu SRCS fill_constant_op.cc DE
lite_cc_library(subgraph_bridge_fill_constant_batch_size_like_op_npu SRCS fill_constant_batch_size_like_op.cc DEPS ${npu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_fill_constant_batch_size_like_op_npu SRCS fill_constant_batch_size_like_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_increment_op_npu SRCS increment_op.cc DEPS ${npu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_increment_op_npu SRCS increment_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_compare_op_npu SRCS compare_op.cc DEPS ${npu_subgraph_bridge_deps}) lite_cc_library(subgraph_bridge_compare_op_npu SRCS compare_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_expand_op_npu SRCS expand_op.cc DEPS ${npu_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_shape_op_npu SRCS shape_op.cc DEPS ${npu_subgraph_bridge_deps}) #lite_cc_library(subgraph_bridge_shape_op_npu SRCS shape_op.cc DEPS ${npu_subgraph_bridge_deps})
...@@ -87,6 +88,7 @@ set(npu_subgraph_bridges ...@@ -87,6 +88,7 @@ set(npu_subgraph_bridges
subgraph_bridge_fill_constant_batch_size_like_op_npu subgraph_bridge_fill_constant_batch_size_like_op_npu
subgraph_bridge_increment_op_npu subgraph_bridge_increment_op_npu
subgraph_bridge_compare_op_npu subgraph_bridge_compare_op_npu
subgraph_bridge_expand_op_npu
CACHE INTERNAL "npu_subgraph_bridges") CACHE INTERNAL "npu_subgraph_bridges")
message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}") message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}")
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {
// Bridges the paddle "expand" op onto the NPU graph as an HiAI Tile node:
// output dim i is input dim i repeated expand_times[i] times.
// Returns REBUILD_WHEN_SHAPE_CHANGED so the subgraph is rebuilt whenever
// the input shape changes (the tiled shape depends on it).
int ExpandConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  CHECK(ctx != nullptr);
  CHECK(op != nullptr);
  auto* graph = static_cast<Graph*>(ctx);
  auto* op_info = op->op_info();
  auto op_type = op_info->Type();
  auto* scope = op->scope();
  VLOG(3) << "[NPU] Converting " + op_type + "...";

  // Input/output names and the per-dimension repeat counts.
  auto x_name = op_info->Input("X").front();
  auto* x = scope->FindTensor(x_name);
  auto x_dims = x->dims();
  auto out_name = op_info->Output("Out").front();
  auto expand_times = op_info->GetAttr<std::vector<int>>("expand_times");

  // Reuse the graph node for X if an upstream op already produced it;
  // otherwise register the tensor as a new node.
  std::shared_ptr<Node> input_node =
      graph->Has(x_name) ? graph->Get(x_name) : graph->Add(x_name, *x);

  // Constant node carrying the repeat counts — the "w" input of Tile.
  auto times_node = graph->Add(out_name + "/w", expand_times);

  // Tile node that performs the expansion.
  auto tile_node = graph->Add<ge::op::Tile>(out_name);
  auto* tile_op = tile_node->data<ge::op::Tile>();
  tile_op->set_input_x(*input_node->data());
  tile_op->set_input_w(*times_node->data());
  return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(expand,
kNPU,
paddle::lite::subgraph::npu::ExpandConverter);
...@@ -38,6 +38,7 @@ USE_SUBGRAPH_BRIDGE(elementwise_add, kNPU); ...@@ -38,6 +38,7 @@ USE_SUBGRAPH_BRIDGE(elementwise_add, kNPU);
USE_SUBGRAPH_BRIDGE(elementwise_sub, kNPU); USE_SUBGRAPH_BRIDGE(elementwise_sub, kNPU);
USE_SUBGRAPH_BRIDGE(elementwise_mul, kNPU); USE_SUBGRAPH_BRIDGE(elementwise_mul, kNPU);
USE_SUBGRAPH_BRIDGE(elementwise_div, kNPU); USE_SUBGRAPH_BRIDGE(elementwise_div, kNPU);
USE_SUBGRAPH_BRIDGE(expand, kNPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kNPU); USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kNPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_sub_activation, kNPU); USE_SUBGRAPH_BRIDGE(fusion_elementwise_sub_activation, kNPU);
USE_SUBGRAPH_BRIDGE(fusion_elementwise_mul_activation, kNPU); USE_SUBGRAPH_BRIDGE(fusion_elementwise_mul_activation, kNPU);
......
...@@ -84,7 +84,7 @@ class ExpandComputeTester : public arena::TestCase { ...@@ -84,7 +84,7 @@ class ExpandComputeTester : public arena::TestCase {
} }
}; };
void test_expand_3dim(Place place) { void test_expand_3dim(Place place, float abs_error) {
for (std::vector<int> expand_times : {std::vector<int>({2, 3, 1}), for (std::vector<int> expand_times : {std::vector<int>({2, 3, 1}),
std::vector<int>({2, 2, 2}), std::vector<int>({2, 2, 2}),
std::vector<int>({3, 1, 2})}) { std::vector<int>({3, 1, 2})}) {
...@@ -93,7 +93,7 @@ void test_expand_3dim(Place place) { ...@@ -93,7 +93,7 @@ void test_expand_3dim(Place place) {
for (int W : {4}) { for (int W : {4}) {
std::unique_ptr<arena::TestCase> tester(new ExpandComputeTester( std::unique_ptr<arena::TestCase> tester(new ExpandComputeTester(
place, "def", expand_times, DDim({C, H, W}))); place, "def", expand_times, DDim({C, H, W})));
arena::Arena arena(std::move(tester), place, 2e-5); arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision(); arena.TestPrecision();
} }
} }
...@@ -101,7 +101,7 @@ void test_expand_3dim(Place place) { ...@@ -101,7 +101,7 @@ void test_expand_3dim(Place place) {
} }
} }
void test_expand_4dim(Place place) { void test_expand_4dim(Place place, float abs_error) {
for (std::vector<int> expand_times : {std::vector<int>({2, 3, 1, 4}), for (std::vector<int> expand_times : {std::vector<int>({2, 3, 1, 4}),
std::vector<int>({2, 2, 2, 2}), std::vector<int>({2, 2, 2, 2}),
std::vector<int>({3, 1, 2, 1})}) { std::vector<int>({3, 1, 2, 1})}) {
...@@ -111,7 +111,7 @@ void test_expand_4dim(Place place) { ...@@ -111,7 +111,7 @@ void test_expand_4dim(Place place) {
for (int W : {4}) { for (int W : {4}) {
std::unique_ptr<arena::TestCase> tester(new ExpandComputeTester( std::unique_ptr<arena::TestCase> tester(new ExpandComputeTester(
place, "def", expand_times, DDim({N, C, H, W}))); place, "def", expand_times, DDim({N, C, H, W})));
arena::Arena arena(std::move(tester), place, 2e-5); arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision(); arena.TestPrecision();
} }
} }
...@@ -121,14 +121,19 @@ void test_expand_4dim(Place place) { ...@@ -121,14 +121,19 @@ void test_expand_4dim(Place place) {
} }
TEST(Expand, precision) { TEST(Expand, precision) {
#ifdef LITE_WITH_X86 float abs_error = 1e-5;
Place place(TARGET(kX86)); Place place;
#endif #if defined(LITE_WITH_NPU)
#ifdef LITE_WITH_ARM place = TARGET(kNPU);
Place place(TARGET(kARM)); abs_error = 1e-2; // Using fp16 in NPU
test_expand_3dim(place); #elif defined(LITE_WITH_ARM)
test_expand_4dim(place); place = TARGET(kHost);
#else
return;
#endif #endif
test_expand_3dim(place, abs_error);
test_expand_4dim(place, abs_error);
} }
} // namespace lite } // namespace lite
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册