[NPU] add host kernels, enhance reshape ut (#2733)

test=develop

[NPU] add host kernels, enhance reshape ut (#2733)
test=develop
8fef7532 · zhupengyang · GitHub · 2ad0e84a · 8fef7532 · 8fef7532
4 changed files
--- a/lite/kernels/npu/bridges/engine.cc
+++ b/lite/kernels/npu/bridges/engine.cc
@@ -57,9 +57,11 @@ int Engine::BuildOriginProgram() {
      VLOG(3) << "The attr '" << kKernelTypeAttr
              << "' not found, pick the first kernel for " << op_type;
 #if defined(LITE_WITH_ARM)
-      auto kernels = op->CreateKernels({Place{TARGET(kARM)}});
+      auto kernels =
+          op->CreateKernels({Place{TARGET(kARM)}, Place{TARGET(kHost)}});
 #elif defined(LITE_WITH_X86)
-      auto kernels = op->CreateKernels({Place{TARGET(kX86)}});
+      auto kernels =
+          op->CreateKernels({Place{TARGET(kX86)}, Place{TARGET(kHost)}});
 #endif
      CHECK_GT(kernels.size(), 0) << "No kernels found for " << op_type;
      picked_kernel = std::move(kernels.front());

--- a/lite/kernels/npu/bridges/reshape_op.cc
+++ b/lite/kernels/npu/bridges/reshape_op.cc
@@ -34,14 +34,11 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
  // Get input and output vars and op attributes
  auto x_name = op_info->Input("X").front();
  auto x_type = kernel->GetInputDeclType("X");
-  CHECK(x_type->precision() == PRECISION(kFloat));
-  CHECK(x_type->layout() == DATALAYOUT(kNCHW));
  auto x = scope->FindMutableTensor(x_name);
  auto x_dims = x->dims();
+
  auto out_name = op_info->Output("Out").front();
  auto out_type = kernel->GetOutputDeclType("Out");
-  CHECK(out_type->precision() == PRECISION(kFloat));
-  CHECK(out_type->layout() == DATALAYOUT(kNCHW));

  // X node
  std::shared_ptr<Node> x_node = nullptr;
@@ -81,6 +78,7 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
        LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, "
                        "but Shape has "
                     << out_shape.size();
+        return FAILED;
      }
      actual_shape_node =
          graph->Add(actual_shape_name,
@@ -95,34 +93,12 @@ int ReshapeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
      LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, "
                      "but shape has "
                   << out_shape.size();
+      return FAILED;
    }
    reshape_op->set_attr_shape(
        ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end()));
  }

-  // XShape node
-  if (op_type == "reshape2") {
-    // Append an extra reshape node to calc XShape
-    std::vector<int64_t> xshape_dims(x_dims.size() + 1, 1);
-    for (size_t i = 0; i < x_dims.size(); i++) {
-      xshape_dims[i + 1] = x_dims[i];
-    }
-    if (xshape_dims.size() > 4) {
-      LOG(WARNING) << "[NPU] HiAI DDK only supports less than 4 dimensions, "
-                      "but XShape has "
-                   << xshape_dims.size();
-      return FAILED;
-    }
-    auto xshape_name = op_info->Output("XShape").front();
-    // auto xshape_type = kernel->GetOutputDeclType("XShape");
-    // CHECK(xshape_type->precision() == PRECISION(kFloat));
-    // CHECK(xshape_type->layout() == DATALAYOUT(kNCHW));
-    auto xshape_node = graph->Add<ge::op::Reshape>(xshape_name);
-    auto xshape_op = xshape_node->data<ge::op::Reshape>();
-    xshape_op->set_input_tensor(*x_node->data());
-    xshape_op->set_attr_shape(
-        ge::AttrValue::LIST_INT(xshape_dims.begin(), xshape_dims.end()));
-  }
  return REBUILD_WHEN_SHAPE_CHANGED;
 }


--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
@@ -26,7 +26,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
    #lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${npu_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})

--- a/lite/tests/kernels/reshape_compute_test.cc
+++ b/lite/tests/kernels/reshape_compute_test.cc
@@ -16,6 +16,7 @@
 #include "lite/api/paddle_use_kernels.h"
 #include "lite/api/paddle_use_ops.h"
 #include "lite/core/arena/framework.h"
+#include "lite/tests/utils/fill_data.h"

 namespace paddle {
 namespace lite {
@@ -29,19 +30,19 @@ class ReshapeComputeTester : public arena::TestCase {
  std::string xshape_ = "xshape";
  std::vector<std::string> shape_tensor_vct_;
  std::string shape_tensor_;
-  DDim x_dims_;
+  DDim dims_;
  std::vector<int> shape_;
  bool inplace_ = false;

 public:
  ReshapeComputeTester(const Place& place,
                       const std::string& alias,
-                       DDim x_dims,
+                       DDim dims,
                       std::vector<int> shape,
                       bool is_shape_tensor_vct = false,
                       bool is_shape_tensor = false,
                       bool is_shape = true)
-      : TestCase(place, alias), x_dims_(x_dims) {
+      : TestCase(place, alias), dims_(dims) {
    if (is_shape_tensor_vct) {
      for (size_t i = 0; i < shape.size(); i++) {
        shape_tensor_vct_.emplace_back(op_type_ + "/shape" + std::to_string(i));
@@ -60,7 +61,6 @@ class ReshapeComputeTester : public arena::TestCase {
    CHECK(out);

    auto* x = scope->FindTensor(input_);
-    auto x_dims = x->dims();

    std::vector<int> out_shape;
    if (shape_tensor_vct_.size() > 0) {
@@ -86,8 +86,8 @@ class ReshapeComputeTester : public arena::TestCase {
        CHECK_EQ(unk_dim_idx, -1);
        unk_dim_idx = i;
      } else if (out_shape[i] == 0) {
-        CHECK_LE(i, x_dims.size());
-        final_out_shape[i] = x_dims[i];
+        CHECK_LE(i, dims_.size());
+        final_out_shape[i] = dims_[i];
      } else if (out_shape[i] > 0) {
        final_out_shape[i] = out_shape[i];
      } else {
@@ -97,18 +97,18 @@ class ReshapeComputeTester : public arena::TestCase {
    }

    if (unk_dim_idx > -1) {
-      final_out_shape[unk_dim_idx] = x_dims.production() / cap;
+      final_out_shape[unk_dim_idx] = dims_.production() / cap;
    }

    out->Resize(final_out_shape);

    auto x_data = x->data<float>();
    auto out_data = out->mutable_data<float>();
-    memcpy(out_data, x_data, sizeof(float) * x_dims.production());
+    memcpy(out_data, x_data, sizeof(float) * dims_.production());

    if (op_type_ == "reshape2") {
      auto* xshape = scope->NewTensor(xshape_);
-      auto xshape_dims = x_dims.Vectorize();
+      auto xshape_dims = dims_.Vectorize();
      xshape_dims.insert(xshape_dims.begin(), 0);
      xshape->Resize(xshape_dims);
    }
@@ -134,11 +134,9 @@ class ReshapeComputeTester : public arena::TestCase {
  }

  void PrepareData() override {
-    std::vector<float> data(x_dims_.production());
-    for (int i = 0; i < x_dims_.production(); i++) {
-      data[i] = i * 1.1;
-    }
-    SetCommonTensor(input_, x_dims_, data.data());
+    std::vector<float> din(dims_.production());
+    fill_data_rand(din.data(), -1.f, 1.f, dims_.production());
+    SetCommonTensor(input_, dims_, din.data());

    if (shape_tensor_vct_.size() > 0) {
      for (size_t i = 0; i < shape_.size(); i++) {
@@ -161,13 +159,16 @@ TEST(Reshape, precision) {
  LOG(INFO) << "test Reshape op";
  float abs_error = 2e-5;
  Place place;
-#ifdef LITE_WITH_XPU
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_XPU)
  place = TARGET(kXPU);
 #else
  return;
 #endif

-  DDim x_dims{{2, 3, 4, 5}};
+  DDim dims{{2, 3, 4, 5}};
  std::vector<std::vector<int>> shapes{{5, 4, 3, 2},
                                       {2, 3, 20},
                                       {2, 60},
@@ -176,8 +177,11 @@ TEST(Reshape, precision) {
                                       {0, 0, 20},
                                       {0, 0, -1}};
  for (auto shape : shapes) {
+#ifdef LITE_WITH_NPU
+    if (dims.size() > 4 || shape.size() > 4) continue;
+#endif
    std::unique_ptr<arena::TestCase> tester(
-        new ReshapeComputeTester(place, "def", x_dims, shape));
+        new ReshapeComputeTester(place, "def", dims, shape));
    arena::Arena arena(std::move(tester), place, abs_error);
    arena.TestPrecision({"xshape"});
  }