[LITE][NPU] Upgrade HiAI DDK from 300 to 310 (#2423)

c1837d76 · hong19860320 · GitHub · 94731268 · c1837d76 · c1837d76
5 changed files
--- a/lite/backends/npu/builder.h
+++ b/lite/backends/npu/builder.h
@@ -31,117 +31,6 @@
 // Extended Ops of HIAI DDK
 namespace ge {
-/**
- * Multiply the matrix x1 by the matrix x2 to generate x1 * x2.
- * The inputs must be two-dimensional matrices and the inner dimension of "x1"
- * (after being transposed if transpose_x1 is true) must match the outer
- * dimension of "x2" (after being transposed if transposed_x2 is true). <Input>
- *      x : the first input tensor, must be non const op.
- *      w : the second input tensor, must be const op.
- *      bias: the optional bias tensor, must be const op.
- * <Output>
- *      y : the output tensor.
- * <Attr>
- *      has_bias: If true, enable input bias.
- */
-REG_OP(MatMul)
-    .INPUT(x, TensorType({DT_FLOAT}))
-    .INPUT(w, TensorType({DT_FLOAT}))
-    .OPTIONAL_INPUT(bias, TensorType({DT_FLOAT}))  // bias must be const input
-    .OUTPUT(y, TensorType({DT_FLOAT}))
-    .ATTR(has_bias, AttrValue::BOOL{false})  // when has input::bias,set true
-    .OP_END();
-/**
- * Computes the gradients of convolution with respect to the input.
- * <Input>
- *      input_sizes : An integer vector representing the shape of input,
- * where input is a 4-D [batch, height, width, channels] tensor.
- *      filter : the filter tensor, with shape [H , W, filter_channel,
- * filter_number], filter_channel must be same as x channel.
- *      x :  The input tensor.
- * <Output>
- *      y : The output tensor.
- * <Attr>
- *      format: 0: NCHW. 1: NHWC
- *      group : 1: default
- *      num_output : 0: default, num_output must be equal to
- * (filter_channel * group)
- *      pad : Padding for the beginning and ending along each axis
- *      stride : Stride along each axis.
- *      dilation : dilation value along each axis of the filter.
- *      pad_mode : 0:NOTSET, 5:VALID 6:SAME. defaul value is 0:NOTSET
- *      bias_term : 0: default
- *      kernel : The shape of the convolution kernel
- */
-REG_OP(Deconvolution)
-    .INPUT(input_sizes, TensorType({DT_UINT8}))
-    .INPUT(filter, TensorType({DT_FLOAT}))
-    .INPUT(x, TensorType({DT_FLOAT}))
-    .OPTIONAL_INPUT(b, TensorType({DT_FLOAT}))
-    .OUTPUT(y, TensorType({DT_FLOAT}))
-    .ATTR(mode, AttrValue::INT{1})
-    .ATTR(format, AttrValue::INT{1})
-    .ATTR(group, AttrValue::INT{1})
-    .ATTR(num_output, AttrValue::INT{0})
-    .ATTR(pad, AttrValue::LIST_INT({0, 0, 0, 0}))
-    .ATTR(stride, AttrValue::LIST_INT({1, 1}))
-    .ATTR(dilation, AttrValue::LIST_INT({1, 1}))
-    .ATTR(pad_mode, AttrValue::INT{0})
-    .ATTR(bias_term, AttrValue::INT{0})
-    .ATTR(kernel, AttrValue::LIST_INT({0, 0}))
-    .OP_END();
-/**
- * Resize images to size using bilinear interpolation.
- * <Input>
- *      x : The tensor of 4-D
- *      w : A int32 Tensor of 2 elements: [height, width].
- * <Output>
- *      y : the output tensor
- * <Attr>
- *      align_corners : If true, the centers of the 4 corner pixels of the
- * input and output tensors are aligned, preserving the values at the corner
- * pixels.
- *      output_dim_mode : Defaults 2, including 0: zoom_factor , 1:
- * shrink_factor, 2: height/width. when output_dim_mode=2, the output-dim is
- * controled by the [height, width] of w.
- *      shrink_factor : shrink factor.
- *      zoom_factor : zoom factor.
- *      pad_begin : begin of pad.
- *      pad_end : end of pad.
- */
-REG_OP(ResizeBilinear)
-    .INPUT(x, TensorType({DT_FLOAT, DT_INT32}))
-    .INPUT(w, TensorType({DT_FLOAT, DT_INT32}))
-    .OUTPUT(y, TensorType({DT_FLOAT, DT_INT32}))
-    .ATTR(align_corners, AttrValue::BOOL{false})
-    .ATTR(output_dim_mode, AttrValue::INT{2})
-    .ATTR(shrink_factor, AttrValue::INT{1})
-    .ATTR(zoom_factor, AttrValue::INT{1})
-    .ATTR(pad_begin, AttrValue::INT{0})
-    .ATTR(pad_end, AttrValue::INT{0})
-    .OP_END();
-/**
- * Resize images to size using nearest neighbor interpolation.
- * <Input>
- *      image : Resize images to size using nearest neighbor interpolation.
- *      size : Must be one dimension and two  elements
- * <Output>
- *      output : the output tensor
- * <Attr>
- *      align_corners : If true, the centers of the 4 corner pixels of the
- * input and output tensors are aligned, preserving the values at the corner
- * pixels. Defaults to false
- */
-REG_OP(ResizeNearestNeighbor)
-    .INPUT(image, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL}))
-    .INPUT(size, TensorType({DT_INT32}))
-    .OUTPUT(output, TensorType({DT_FLOAT, DT_INT32, DT_UINT8, DT_BOOL}))
-    .ATTR(align_corners, AttrValue::BOOL{false})
-    .OP_END();
 /**
 * Pads a tensor.
 * <Input>

--- a/lite/kernels/npu/bridges/conv_transpose_op.cc
+++ b/lite/kernels/npu/bridges/conv_transpose_op.cc
@@ -82,7 +82,6 @@ node_map_type ConvTransposeConverter(
  lite::npu::OpList::Global().add(inputs_map.at(input_var_name));
  // set attributes
-  conv_transpose_node->set_attr_mode(1);
  conv_transpose_node->set_attr_format(0);    // NCHW
  conv_transpose_node->set_attr_pad_mode(0);  // NOTSET
  conv_transpose_node->set_attr_group(groups);

--- a/lite/kernels/npu/bridges/interpolate_op.cc
+++ b/lite/kernels/npu/bridges/interpolate_op.cc
@@ -45,6 +45,7 @@ node_map_type InterpolateConverter(
  auto out_h = op_info->GetAttr<int>("out_h");
  auto align_corners = op_info->GetAttr<bool>("align_corners");
  int align_mode = op_info->GetAttr<int>("align_mode");
+  auto interp_method = op_info->GetAttr<std::string>("interp_method");
  CHECK(!(align_mode == 0 && !align_corners)) << "[NPU] align_mode = 0 && "
                                                 "align_corners = false isn't "
                                                 "supported in HiAI DDK";
@@ -58,11 +59,11 @@ node_map_type InterpolateConverter(
  }
  // update out_h and out_w if has OutSize
-  bool inputs_map_has_w = false;
+  std::shared_ptr<ge::Operator> out_size_node = nullptr;
  if (lite::npu::HasInputArg(op_info, scope, "OutSize")) {
    auto out_size_var_name = op_info->Input("OutSize").front();
    if (inputs_map.count(out_size_var_name)) {
-      inputs_map_has_w = true;
+      out_size_node = inputs_map.at(out_size_var_name);
    } else {
      auto out_size =
          scope->FindVar(out_size_var_name)->GetMutable<lite::Tensor>();
@@ -73,58 +74,45 @@ node_map_type InterpolateConverter(
      out_w = out_size_data[1];
    }
  }
+  if (out_size_node == nullptr) {
-  node_map_type outputs_map;
-  auto interp_method = op_info->GetAttr<std::string>("interp_method");
    if (interp_method == "bilinear") {
-    auto interp_node = std::make_shared<ge::op::ResizeBilinear>(unique_op_type);
-    lite::npu::OpList::Global().add(interp_node);
-    interp_node->set_input_x(*inputs_map.at(x_var_name));
-    if (inputs_map_has_w) {
-      auto out_size_var_name = op_info->Input("OutSize").front();
-      interp_node->set_input_w(*inputs_map.at(out_size_var_name));
-      lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name));
-    } else {
      const float largest_multiple = 7.0f;
      float multiple = static_cast<float>(x_h * x_w) / (out_h * out_w);
      CHECK_LT(multiple, largest_multiple)
          << "[NPU] multiple=(ih*iw)/(oh*ow)=" << multiple
          << " is too large, should not exceed " << largest_multiple
          << " in HiAI DDK";
-      auto w_const_node =
+    }
-          std::make_shared<ge::op::Const>(unique_op_type + "/w");
+    auto out_size_const_node =
-      w_const_node->set_attr_value(
+        std::make_shared<ge::op::Const>(unique_op_type + "/out_size");
+    out_size_const_node->set_attr_value(
        lite::npu::CreateTensorAndFillData(std::vector<int>({out_h, out_w})));
-      interp_node->set_input_w(*w_const_node);
+    out_size_node = out_size_const_node;
-      lite::npu::OpList::Global().add(w_const_node);
  }
-    interp_node->set_attr_output_dim_mode(
+  lite::npu::OpList::Global().add(out_size_node);
-        2);  // 0: zoom_factor, 1: shrink_factor, 2: height/width
-    interp_node->set_attr_align_corners(align_corners);
+  std::shared_ptr<ge::Operator> interp_node = nullptr;
-    outputs_map[op_info->Output("Out").front()] = interp_node;
+  if (interp_method == "bilinear") {
+    auto bilinear_interp_node =
+        std::make_shared<ge::op::ResizeBilinear>(unique_op_type);
+    bilinear_interp_node->set_input_x(*inputs_map.at(x_var_name));
+    bilinear_interp_node->set_input_size(*out_size_node);
+    bilinear_interp_node->set_attr_align_corners(align_corners);
+    interp_node = bilinear_interp_node;
  } else if (interp_method == "nearest") {
-    auto interp_node =
+    auto nearest_interp_node =
        std::make_shared<ge::op::ResizeNearestNeighbor>(unique_op_type);
-    lite::npu::OpList::Global().add(interp_node);
+    nearest_interp_node->set_input_image(*inputs_map.at(x_var_name));
-    interp_node->set_input_image(*inputs_map.at(x_var_name));
+    nearest_interp_node->set_input_size(*out_size_node);
-    if (inputs_map_has_w) {
+    nearest_interp_node->set_attr_align_corners(align_corners);
-      auto out_size_var_name = op_info->Input("OutSize").front();
+    interp_node = nearest_interp_node;
-      interp_node->set_input_size(*inputs_map.at(out_size_var_name));
-      lite::npu::OpList::Global().add(inputs_map.at(out_size_var_name));
-    } else {
-      auto w_const_node =
-          std::make_shared<ge::op::Const>(unique_op_type + "/w");
-      w_const_node->set_attr_value(
-          lite::npu::CreateTensorAndFillData(std::vector<int>({out_h, out_w})));
-      interp_node->set_input_size(*w_const_node);
-      lite::npu::OpList::Global().add(w_const_node);
-    }
-    interp_node->set_attr_align_corners(align_corners);
-    outputs_map[op_info->Output("Out").front()] = interp_node;
  } else {
    LOG(FATAL) << "[NPU] Unsupported interpolate method: " << interp_method;
  }
+  lite::npu::OpList::Global().add(interp_node);
+  node_map_type outputs_map;
+  outputs_map[op_info->Output("Out").front()] = interp_node;
  return outputs_map;
 }

--- a/lite/kernels/npu/bridges/mul_op.cc
+++ b/lite/kernels/npu/bridges/mul_op.cc
@@ -31,82 +31,67 @@ node_map_type MulConverter(const std::shared_ptr<lite::OpLite> mul_op,
  auto unique_op_type = lite::npu::UniqueName(op_type);
  LOG(INFO) << "[NPU] Converting " + op_type + "...";
-  auto output_node = std::make_shared<ge::op::MatMul>(unique_op_type);
  auto x_var_name = op_info->Input("X").front();
  auto y_var_name = op_info->Input("Y").front();
+  auto x = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  auto y = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
  int x_num_col_dims = op_info->GetAttr<int>("x_num_col_dims");
  int y_num_col_dims = op_info->GetAttr<int>("y_num_col_dims");
-  auto* xtensor = scope->FindVar(x_var_name)->GetMutable<lite::Tensor>();
+  int m = x_dims.Slice(0, x_num_col_dims).production();
-  auto* ytensor = scope->FindVar(y_var_name)->GetMutable<lite::Tensor>();
+  int k = x_dims.Slice(x_num_col_dims, x_dims.size()).production();
+  CHECK_EQ(k, y_dims.Slice(0, y_num_col_dims).production())
-  int m = xtensor->dims().Slice(0, x_num_col_dims).production();
+      << "[NPU] columns of X must be equal with rows of Y";
-  int x_w = xtensor->dims()
+  int n = y_dims.Slice(y_num_col_dims, y_dims.size()).production();
-                .Slice(x_num_col_dims, xtensor->dims().size())
-                .production();
-  int y_h = ytensor->dims().Slice(0, y_num_col_dims).production();
-  int n = ytensor->dims()
-              .Slice(y_num_col_dims, ytensor->dims().size())
-              .production();
-  CHECK_EQ(x_w, y_h) << "[NPU] x_w must be equal with y_h";
-  int k = x_w;
  LOG(INFO) << "m:" << m << ",n:" << n << ",k:" << k;
  LOG(INFO) << "x_var_name:" << x_var_name
            << ", is data: " << inputs_map.count(x_var_name);
  LOG(INFO) << "y_var_name:" << y_var_name
            << ", is data: " << inputs_map.count(y_var_name);
  CHECK(inputs_map.count(x_var_name))
-      << "[NPU] MatMul only support X is data, Y is const yet";
+      << "[NPU] MatMul in HiAI DDK only support X is data, Y is const yet.";
+  auto mul_node = std::make_shared<ge::op::MatMul>(unique_op_type);
+  // add input x node which supports persistable and non-persistable tensor, and
+  // reshape to (m, k)
  if (inputs_map.count(x_var_name)) {
-    auto xsrc = inputs_map.at(x_var_name);
+    auto reshaped_x_node =
-    auto reshapex = std::make_shared<ge::op::Reshape>(x_var_name + "_reshape");
+        std::make_shared<ge::op::Reshape>(x_var_name + "_reshape");
-    reshapex->set_input_tensor(*xsrc);
+    reshaped_x_node->set_input_tensor(*inputs_map.at(x_var_name));
-    reshapex->set_attr_shape({m, k});
+    reshaped_x_node->set_attr_shape({m, k});
-    reshapex->set_attr_axis(0);
+    reshaped_x_node->set_attr_axis(0);
-    lite::npu::OpList::Global().add(xsrc);
+    mul_node->set_input_x1(*reshaped_x_node);
-    lite::npu::OpList::Global().add(reshapex);
+    lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
-    output_node->set_input_x(*reshapex);
+    lite::npu::OpList::Global().add(reshaped_x_node);
  } else {
-    auto constx = std::make_shared<ge::op::Const>(x_var_name);
+    auto x_const_node = std::make_shared<ge::op::Const>(x_var_name);
-    ge::TensorDesc desc(ge::Shape({m, k}), ge::FORMAT_NCHW, ge::DT_FLOAT);
+    x_const_node->set_attr_value(lite::npu::CvtTensor(x, {m, k}));
-    auto size = desc.GetShape().GetShapeSize();
+    mul_node->set_input_x1(*x_const_node);
-    CHECK_EQ(size, xtensor->dims().production());
+    lite::npu::OpList::Global().add(x_const_node);
-    ge::TensorPtr ptensor = std::make_shared<ge::Tensor>();
-    ptensor->SetTensorDesc(desc);
-    auto* pdata = reinterpret_cast<uint8_t*>(xtensor->mutable_data<float>());
-    ptensor->SetData(pdata, size * sizeof(float));
-    constx->set_attr_value(ptensor);
-    lite::npu::OpList::Global().add(constx);
-    output_node->set_input_x(*constx);
  }
+  // add input y node which only supports persistable tensor, and reshape to (k,
+  // n)
  if (inputs_map.count(y_var_name)) {
-    auto ysrc = inputs_map.at(y_var_name);
+    auto reshaped_y_node =
-    auto reshapey = std::make_shared<ge::op::Reshape>(y_var_name + "_reshape");
+        std::make_shared<ge::op::Reshape>(y_var_name + "_reshape");
-    reshapey->set_input_tensor(*ysrc);
+    reshaped_y_node->set_input_tensor(*inputs_map.at(y_var_name));
-    reshapey->set_attr_shape({k, n});
+    reshaped_y_node->set_attr_shape({k, n});
-    reshapey->set_attr_axis(0);
+    reshaped_y_node->set_attr_axis(0);
-    lite::npu::OpList::Global().add(ysrc);
+    mul_node->set_input_x2(*reshaped_y_node);
-    lite::npu::OpList::Global().add(reshapey);
+    lite::npu::OpList::Global().add(inputs_map.at(y_var_name));
-    output_node->set_input_w(*reshapey);
+    lite::npu::OpList::Global().add(reshaped_y_node);
  } else {
-    auto consty = std::make_shared<ge::op::Const>(y_var_name);
+    auto y_const_node = std::make_shared<ge::op::Const>(y_var_name);
-    ge::TensorDesc desc(ge::Shape({k, n}), ge::FORMAT_NCHW, ge::DT_FLOAT);
+    y_const_node->set_attr_value(lite::npu::CvtTensor(y, {k, n}));
-    auto size = desc.GetShape().GetShapeSize();
+    mul_node->set_input_x2(*y_const_node);
-    CHECK_EQ(size, ytensor->dims().production());
+    lite::npu::OpList::Global().add(y_const_node);
-    ge::TensorPtr ptensor = std::make_shared<ge::Tensor>();
-    ptensor->SetTensorDesc(desc);
-    auto* pdata = reinterpret_cast<uint8_t*>(ytensor->mutable_data<float>());
-    ptensor->SetData(pdata, size * sizeof(float));
-    consty->set_attr_value(ptensor);
-    lite::npu::OpList::Global().add(consty);
-    output_node->set_input_w(*consty);
  }
-  lite::npu::OpList::Global().add(output_node);
+  lite::npu::OpList::Global().add(mul_node);
  node_map_type outputs_map;
-  outputs_map[op_info->Output("Out").front()] = output_node;
+  outputs_map[op_info->Output("Out").front()] = mul_node;
  return outputs_map;
 }

--- a/lite/tools/build_npu.sh
+++ b/lite/tools/build_npu.sh
@@ -5,8 +5,8 @@ set -ex
 ARM_OS="android"                    # android only yet
 ARM_ABI="armv8"                     # armv8, armv7
 ARM_LANG="gcc"                      # gcc only yet
-ANDROID_STL="c++_static"            # c++_shared, c++_static
+ANDROID_STL="c++_shared"            # c++_shared/c++_static, c++_shared is used by HiAI DDK 310
-DDK_ROOT="$(pwd)/ai_ddk_lib/"       # HIAI SDK from https://developer.huawei.com/consumer/cn/hiai/
+DDK_ROOT="$(pwd)/ai_ddk_lib/"       # HiAI DDK 310 from https://developer.huawei.com/consumer/cn/hiai/
 TARGET_NAME="test_npu_pass"         # default target
 BUILD_EXTRA=OFF                     # ON(with sequence ops)/OFF
 WITH_JAVA=ON                        # ON(build jar and jni so)/OFF