Unverified commit 64eaaeca, authored by Qi Li, committed by GitHub

[Ascend] add layer_norm, matmul, cast, scale, slice, gather op for ernie, test=develop (#4126)

* add new ops for ernie, test=develop

* drop lookuptable, test=develop

* update cast compute, test=develop

* add slice gather op, test=develop
Parent d14e57f7
@@ -20,6 +20,12 @@ lite_cc_library(subgraph_bridge_fc_op_huawei_ascend_npu SRCS fc_op.cc DEPS ${hua
lite_cc_library(subgraph_bridge_reshape_op_huawei_ascend_npu SRCS reshape_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_transpose_op_huawei_ascend_npu SRCS transpose_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_flatten_op_huawei_ascend_npu SRCS flatten_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_layer_norm_op_huawei_ascend_npu SRCS layer_norm_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_matmul_op_huawei_ascend_npu SRCS matmul_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_cast_op_huawei_ascend_npu SRCS cast_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_scale_op_huawei_ascend_npu SRCS scale_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_slice_op_huawei_ascend_npu SRCS slice_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_gather_op_huawei_ascend_npu SRCS gather_op.cc DEPS ${huawei_ascend_npu_subgraph_bridge_deps})
set(huawei_ascend_npu_subgraph_bridges
subgraph_bridge_registry
@@ -38,4 +44,10 @@ set(huawei_ascend_npu_subgraph_bridges
subgraph_bridge_reshape_op_huawei_ascend_npu
subgraph_bridge_transpose_op_huawei_ascend_npu
subgraph_bridge_flatten_op_huawei_ascend_npu
subgraph_bridge_layer_norm_op_huawei_ascend_npu
subgraph_bridge_matmul_op_huawei_ascend_npu
subgraph_bridge_cast_op_huawei_ascend_npu
subgraph_bridge_scale_op_huawei_ascend_npu
subgraph_bridge_slice_op_huawei_ascend_npu
subgraph_bridge_gather_op_huawei_ascend_npu
CACHE INTERNAL "huawei_ascend_npu_subgraph_bridges")
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace huawei_ascend_npu {
int CastConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "...";
// Get input, output and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
// BOOL = 0;INT16 = 1;INT32 = 2;INT64 = 3;FP16 = 4;FP32 = 5;FP64 = 6;
// SIZE_T = 19;UINT8 = 20;INT8 = 21;
// auto in_dtype = op_info->GetAttr<int>("in_dtype");
auto out_dtype = op_info->GetAttr<int>("out_dtype");
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
PrecisionType ptype = PRECISION(kFloat);
ge::DataType otype = ge::DT_FLOAT;
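// Map the Paddle dtype code in out_dtype to a Lite precision and a GE data
// type; any code not handled below (e.g. FP64, SIZE_T, UINT8) hits LOG(FATAL).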
switch (out_dtype) {
case 0: // BOOL = 0;
ptype = PRECISION(kBool);
otype = ge::DT_BOOL;
break;
case 1: // INT16 = 1
ptype = PRECISION(kInt16);
otype = ge::DT_INT16;
break;
case 2: // INT32 = 2
ptype = PRECISION(kInt32);
otype = ge::DT_INT32;
break;
case 3: // INT64 = 3
ptype = PRECISION(kInt64);
otype = ge::DT_INT64;
break;
case 4: // FP16 = 4
ptype = PRECISION(kFP16);
otype = ge::DT_FLOAT16;
break;
case 5: // FP32 = 5
ptype = PRECISION(kFloat);
otype = ge::DT_FLOAT;
break;
case 21: // INT8 = 21
ptype = PRECISION(kInt8);
otype = ge::DT_INT8;
break;
default:
LOG(FATAL) << "unsupported data type: " << out_dtype;
break;
}
// Cast node
auto cast_node = graph->Add<ge::op::Cast>(out_name, ptype);
auto cast_op = cast_node->data<ge::op::Cast>();
cast_op->set_input_x(*x_node->data());
cast_op->set_attr_dst_type(otype);
INPUT_UPDATE(cast_op, x, x_node);
OUTPUT_UPDATE(cast_op, y, cast_node);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace huawei_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
cast,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::CastConverter);
@@ -138,7 +138,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
- auto shape_node = graph->Add<int64_t>(x_name + "/shape", x_new_shape);
+ auto shape_node = graph->Add<int64_t>(x_name + "/x_shape", x_new_shape);
auto reshaped_x_node = graph->Add<ge::op::Reshape>(x_name + "/reshape");
auto reshaped_x_op = reshaped_x_node->data<ge::op::Reshape>();
reshaped_x_op->set_input_x(*x_node->data());
@@ -156,7 +156,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
std::shared_ptr<Node> y_node = nullptr;
if (graph->Has(y_name)) {
y_node = graph->Get(y_name);
- auto shape_node = graph->Add<int64_t>(y_name + "/shape", y_new_shape);
+ auto shape_node = graph->Add<int64_t>(y_name + "/y_shape", y_new_shape);
auto reshaped_y_node = graph->Add<ge::op::Reshape>(y_name + "/reshape");
auto reshaped_y_op = reshaped_y_node->data<ge::op::Reshape>();
reshaped_y_op->set_input_x(*y_node->data());
@@ -224,7 +224,7 @@ int ElementwiseConverter(void* ctx, OpLite* op, KernelBase* kernel) {
auto out_shape = out_dims.Vectorize();
if (out_shape != x_new_shape) {
- auto shape_node = graph->Add<int64_t>(out_name + "/shape", out_shape);
+ auto shape_node = graph->Add<int64_t>(out_name + "/out_shape", out_shape);
auto reshaped_elt_node = graph->Add<ge::op::Reshape>(out_name);
auto reshaped_elt_op = reshaped_elt_node->data<ge::op::Reshape>();
reshaped_elt_op->set_input_x(*elt_node->data());
...
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace huawei_ascend_npu {
int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "...";
// Get input, output and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindTensor(x_name);
auto index_name = op_info->Input("Index").front();
auto index = scope->FindTensor(index_name);
auto index_dims = index->dims();
CHECK(index_dims.size() == 1 ||
(index_dims.size() == 2 && index_dims[1] == 1))
<< "index dims unmatch";
auto out_name = op_info->Output("Out").front();
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Index node
std::shared_ptr<Node> index_node = nullptr;
if (graph->Has(index_name)) {
index_node = graph->Get(index_name);
} else {
index_node = graph->Add(index_name, *index);
}
// Gather node
auto gather_node = graph->Add<ge::op::Gather>(out_name);
auto gather_op = gather_node->data<ge::op::Gather>();
gather_op->set_input_x(*x_node->data());
gather_op->set_input_indices(*index_node->data());
INPUT_UPDATE(gather_op, x, x_node);
INPUT_UPDATE(gather_op, indices, index_node);
OUTPUT_UPDATE(gather_op, y, gather_node);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace huawei_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
gather,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::GatherConverter);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace huawei_ascend_npu {
int LayerNormConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "...";
// Get input and output vars
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto x_rank = static_cast<int>(x_dims.size());
CHECK(x_rank >= 2 && x_rank <= 4);
bool has_bias = op_info->HasInput("Bias");
bool has_scale = op_info->HasInput("Scale");
auto y_name = op_info->Output("Y").front();
auto y = scope->FindMutableTensor(y_name);
auto y_dims = y->dims();
auto mean_name = op_info->Output("Mean").front();
auto mean = scope->FindMutableTensor(mean_name);
auto mean_dims = mean->dims();
CHECK_EQ(mean_dims.size(), 1);
auto var_name = op_info->Output("Variance").front();
auto var = scope->FindMutableTensor(var_name);
auto var_dims = var->dims();
CHECK_EQ(var_dims.size(), 1);
// Get op attributes
auto epsilon = op_info->GetAttr<float>("epsilon");
auto begin_norm_axis = op_info->GetAttr<int>("begin_norm_axis");
if (begin_norm_axis < 0) {
begin_norm_axis += x_rank;
}
CHECK_GT(begin_norm_axis, 0);
CHECK_LT(begin_norm_axis, x_rank);
CHECK(begin_norm_axis >= 1 && begin_norm_axis < x_rank);
auto matrix_dim = x_dims.Flatten2D(begin_norm_axis);
int batch_size = matrix_dim[0];
int feature_size = matrix_dim[1];
CHECK_EQ(mean_dims.production(), batch_size);
CHECK_EQ(var_dims.production(), batch_size);
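// x is viewed as a [batch_size, feature_size] matrix split at
// begin_norm_axis; Mean and Variance hold one value per row.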
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Get shape of bias and scale
DDim scale_bias_dims = x_dims.Slice(begin_norm_axis, x_dims.size());
CHECK_EQ(scale_bias_dims.production(), feature_size);
// auto scale_bias_dims = DDim({x_dims[x_dims.size()-1]});
// Bias node
std::shared_ptr<Node> bias_node = nullptr;
if (has_bias) {
auto bias_name = op_info->Input("Bias").front();
auto bias = scope->FindMutableTensor(bias_name);
auto bias_dims = bias->dims();
CHECK_EQ(bias_dims.size(), 1);
CHECK_EQ(bias_dims.production(), feature_size);
bias_node = graph->Add(bias_name, *bias, scale_bias_dims);
} else {
bias_node = graph->Add<float>(y_name + "/bias", 0.f, scale_bias_dims);
}
// Scale node
std::shared_ptr<Node> scale_node = nullptr;
if (has_scale) {
auto scale_name = op_info->Input("Scale").front();
auto scale = scope->FindMutableTensor(scale_name);
auto scale_dims = scale->dims();
CHECK_EQ(scale_dims.size(), 1);
CHECK_EQ(scale_dims.production(), feature_size);
scale_node = graph->Add(scale_name, *scale, scale_bias_dims);
} else {
scale_node = graph->Add<float>(y_name + "/scale", 1.f, scale_bias_dims);
}
// LayerNorm node
auto layer_norm_node = graph->Add<ge::op::LayerNorm>(y_name + "/layer_norm");
auto layer_norm_op = layer_norm_node->data<ge::op::LayerNorm>();
layer_norm_op->set_input_x(*x_node->data());
layer_norm_op->set_input_gamma(*scale_node->data());
layer_norm_op->set_input_beta(*bias_node->data());
layer_norm_op->set_attr_begin_norm_axis(begin_norm_axis);
layer_norm_op->set_attr_begin_params_axis(begin_norm_axis);
layer_norm_op->set_attr_epsilon(epsilon);
INPUT_UPDATE(layer_norm_op, x, x_node);
INPUT_UPDATE(layer_norm_op, gamma, scale_node);
INPUT_UPDATE(layer_norm_op, beta, bias_node);
OUTPUT_UPDATE(layer_norm_op, y, layer_norm_node);
OUTPUT_UPDATE(layer_norm_op, mean, layer_norm_node);
OUTPUT_UPDATE(layer_norm_op, variance, layer_norm_node);
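// The GE LayerNorm op exposes three named outputs (y, mean, variance); an
// Identity node is attached to each so that they map back to the Paddle
// output variables Y, Mean and Variance.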
// Get output of Y
auto out_y_node = graph->Add<ge::op::Identity>(y_name);
auto out_y_op = out_y_node->data<ge::op::Identity>();
out_y_op->set_input_x(*layer_norm_node->data(), "y");
INPUT_UPDATE(out_y_op, x, layer_norm_node);
OUTPUT_UPDATE(out_y_op, y, out_y_node);
// Get output of Mean
auto out_mean_node = graph->Add<ge::op::Identity>(mean_name);
auto out_mean_op = out_mean_node->data<ge::op::Identity>();
out_mean_op->set_input_x(*layer_norm_node->data(), "mean");
INPUT_UPDATE(out_mean_op, x, layer_norm_node);
OUTPUT_UPDATE(out_mean_op, y, out_mean_node);
// Get output of Variance
auto out_var_node = graph->Add<ge::op::Identity>(var_name);
auto out_var_op = out_var_node->data<ge::op::Identity>();
out_var_op->set_input_x(*layer_norm_node->data(), "variance");
INPUT_UPDATE(out_var_op, x, layer_norm_node);
OUTPUT_UPDATE(out_var_op, y, out_var_node);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace huawei_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
layer_norm,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::LayerNormConverter);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace huawei_ascend_npu {
int MatMulConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "...";
// Get input and output vars and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindTensor(x_name);
auto x_dims = x->dims();
if (x_dims.size() < 2) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Input dims should be equal or large "
"than 2 in Huawei Ascend NPU DDK.";
return FAILED;
}
auto y_name = op_info->Input("Y").front();
auto y = scope->FindTensor(y_name);
auto y_dims = y->dims();
if (y_dims.size() < 2) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] Input dims should be equal or large "
"than 2 in Huawei Ascend NPU DDK.";
return FAILED;
}
if (x_dims.size() != y_dims.size()) {
LOG(WARNING) << "[HUAWEI_ASCEND_NPU] dims size of input x1 and x2 must be "
"same in Huawei Ascend NPU DDK.";
return FAILED;
}
auto out_name = op_info->Output("Out").front();
auto out = scope->FindTensor(out_name);
auto out_dims = out->dims();
bool transpose_x = op_info->GetAttr<bool>("transpose_X");
bool transpose_y = op_info->GetAttr<bool>("transpose_Y");
float alpha = op_info->GetAttr<float>("alpha");
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
std::shared_ptr<Node> y_node = nullptr;
if (graph->Has(y_name)) {
y_node = graph->Get(y_name);
} else {
y_node = graph->Add(y_name, *y);
}
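// Rank-2 inputs map to ge::op::MatMul with transpose attributes; higher
// ranks map to ge::op::BatchMatMul, where the transpose flags become the
// adjoint attributes adj_x1/adj_x2.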
// Matmul node
std::shared_ptr<Node> matmul_node = nullptr;
if (x_dims.size() == 2) {
matmul_node = graph->Add<ge::op::MatMul>(out_name);
auto matmul_op = matmul_node->data<ge::op::MatMul>();
matmul_op->set_input_x1(*x_node->data());
matmul_op->set_input_x2(*y_node->data());
matmul_op->set_attr_transpose_x1(transpose_x);
matmul_op->set_attr_transpose_x2(transpose_y);
INPUT_UPDATE(matmul_op, x1, x_node);
INPUT_UPDATE(matmul_op, x2, y_node);
OUTPUT_UPDATE(matmul_op, y, matmul_node);
} else {
matmul_node = graph->Add<ge::op::BatchMatMul>(out_name);
auto matmul_op = matmul_node->data<ge::op::BatchMatMul>();
matmul_op->set_input_x1(*x_node->data());
matmul_op->set_input_x2(*y_node->data());
matmul_op->set_attr_adj_x1(transpose_x);
matmul_op->set_attr_adj_x2(transpose_y);
INPUT_UPDATE(matmul_op, x1, x_node);
INPUT_UPDATE(matmul_op, x2, y_node);
OUTPUT_UPDATE(matmul_op, y, matmul_node);
}
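// When alpha != 1, a separate Muls node is appended to scale the matmul
// result by alpha.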
if (fabs(alpha - 1.f) > 1e-6f) {
auto scale_node = graph->Add<ge::op::Muls>(out_name);
auto scale_op = scale_node->data<ge::op::Muls>();
scale_op->set_input_x(*matmul_node->data());
scale_op->set_attr_value(alpha);
INPUT_UPDATE(scale_op, x, matmul_node);
OUTPUT_UPDATE(scale_op, y, scale_node);
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace huawei_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
matmul,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::MatMulConverter);
@@ -48,3 +48,9 @@ USE_SUBGRAPH_BRIDGE(transpose, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(transpose2, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(flatten, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(flatten2, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(layer_norm, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(matmul, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(cast, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(scale, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(slice, kHuaweiAscendNPU);
USE_SUBGRAPH_BRIDGE(gather, kHuaweiAscendNPU);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace huawei_ascend_npu {
int ScaleConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "...";
// Get input, output and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindMutableTensor(x_name);
auto x_dims = x->dims();
auto out_name = op_info->Output("Out").front();
float scale = op_info->GetAttr<float>("scale");
float bias = op_info->GetAttr<float>("bias");
bool bias_after_scale = op_info->GetAttr<bool>("bias_after_scale");
if (!bias_after_scale) {
bias *= scale;
}
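// Paddle's scale op computes y = scale * x + bias when bias_after_scale is
// true and y = scale * (x + bias) otherwise; folding bias *= scale reduces
// the second form to the first (e.g. scale=2, bias=3 -> y = 2 * x + 6).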
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Const node holding the scale value, broadcast to the shape of x
auto input_scale_node =
graph->Add<float>(out_name + "/scale", scale, x_dims.Vectorize());
// Scale node
auto scale_node = graph->Add<ge::op::Scale>(out_name);
auto scale_op = scale_node->data<ge::op::Scale>();
scale_op->set_input_x(*x_node->data());
scale_op->set_input_scale(*input_scale_node->data());
scale_op->set_attr_axis(0);
scale_op->set_attr_num_axes(-1);
scale_op->set_attr_scale_from_blob(true);
INPUT_UPDATE(scale_op, x, x_node);
INPUT_UPDATE(scale_op, scale, input_scale_node);
OUTPUT_UPDATE(scale_op, y, scale_node);
// Bias node (constant tensor filled with the bias value)
if (fabs(bias) > 1e-6f) {
auto bias_node = graph->Add(out_name + "/bias", bias, x_dims.Vectorize());
scale_op->set_input_bias(*bias_node->data());
INPUT_UPDATE(scale_op, bias, bias_node);
}
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace huawei_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
scale,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::ScaleConverter);
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/huawei_ascend_npu/bridges/graph.h"
#include "lite/kernels/huawei_ascend_npu/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace huawei_ascend_npu {
int SliceConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[HUAWEI_ASCEND_NPU] Converting " + op_type + "...";
// Get input, output and op attributes
auto input_name = op_info->Input("Input").front();
auto input = scope->FindMutableTensor(input_name);
auto input_dims = input->dims();
auto input_rank = static_cast<int>(input_dims.size());
std::vector<int64_t> input_shape = input_dims.Vectorize();
auto out_name = op_info->Output("Out").front();
auto axes = op_info->GetAttr<std::vector<int>>("axes");
auto starts = op_info->GetAttr<std::vector<int>>("starts");
auto ends = op_info->GetAttr<std::vector<int>>("ends");
CHECK_EQ(axes.size(), starts.size());
CHECK_EQ(axes.size(), ends.size());
// X node
std::shared_ptr<Node> input_node = nullptr;
if (graph->Has(input_name)) {
input_node = graph->Get(input_name);
} else {
input_node = graph->Add(input_name, *input);
}
// Initialize offsets to 0 and sizes to the full input shape
std::vector<int> offset_vec(input_rank, 0);
std::vector<int> size_vec(input_shape.begin(), input_shape.end());
// Update offsets and sizes along the sliced axes based on starts and ends
for (int i = 0; i < axes.size(); i++) {
auto axis = axes[i];
CHECK_LT(axis, input_rank)
<< "[HUAWEI_ASCEND_NPU] axes value should be less than input rank.";
offset_vec[axis] = starts[i];
size_vec[axis] = ends[i] - starts[i];
}
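// Example: input shape [4, 10], axes=[1], starts=[2], ends=[5] gives
// offsets=[0, 2] and sizes=[4, 3].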
// Slice node
auto slice_node = graph->Add<ge::op::SliceD>(out_name);
auto slice_op = slice_node->data<ge::op::SliceD>();
slice_op->set_input_x(*input_node->data());
slice_op->set_attr_offsets(
ge::Operator::OpListInt(offset_vec.begin(), offset_vec.end()));
slice_op->set_attr_size(
ge::Operator::OpListInt(size_vec.begin(), size_vec.end()));
INPUT_UPDATE(slice_op, x, input_node);
OUTPUT_UPDATE(slice_op, y, slice_node);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace huawei_ascend_npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(
slice,
kHuaweiAscendNPU,
paddle::lite::subgraph::huawei_ascend_npu::SliceConverter);
@@ -66,7 +66,7 @@ ge::DataType CvtPrecisionType(PrecisionType itype);
ge::Format CvtDataLayoutType(DataLayoutType itype);
- // Padding the shape to 4-dimensions(NCHW) for HiAI
+ // Padding the shape to 4-dimensions(NCHW) for Huawei Ascend NPU
std::vector<int64_t> CvtShape(const std::vector<int64_t>& in_shape);
std::vector<int64_t> CvtShape(const DDim& in_dims);
...
@@ -137,17 +137,20 @@ TEST(Cast, precision) {
place = TARGET(kARM);
#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
place = TARGET(kXPU);
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // precision_mode default is force_fp16
#else
return;
#endif
// BOOL = 0;INT16 = 1;INT32 = 2;INT64 = 3;FP16 = 4;FP32 = 5;FP64 = 6;
// SIZE_T = 19;UINT8 = 20;INT8 = 21;
- #ifndef LITE_WITH_XPU
+ #if !defined(LITE_WITH_XPU) && !defined(LITE_WITH_HUAWEI_ASCEND_NPU)
TestCast(place, abs_error, 20, 5);
#endif
TestCast(place, abs_error, 2, 5);
- #ifdef LITE_WITH_XPU
+ #if defined(LITE_WITH_XPU) || defined(LITE_WITH_HUAWEI_ASCEND_NPU)
TestCast(place, abs_error, 3, 5);
TestCast(place, abs_error, 5, 3);
#endif
...
@@ -96,6 +96,9 @@ TEST(Gather, precision) {
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2; // use fp16 in npu
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // precision_mode default is force_fp16
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
...
@@ -152,6 +152,9 @@ TEST(LayerNorm, precision) {
#elif defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2;
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // precision_mode default is force_fp16
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
abs_error = 6e-5;
...
@@ -118,6 +118,9 @@ TEST(LookupTable, precision) {
place = TARGET(kARM);
#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
place = TARGET(kXPU);
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // precision_mode default is force_fp16
#else
return;
#endif
...
@@ -455,6 +455,9 @@ TEST(Matmul2x2, precision) {
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2; // use fp16 in npu
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // precision_mode default is force_fp16
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
@@ -472,6 +475,9 @@ TEST(Matmul2x2_x_transpose, precision) {
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2; // use fp16 in npu
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // precision_mode default is force_fp16
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#else
@@ -487,6 +493,9 @@ TEST(Matmul2x2_y_transpose, precision) {
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2; // use fp16 in npu
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // precision_mode default is force_fp16
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
@@ -559,6 +568,9 @@ TEST(Matmulnxn, precision) {
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2; // use fp16 in npu
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // precision_mode default is force_fp16
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#else
...
@@ -168,6 +168,9 @@ TEST(Scale, precision) {
#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
place = TARGET(kXPU);
abs_error = 3e-4; // Some operations use fp16 in XPU
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
place = TARGET(kHuaweiAscendNPU);
abs_error = 1e-2; // precision_mode default is force_fp16
#elif defined(LITE_WITH_X86)
place = TARGET(kX86);
#else
...
@@ -271,6 +271,9 @@ TEST(Slice, precision) {
#elif defined(LITE_WITH_XPU) && defined(LITE_WITH_XTCL)
Place place(TARGET(kXPU));
test_slice(place);
#elif defined(LITE_WITH_HUAWEI_ASCEND_NPU)
Place place = TARGET(kHuaweiAscendNPU);
test_slice(place);
#endif
}
...