Unverified commit dcf6acce, authored by zhupengyang, committed by GitHub

[NPU] add shape, gather, lookup_table bridge (#3197)

* [NPU] add shape bridge

move shape arm kernel to host

* enhance compare arm kernel

* [NPU] add gather op bridge

* enable reshape arm ut

* [NPU] add lookup_table bridge
Parent ae3ebea5
......@@ -292,13 +292,10 @@ void Predictor::Build(const cpp::ProgramDesc &desc,
program_desc_ = desc;
// `inner_places` is used to optimize passes
std::vector<Place> inner_places = valid_places;
inner_places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kNCHW));
inner_places.emplace_back(
TARGET(kHost), PRECISION(kInt64), DATALAYOUT(kNCHW));
for (auto &valid_place : valid_places) {
inner_places.emplace_back(
Place(TARGET(kHost), valid_place.precision, valid_place.layout));
}
// Analyze whether the model is quantized.
// For quantized model, add place(arm, int8) to inner_places
......
......@@ -151,16 +151,30 @@ KernelRegistry::KernelRegistry()
INIT_FOR(kMLU, kInt16, kNHWC);
INIT_FOR(kMLU, kInt16, kNCHW);
INIT_FOR(kHost, kFloat, kNCHW);
INIT_FOR(kHost, kInt32, kNCHW);
INIT_FOR(kHost, kInt64, kNCHW);
INIT_FOR(kHost, kAny, kNCHW);
INIT_FOR(kHost, kFloat, kNHWC);
INIT_FOR(kHost, kFloat, kAny);
INIT_FOR(kHost, kAny, kNHWC);
INIT_FOR(kHost, kAny, kAny);
INIT_FOR(kHost, kAny, kNHWC);
INIT_FOR(kHost, kAny, kAny);
INIT_FOR(kHost, kBool, kNCHW);
INIT_FOR(kHost, kBool, kNHWC);
INIT_FOR(kHost, kBool, kAny);
INIT_FOR(kHost, kFloat, kNCHW);
INIT_FOR(kHost, kFloat, kNHWC);
INIT_FOR(kHost, kFloat, kAny);
INIT_FOR(kHost, kFP16, kNCHW);
INIT_FOR(kHost, kFP16, kNHWC);
INIT_FOR(kHost, kFP16, kAny);
INIT_FOR(kHost, kInt8, kNCHW);
INIT_FOR(kHost, kInt8, kNHWC);
INIT_FOR(kHost, kInt8, kAny);
INIT_FOR(kHost, kInt16, kNCHW);
INIT_FOR(kHost, kInt16, kNHWC);
INIT_FOR(kHost, kInt16, kAny);
INIT_FOR(kHost, kInt32, kNCHW);
INIT_FOR(kHost, kInt32, kNHWC);
INIT_FOR(kHost, kInt32, kAny);
INIT_FOR(kHost, kInt64, kNCHW);
INIT_FOR(kHost, kInt64, kNHWC);
INIT_FOR(kHost, kInt64, kAny);
INIT_FOR(kX86, kFloat, kNCHW);
INIT_FOR(kX86, kAny, kNCHW);
......
......@@ -63,7 +63,6 @@ add_kernel(lrn_compute_arm ARM extra SRCS lrn_compute.cc DEPS ${lite_kernel_deps
add_kernel(decode_bboxes_compute_arm ARM extra SRCS decode_bboxes_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(density_prior_box_compute_arm ARM basic SRCS density_prior_box_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(axpy_compute_arm ARM extra SRCS axpy_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(shape_compute_arm ARM extra SRCS shape_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(reduce_max_compute_arm ARM extra SRCS reduce_max_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_expand_compute_arm ARM extra SRCS sequence_expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(im2sequence_compute_arm ARM extra SRCS im2sequence_compute.cc DEPS ${lite_kernel_deps} math_arm)
......@@ -92,7 +91,6 @@ add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_
add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(increment_compute_arm ARM extra SRCS increment_compute.cc DEPS ${lite_kernel_deps} math_arm)
add_kernel(write_to_array_compute_arm ARM extra SRCS write_to_array_compute.cc DEPS ${lite_kernel_deps} math_arm)
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/compare_compute.h"
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/core/op_registry.h"
#include "lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
#define COMPARE_FUNCTOR(name, op) \
template <typename T> \
struct _##name##Functor { \
inline bool operator()(const T &a, const T &b) const { return a op b; } \
};
COMPARE_FUNCTOR(Equal, ==);
COMPARE_FUNCTOR(NotEqual, !=);
COMPARE_FUNCTOR(LessThan, <);
COMPARE_FUNCTOR(LessEqual, <=);
COMPARE_FUNCTOR(GreaterThan, >);
COMPARE_FUNCTOR(GreaterEqual, >=);
template <>
struct _EqualFunctor<float> {
inline bool operator()(const float &a, const float &b) const {
// It is safe to cast a and b to double.
return fabs(static_cast<double>(a - b)) < 1e-8;
}
};
template <>
struct _NotEqualFunctor<float> {
inline bool operator()(const float &a, const float &b) const {
return !_EqualFunctor<float>()(a, b);
}
};
inline void get_mid_dims(const lite::DDim &x_dims,
const lite::DDim &y_dims,
const int axis,
int *pre,
int *n,
int *post) {
*pre = 1;
*n = 1;
*post = 1;
for (int i = 0; i < axis; ++i) {
(*pre) *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
(*n) *= y_dims[i];
}
for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
}
template <template <typename T> class Functor>
void CompareCompute<Functor>::Run() {
auto &param = this->Param<operators::CompareParam>();
using CompareFunctor = Functor<float>;
const size_t x_size = param.X->numel();
const size_t y_size = param.Y->numel();
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
bool *z = param.Out->template mutable_data<bool>();
const auto *x = param.X->template data<float>();
const auto *y = param.Y->template data<float>();
auto axis = param.axis;
bool force_cpu = param.force_cpu;
if (x_size == y_size) {
for (int i = 0; i < x_size; ++i) {
z[i] = CompareFunctor()(x[i], y[i]);
}
} else {
int axis = (param.axis == -1 ? x_dims.size() - y_dims.size() : param.axis);
int outer_num, mid_num, inner_num;
get_mid_dims(x_dims, y_dims, axis, &outer_num, &mid_num, &inner_num);
for (int outer_id = 0; outer_id < outer_num; ++outer_id) {
for (int mid_id = 0; mid_id < mid_num; ++mid_id) {
auto y_data = y[mid_id];
for (int inner_id = 0; inner_id < inner_num; ++inner_id) {
int index = (outer_id * mid_num + mid_id) * inner_num + inner_id;
z[index] = CompareFunctor()(x[index], y_data);
// z[index] = x[index] < y_data;
}
}
}
}
}
template <template <typename T> class Functor>
void CompareCompute_int32<Functor>::Run() {
auto &param = this->Param<operators::CompareParam>();
using CompareFunctor = Functor<int>;
const size_t x_size = param.X->numel();
const size_t y_size = param.Y->numel();
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
bool *z = param.Out->template mutable_data<bool>();
const auto *x = param.X->template data<int>();
const auto *y = param.Y->template data<int>();
auto axis = param.axis;
bool force_cpu = param.force_cpu;
if (x_size == y_size) {
for (int i = 0; i < x_size; ++i) {
z[i] = CompareFunctor()(x[i], y[i]);
}
} else {
int axis = (param.axis == -1 ? x_dims.size() - y_dims.size() : param.axis);
int outer_num, mid_num, inner_num;
get_mid_dims(x_dims, y_dims, axis, &outer_num, &mid_num, &inner_num);
for (int outer_id = 0; outer_id < outer_num; ++outer_id) {
for (int mid_id = 0; mid_id < mid_num; ++mid_id) {
auto y_data = y[mid_id];
for (int inner_id = 0; inner_id < inner_num; ++inner_id) {
int index = (outer_id * mid_num + mid_id) * inner_num + inner_id;
z[index] = CompareFunctor()(x[index], y_data);
// z[index] = x[index] < y_data;
}
}
}
}
}
template <template <typename T> class Functor>
void CompareCompute_int64<Functor>::Run() {
auto &param = this->Param<operators::CompareParam>();
using CompareFunctor = Functor<int64_t>;
const size_t x_size = param.X->numel();
const size_t y_size = param.Y->numel();
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
bool *z = param.Out->template mutable_data<bool>();
const auto *x = param.X->template data<int64_t>();
const auto *y = param.Y->template data<int64_t>();
auto axis = param.axis;
bool force_cpu = param.force_cpu;
if (x_size == y_size) {
for (int i = 0; i < x_size; ++i) {
z[i] = CompareFunctor()(x[i], y[i]);
}
} else {
int axis = (param.axis == -1 ? x_dims.size() - y_dims.size() : param.axis);
int outer_num, mid_num, inner_num;
get_mid_dims(x_dims, y_dims, axis, &outer_num, &mid_num, &inner_num);
for (int outer_id = 0; outer_id < outer_num; ++outer_id) {
for (int mid_id = 0; mid_id < mid_num; ++mid_id) {
auto y_data = y[mid_id];
for (int inner_id = 0; inner_id < inner_num; ++inner_id) {
int index = (outer_id * mid_num + mid_id) * inner_num + inner_id;
z[index] = CompareFunctor()(x[index], y_data);
}
}
}
}
}
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(equal,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::CompareCompute<
paddle::lite::kernels::arm::_EqualFunctor>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
.Finalize();
REGISTER_LITE_KERNEL(equal,
kARM,
kInt32,
kNCHW,
paddle::lite::kernels::arm::CompareCompute_int32<
paddle::lite::kernels::arm::_EqualFunctor>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
.Finalize();
REGISTER_LITE_KERNEL(not_equal,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::CompareCompute<
paddle::lite::kernels::arm::_NotEqualFunctor>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
.Finalize();
REGISTER_LITE_KERNEL(less_than,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::CompareCompute<
paddle::lite::kernels::arm::_LessThanFunctor>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
.Finalize();
REGISTER_LITE_KERNEL(less_than,
kARM,
kInt32,
kNCHW,
paddle::lite::kernels::arm::CompareCompute_int32<
paddle::lite::kernels::arm::_LessThanFunctor>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
.Finalize();
REGISTER_LITE_KERNEL(less_than,
kARM,
kInt64,
kNCHW,
paddle::lite::kernels::arm::CompareCompute_int64<
paddle::lite::kernels::arm::_LessThanFunctor>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
.Finalize();
REGISTER_LITE_KERNEL(less_equal,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::CompareCompute<
paddle::lite::kernels::arm::_LessEqualFunctor>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
.Finalize();
REGISTER_LITE_KERNEL(greater_than,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::CompareCompute<
paddle::lite::kernels::arm::_GreaterThanFunctor>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
.Finalize();
REGISTER_LITE_KERNEL(greater_equal,
kARM,
kFloat,
kNCHW,
paddle::lite::kernels::arm::CompareCompute<
paddle::lite::kernels::arm::_GreaterEqualFunctor>,
def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
.Finalize();
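The _EqualFunctor<float> specialization above replaces exact == with a 1e-8 tolerance on the double-widened difference. A standalone sketch of the same check, with illustrative values (not part of the kernel):

#include <cmath>
#include <iostream>

// Tolerance-based float equality, mirroring _EqualFunctor<float>.
bool nearly_equal(float a, float b) {
  return std::fabs(static_cast<double>(a) - static_cast<double>(b)) < 1e-8;
}

int main() {
  std::cout << std::boolalpha
            << nearly_equal(2.0f, 2.0f) << "\n"       // true: difference is 0
            << nearly_equal(1.0f, 1.00001f) << "\n";  // false: difference ~1e-5
  return 0;
}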
......@@ -2,7 +2,9 @@ message(STATUS "compile with lite host kernels")
add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_deps})
add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps} reshape_op)
add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps})
add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
add_kernel(shape_compute_host Host extra SRCS shape_compute.cc DEPS ${lite_kernel_deps})
add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
add_kernel(compare_compute_host Host extra SRCS compare_compute.cc DEPS ${lite_kernel_deps})
add_kernel(ctc_align_compute_host Host extra SRCS ctc_align_compute.cc DEPS ${lite_kernel_deps})
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/host/compare_compute.h"
#include <vector>
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
#define COMPARE_FUNCTOR(name, op) \
template <typename T> \
struct _##name##Functor { \
using TYPE = T; \
inline bool operator()(const T &a, const T &b) const { return a op b; } \
};
COMPARE_FUNCTOR(Equal, ==);
COMPARE_FUNCTOR(NotEqual, !=);
COMPARE_FUNCTOR(LessThan, <);
COMPARE_FUNCTOR(LessEqual, <=);
COMPARE_FUNCTOR(GreaterThan, >);
COMPARE_FUNCTOR(GreaterEqual, >=);
template <>
struct _EqualFunctor<float> {
using TYPE = float;
inline bool operator()(const float &a, const float &b) const {
// It is safe to cast a and b to double.
return fabs(static_cast<double>(a - b)) < 1e-8;
}
};
template <>
struct _NotEqualFunctor<float> {
using TYPE = float;
inline bool operator()(const float &a, const float &b) const {
return !_EqualFunctor<float>()(a, b);
}
};
inline void get_mid_dims(const lite::DDim &x_dims,
const lite::DDim &y_dims,
const int axis,
int *pre,
int *n,
int *post) {
*pre = 1;
*n = 1;
*post = 1;
for (int i = 0; i < axis; ++i) {
(*pre) *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
(*n) *= y_dims[i];
}
for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
(*post) *= x_dims[i];
}
}
template <PrecisionType PType, typename CompareFunctor>
void CompareCompute<PType, CompareFunctor>::Run() {
auto &param = this->template Param<operators::CompareParam>();
using DType = typename CompareFunctor::TYPE;
const size_t x_size = param.X->numel();
const size_t y_size = param.Y->numel();
auto x_dims = param.X->dims();
auto y_dims = param.Y->dims();
bool *z = param.Out->template mutable_data<bool>();
const auto *x = param.X->template data<DType>();
const auto *y = param.Y->template data<DType>();
if (x_size == y_size) {
for (int i = 0; i < x_size; ++i) {
z[i] = CompareFunctor()(x[i], y[i]);
}
} else {
int axis = (param.axis == -1 ? x_dims.size() - y_dims.size() : param.axis);
int outer_num, mid_num, inner_num;
get_mid_dims(x_dims, y_dims, axis, &outer_num, &mid_num, &inner_num);
for (int outer_id = 0; outer_id < outer_num; ++outer_id) {
for (int mid_id = 0; mid_id < mid_num; ++mid_id) {
auto y_data = y[mid_id];
for (int inner_id = 0; inner_id < inner_num; ++inner_id) {
int index = (outer_id * mid_num + mid_id) * inner_num + inner_id;
z[index] = CompareFunctor()(x[index], y_data);
}
}
}
}
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
using equal_float = paddle::lite::kernels::host::CompareCompute<
PRECISION(kFloat),
paddle::lite::kernels::host::_EqualFunctor<float>>;
REGISTER_LITE_KERNEL(equal, kHost, kFloat, kAny, equal_float, def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny), -1)})
.BindInput("Y",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)})
.Finalize();
using equal_int32 = paddle::lite::kernels::host::CompareCompute<
PRECISION(kInt32),
paddle::lite::kernels::host::_EqualFunctor<int32_t>>;
REGISTER_LITE_KERNEL(equal, kHost, kInt32, kAny, equal_int32, def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kAny), -1)})
.BindInput("Y",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)})
.Finalize();
using not_equal_float = paddle::lite::kernels::host::CompareCompute<
PRECISION(kFloat),
paddle::lite::kernels::host::_NotEqualFunctor<float>>;
REGISTER_LITE_KERNEL(not_equal, kHost, kFloat, kAny, not_equal_float, def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny), -1)})
.BindInput("Y",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)})
.Finalize();
using less_than_float = paddle::lite::kernels::host::CompareCompute<
PRECISION(kFloat),
paddle::lite::kernels::host::_LessThanFunctor<float>>;
REGISTER_LITE_KERNEL(less_than, kHost, kFloat, kAny, less_than_float, def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny), -1)})
.BindInput("Y",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)})
.Finalize();
using less_than_int32 = paddle::lite::kernels::host::CompareCompute<
PRECISION(kInt32),
paddle::lite::kernels::host::_LessThanFunctor<int32_t>>;
REGISTER_LITE_KERNEL(less_than, kHost, kInt32, kAny, less_than_int32, def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kAny), -1)})
.BindInput("Y",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)})
.Finalize();
using less_than_int64 = paddle::lite::kernels::host::CompareCompute<
PRECISION(kInt64),
paddle::lite::kernels::host::_LessThanFunctor<int64_t>>;
REGISTER_LITE_KERNEL(less_than, kHost, kInt64, kAny, less_than_int64, def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kInt64), DATALAYOUT(kAny), -1)})
.BindInput("Y",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kInt64), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)})
.Finalize();
using less_equal_float = paddle::lite::kernels::host::CompareCompute<
PRECISION(kFloat),
paddle::lite::kernels::host::_LessEqualFunctor<float>>;
REGISTER_LITE_KERNEL(less_equal, kHost, kFloat, kAny, less_equal_float, def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny), -1)})
.BindInput("Y",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)})
.Finalize();
using greater_than_float = paddle::lite::kernels::host::CompareCompute<
PRECISION(kFloat),
paddle::lite::kernels::host::_GreaterThanFunctor<float>>;
REGISTER_LITE_KERNEL(greater_than, kHost, kFloat, kAny, greater_than_float, def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny), -1)})
.BindInput("Y",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)})
.Finalize();
using greater_equal_float = paddle::lite::kernels::host::CompareCompute<
PRECISION(kFloat),
paddle::lite::kernels::host::_GreaterEqualFunctor<float>>;
REGISTER_LITE_KERNEL(
greater_equal, kHost, kFloat, kAny, greater_equal_float, def)
.BindInput("X",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny), -1)})
.BindInput("Y",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kBool), DATALAYOUT(kAny), -1)})
.Finalize();
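get_mid_dims above factors the broadcast into pre (dims of X before axis), n (all dims of Y), and post (dims of X after the Y block), so Y can be indexed by mid_id alone inside the triple loop of CompareCompute::Run. A small worked example under assumed shapes:

#include <cstdint>
#include <iostream>
#include <vector>

// Same decomposition as get_mid_dims, on plain dim vectors.
// Example: x_dims = {2, 3, 4, 5}, y_dims = {3, 4}, axis = 1
//   -> pre = 2, n = 3 * 4 = 12, post = 5,
// and element x[(outer * n + mid) * post + inner] is compared with y[mid].
void mid_dims(const std::vector<int64_t>& x_dims,
              const std::vector<int64_t>& y_dims,
              int axis, int* pre, int* n, int* post) {
  *pre = *n = *post = 1;
  for (int i = 0; i < axis; ++i) *pre *= x_dims[i];
  for (size_t i = 0; i < y_dims.size(); ++i) *n *= y_dims[i];
  for (size_t i = axis + y_dims.size(); i < x_dims.size(); ++i) *post *= x_dims[i];
}

int main() {
  int pre, n, post;
  mid_dims({2, 3, 4, 5}, {3, 4}, 1, &pre, &n, &post);
  std::cout << pre << " " << n << " " << post << "\n";  // prints: 2 12 5
  return 0;
}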
......@@ -13,43 +13,24 @@
// limitations under the License.
#pragma once
#include <stdint.h>
#include "lite/backends/arm/math/type_trans.h"
#include "lite/core/kernel.h"
#include "lite/operators/compare_op.h"
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
namespace host {
template <template <typename T> class Functor>
class CompareCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
template <PrecisionType PType, typename CompareFunctor>
class CompareCompute
: public KernelLite<TARGET(kHost), PType, DATALAYOUT(kAny)> {
public:
void Run() override;
~CompareCompute() {}
virtual ~CompareCompute() = default;
};
template <template <typename T> class Functor>
class CompareCompute_int32
: public KernelLite<TARGET(kARM), PRECISION(kInt32)> {
public:
void Run() override;
~CompareCompute_int32() {}
};
template <template <typename T> class Functor>
class CompareCompute_int64
: public KernelLite<TARGET(kARM), PRECISION(kInt64)> {
public:
void Run() override;
~CompareCompute_int64() {}
};
} // namespace arm
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
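Kernels registered with REGISTER_LITE_KERNEL are linked into a binary on demand through a matching USE_LITE_KERNEL declaration (same op, target, precision, layout, and alias), as the reshape unit test below does for reshape/reshape2. For the new host compare kernels, a hypothetical consumer file would look like:

#include "lite/core/op_registry.h"
// Pull in two of the host compare kernels registered above;
// the tuple must match the corresponding REGISTER_LITE_KERNEL call.
USE_LITE_KERNEL(equal, kHost, kFloat, kAny, def);
USE_LITE_KERNEL(less_than, kHost, kInt64, kAny, def);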
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/host/reshape_compute.h"
#include <gtest/gtest.h>
#include <vector>
#include "lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
TEST(reshape_host, init) {
ReshapeCompute reshape;
ASSERT_EQ(reshape.precision(), PRECISION(kAny));
ASSERT_EQ(reshape.target(), TARGET(kHost));
}
TEST(reshape_host, compute) {
ReshapeCompute reshape;
operators::ReshapeParam param;
Tensor input;
Tensor output;
input.Resize({1, 2, 4, 6});
auto* input_data = input.mutable_data<float>();
for (int i = 0; i < input.numel(); i++) {
input_data[i] = i;
}
Tensor shape_tensor;
shape_tensor.Resize({2});
auto* shape_tensor_data = shape_tensor.mutable_data<int>();
shape_tensor_data[0] = 6;
shape_tensor_data[1] = 8;
// set param and run
param.x = &input;
param.shape_tensor = &shape_tensor; // use shape_tensor
param.inplace = false;
param.output = &output;
reshape.SetParam(param);
reshape.Run();
// check output dims
CHECK_EQ(shape_tensor.numel(), output.numel());
for (int i = 0; i < output.dims().size(); i++) {
CHECK_EQ(output.dims()[i], shape_tensor_data[i]);
}
// check output data
auto* output_data = output.mutable_data<float>();
CHECK_NE(output_data, input_data);
for (int i = 0; i < output.numel(); i++) {
EXPECT_NEAR(output_data[i], input_data[i], 1e-6);
}
// use shape, set param and run
param.shape_tensor = nullptr;
param.shape_vct = {-1, 0, 3, 2, 1};
reshape.SetParam(param);
reshape.Run();
// check output dims
CHECK_EQ(shape_tensor.numel(), output.numel());
for (int i = 0; i < output.dims().size(); i++) {
CHECK_EQ(output.dims()[i], shape_tensor_data[i]);
}
// check output data
output_data = output.mutable_data<float>();
CHECK_NE(output_data, input_data);
for (int i = 0; i < output.numel(); i++) {
EXPECT_NEAR(output_data[i], input_data[i], 1e-6);
}
// check output data if inplace = true;
param.inplace = true;
reshape.SetParam(param);
reshape.Run();
output_data = output.mutable_data<float>();
CHECK_EQ(output_data, input_data);
}
TEST(reshape, retrive_op) {
auto reshape =
KernelRegistry::Global()
.Create<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)>("reshape");
ASSERT_FALSE(reshape.empty());
ASSERT_TRUE(reshape.front());
}
TEST(reshape2, retrive_op) {
auto reshape2 =
KernelRegistry::Global()
.Create<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)>("reshape2");
ASSERT_FALSE(reshape2.empty());
ASSERT_TRUE(reshape2.front());
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def);
USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def);
......@@ -12,13 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/arm/shape_compute.h"
#include "lite/backends/arm/math/funcs.h"
#include "lite/kernels/host/shape_compute.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
namespace host {
void ShapeCompute::Run() {
auto& param = Param<operators::ShapeParam>();
......@@ -29,13 +28,17 @@ void ShapeCompute::Run() {
}
}
} // namespace arm
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(
shape, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ShapeCompute, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
shape, kHost, kAny, kAny, paddle::lite::kernels::host::ShapeCompute, def)
.BindInput("Input",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny), -1)})
.BindOutput("Out",
{LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kInt32), DATALAYOUT(kAny), -1)})
.Finalize();
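The Run body is elided in this hunk; its effect is to write the input's dims into Out as int32 values, which is why the kernel binds an any-precision input to an int32 output. A host-side sketch of that behaviour (standalone reference, not the kernel code):

#include <cstdint>
#include <vector>

// Reference behaviour of the shape op: a 1-D int32 tensor holding X's dims.
// E.g. X dims {2, 3, 4} -> out data [2, 3, 4], out dims {3}.
std::vector<int32_t> shape_of(const std::vector<int64_t>& x_dims) {
  return std::vector<int32_t>(x_dims.begin(), x_dims.end());
}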
......@@ -19,16 +19,17 @@
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
namespace host {
class ShapeCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
class ShapeCompute
: public KernelLite<TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
void Run() override;
virtual ~ShapeCompute() = default;
};
} // namespace arm
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
......@@ -38,6 +38,8 @@ lite_cc_library(subgraph_bridge_shuffle_channel_op_npu SRCS shuffle_channel_op.c
lite_cc_library(subgraph_bridge_pad2d_op_npu SRCS pad2d_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_reduce_mean_op_npu SRCS reduce_mean_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_unsqueeze_op_npu SRCS unsqueeze_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_gather_op_npu SRCS gather_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_lookup_table_op_npu SRCS lookup_table_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_argmax_op_npu SRCS argmax_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_instance_norm_op_npu SRCS instance_norm_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_dropout_op_npu SRCS dropout_op.cc DEPS ${npu_subgraph_bridge_deps})
......@@ -47,6 +49,7 @@ lite_cc_library(subgraph_bridge_fill_constant_op_npu SRCS fill_constant_op.cc DE
lite_cc_library(subgraph_bridge_fill_constant_batch_size_like_op_npu SRCS fill_constant_batch_size_like_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_increment_op_npu SRCS increment_op.cc DEPS ${npu_subgraph_bridge_deps})
lite_cc_library(subgraph_bridge_compare_op_npu SRCS compare_op.cc DEPS ${npu_subgraph_bridge_deps})
#lite_cc_library(subgraph_bridge_shape_op_npu SRCS shape_op.cc DEPS ${npu_subgraph_bridge_deps})
set(npu_subgraph_bridges
......@@ -73,6 +76,8 @@ set(npu_subgraph_bridges
subgraph_bridge_pad2d_op_npu
subgraph_bridge_reduce_mean_op_npu
subgraph_bridge_unsqueeze_op_npu
subgraph_bridge_gather_op_npu
subgraph_bridge_lookup_table_op_npu
subgraph_bridge_argmax_op_npu
subgraph_bridge_instance_norm_op_npu
subgraph_bridge_dropout_op_npu
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {
int GatherConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[NPU] Converting " + op_type + "...";
// Get input, output and op attributes
auto x_name = op_info->Input("X").front();
auto x = scope->FindTensor(x_name);
auto index_name = op_info->Input("Index").front();
auto index = scope->FindTensor(index_name);
auto index_dims = index->dims();
CHECK(index_dims.size() == 1 ||
(index_dims.size() == 2 && index_dims[1] == 1))
<< "index dims unmatch";
auto out_name = op_info->Output("Out").front();
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Index node
std::shared_ptr<Node> index_node = nullptr;
if (graph->Has(index_name)) {
index_node = graph->Get(index_name);
} else {
index_node = graph->Add(index_name, *index);
}
// Gather node
auto gather_node = graph->Add<ge::op::Gather>(out_name);
auto gather_op = gather_node->data<ge::op::Gather>();
gather_op->set_input_params(*x_node->data());
gather_op->set_input_indices(*index_node->data());
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(gather,
kNPU,
paddle::lite::subgraph::npu::GatherConverter);
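For reference, ge::op::Gather here selects along axis 0 of X using the 1-D (or Nx1) Index checked above. A minimal host-side sketch of that row-gather semantics (illustrative only, not the NPU code path):

#include <cstdint>
#include <vector>

// Row gather: x is row-major with shape {num_rows, row_len};
// the result holds one copied row of x per entry of index.
std::vector<float> gather_rows(const std::vector<float>& x,
                               int64_t row_len,
                               const std::vector<int64_t>& index) {
  std::vector<float> out(index.size() * row_len);
  for (size_t i = 0; i < index.size(); ++i) {
    for (int64_t j = 0; j < row_len; ++j) {
      out[i * row_len + j] = x[index[i] * row_len + j];
    }
  }
  return out;
}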
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {
int LookupTableConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[NPU] Converting " + op_type + "...";
// Get input, output and op attributes
auto w_name = op_info->Input("W").front();
auto w = scope->FindTensor(w_name);
auto index_name = op_info->Input("Ids").front();
auto index = scope->FindTensor(index_name);
auto out_name = op_info->Output("Out").front();
auto out = scope->FindTensor(out_name);
auto out_shape = out->dims().Vectorize();
// W node
std::shared_ptr<Node> w_node = nullptr;
if (graph->Has(w_name)) {
w_node = graph->Get(w_name);
} else {
w_node = graph->Add(w_name, *w);
}
// Index node
std::shared_ptr<Node> index_node = nullptr;
if (graph->Has(index_name)) {
index_node = graph->Get(index_name);
} else {
index_node = graph->Add(index_name, *index);
}
// reshape ids
auto reshaped_index_node =
graph->Add<ge::op::Reshape>(index_name + "/reshape");
auto reshaped_index_op = reshaped_index_node->data<ge::op::Reshape>();
reshaped_index_op->set_input_tensor(*index_node->data());
reshaped_index_op->set_attr_shape(ge::AttrValue::LIST_INT({index->numel()}));
reshaped_index_op->set_attr_axis(0);
index_node = reshaped_index_node;
// Gather node
auto gather_node = graph->Add<ge::op::Gather>(out_name);
auto gather_op = gather_node->data<ge::op::Gather>();
gather_op->set_input_params(*w_node->data());
gather_op->set_input_indices(*index_node->data());
// reshape out
auto reshaped_gather_node = graph->Add<ge::op::Reshape>(out_name);
auto reshaped_gather_op = reshaped_gather_node->data<ge::op::Reshape>();
reshaped_gather_op->set_input_tensor(*gather_node->data());
reshaped_gather_op->set_attr_shape(
ge::AttrValue::LIST_INT(out_shape.begin(), out_shape.end()));
reshaped_gather_op->set_attr_axis(0);
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(lookup_table,
kNPU,
paddle::lite::subgraph::npu::LookupTableConverter);
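The two Reshape nodes above only do shape bookkeeping around the Gather: Ids are flattened to 1-D before gathering rows of W, and the gathered {num_ids, width} result is reshaped back to the op's output dims. A tiny sketch of that dimension flow, assuming the usual lookup_table convention that out_dims equals ids_dims with the trailing 1 replaced by the embedding width:

#include <cstdint>
#include <vector>

// E.g. Ids dims {2, 3, 1} and W dims {4, 8}:
//   reshape ids : {2, 3, 1} -> {6}          (index->numel())
//   gather      : rows of W  -> {6, 8}
//   reshape out : {6, 8}     -> {2, 3, 8}   (out->dims().Vectorize())
std::vector<int64_t> lookup_table_out_dims(std::vector<int64_t> ids_dims,
                                           int64_t embedding_width) {
  ids_dims.back() = embedding_width;
  return ids_dims;
}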
......@@ -45,6 +45,8 @@ USE_SUBGRAPH_BRIDGE(fusion_elementwise_div_activation, kNPU);
USE_SUBGRAPH_BRIDGE(fill_constant, kNPU)
USE_SUBGRAPH_BRIDGE(fill_constant_batch_size_like, kNPU)
// USE_SUBGRAPH_BRIDGE(gather, kNPU);
// USE_SUBGRAPH_BRIDGE(lookup_table, kNPU);
USE_SUBGRAPH_BRIDGE(increment, kNPU);
USE_SUBGRAPH_BRIDGE(instance_norm, kNPU);
USE_SUBGRAPH_BRIDGE(fc, kNPU);
......@@ -59,6 +61,7 @@ USE_SUBGRAPH_BRIDGE(reduce_mean, kNPU);
USE_SUBGRAPH_BRIDGE(reshape, kNPU);
USE_SUBGRAPH_BRIDGE(reshape2, kNPU);
USE_SUBGRAPH_BRIDGE(scale, kNPU);
// USE_SUBGRAPH_BRIDGE(shape, kNPU);
USE_SUBGRAPH_BRIDGE(shuffle_channel, kNPU);
USE_SUBGRAPH_BRIDGE(softmax, kNPU);
USE_SUBGRAPH_BRIDGE(split, kNPU);
......
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/kernels/npu/bridges/graph.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/utility.h"
namespace paddle {
namespace lite {
namespace subgraph {
namespace npu {
int ShapeConverter(void* ctx, OpLite* op, KernelBase* kernel) {
CHECK(ctx != nullptr);
CHECK(op != nullptr);
auto graph = static_cast<Graph*>(ctx);
auto op_info = op->op_info();
auto op_type = op_info->Type();
auto scope = op->scope();
VLOG(3) << "[NPU] Converting " + op_type + "...";
// Get input, output and op attributes
auto x_name = op_info->Input("Input").front();
auto x = scope->FindTensor(x_name);
auto out_name = op_info->Output("Out").front();
// X node
std::shared_ptr<Node> x_node = nullptr;
if (graph->Has(x_name)) {
x_node = graph->Get(x_name);
} else {
x_node = graph->Add(x_name, *x);
}
// Shape node
auto shape_node = graph->Add<ge::op::Shape>(out_name);
auto shape_op = shape_node->data<ge::op::Shape>();
shape_op->set_input_x(*x_node->data());
return REBUILD_WHEN_SHAPE_CHANGED;
}
} // namespace npu
} // namespace subgraph
} // namespace lite
} // namespace paddle
REGISTER_SUBGRAPH_BRIDGE(shape,
kNPU,
paddle::lite::subgraph::npu::ShapeConverter);
......@@ -26,9 +26,8 @@ bool ShapeOpLite::CheckShape() const {
}
bool ShapeOpLite::InferShapeImpl() const {
std::vector<int64_t> shape_vec;
shape_vec.push_back(static_cast<int64_t>(param_.X->dims().size()));
param_.Out->Resize(shape_vec);
int64_t x_dims_size = param_.X->dims().size();
param_.Out->Resize({x_dims_size});
return true;
}
......
......@@ -216,7 +216,7 @@ TEST(Compare_OP_NPU, precision) {
}
#elif defined(LITE_WITH_ARM)
TEST(Compare_OP_ARM, precision) {
Place place{TARGET(kARM)};
Place place{TARGET(kHost)};
float abs_error = 1e-5;
for (auto op : std::vector<std::string>{"equal",
"not_equal",
......
......@@ -91,10 +91,12 @@ class GatherComputeTest : public arena::TestCase {
};
TEST(Gather, precision) {
LOG(INFO) << "test gather op";
float abs_error = 2e-5;
Place place;
#if defined(LITE_WITH_ARM)
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2; // use fp16 in npu
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#elif defined(LITE_WITH_XPU)
place = TARGET(kXPU);
......@@ -104,8 +106,7 @@ TEST(Gather, precision) {
for (auto x_dims :
std::vector<std::vector<int64_t>>{{5, 2, 3, 4}, {8, 3, 5}, {12, 3}}) {
for (auto index_dims :
std::vector<std::vector<int64_t>>{{3, 1}, {7, 1}, {10, 1}}) {
for (auto index_dims : std::vector<std::vector<int64_t>>{{3}, {7}, {10}}) {
std::unique_ptr<arena::TestCase> tester(
new GatherComputeTest(place, "def", DDim(x_dims), DDim(index_dims)));
arena::Arena arena(std::move(tester), place, abs_error);
......
......@@ -21,6 +21,7 @@
namespace paddle {
namespace lite {
template <typename T>
class LookupTableComputeTest : public arena::TestCase {
protected:
// common attributes for this op.
......@@ -64,7 +65,7 @@ class LookupTableComputeTest : public arena::TestCase {
out->Resize(out_dims);
out->set_lod(ids->lod());
auto ids_data = ids->data<int64_t>();
auto ids_data = ids->data<T>();
auto ids_size = ids_dims.production();
auto w_data = w->data<float>();
auto w_rows = w_dims[0];
......@@ -95,9 +96,8 @@ class LookupTableComputeTest : public arena::TestCase {
}
void PrepareData() override {
std::vector<int64_t> ids(ids_dims_.production());
fill_data_rand<int64_t>(
ids.data(), 0, w_dims_[0] - 1, ids_dims_.production());
std::vector<T> ids(ids_dims_.production());
fill_data_rand<T>(ids.data(), 0, w_dims_[0] - 1, ids_dims_.production());
std::vector<float> w(w_dims_.production());
fill_data_rand(w.data(), -1.f, 1.f, w_dims_.production());
......@@ -109,9 +109,12 @@ class LookupTableComputeTest : public arena::TestCase {
TEST(LookupTable, precision) {
LOG(INFO) << "test lookup_table op";
float abs_error = 2e-5;
float abs_error = 1e-5;
Place place;
#if defined(LITE_WITH_ARM)
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2;
#elif defined(LITE_WITH_ARM)
place = TARGET(kARM);
#elif defined(LITE_WITH_XPU)
place = TARGET(kXPU);
......@@ -119,18 +122,25 @@ TEST(LookupTable, precision) {
return;
#endif
#if defined(LITE_WITH_NPU)
using ID_T = int;
#else
using ID_T = int64_t;
#endif
for (auto ids_dims :
std::vector<std::vector<int64_t>>{{5, 2, 3, 1}, {2, 3, 1}, {3, 1}}) {
for (auto w_dims :
std::vector<std::vector<int64_t>>{{4, 2}, {6, 8}, {12, 15}}) {
#if defined(LITE_WITH_XPU)
#if defined(LITE_WITH_XPU) && defined(LITE_WITH_NPU)
for (auto padding_idx :
std::vector<int64_t>{-1}) { // Only -1 is supported by XPU
std::vector<int64_t>{-1}) { // Only -1 is supported by XPU or NPU
#else
for (auto padding_idx : std::vector<int64_t>{-1, 0, w_dims[0] - 1}) {
#endif
std::unique_ptr<arena::TestCase> tester(new LookupTableComputeTest(
place, "def", DDim(ids_dims), DDim(w_dims), padding_idx));
std::unique_ptr<arena::TestCase> tester(
new LookupTableComputeTest<ID_T>(
place, "def", DDim(ids_dims), DDim(w_dims), padding_idx));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
......
......@@ -204,6 +204,8 @@ TEST(Reshape, precision) {
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2; // Using fp16 in NPU
#elif defined(LITE_WITH_ARM)
place = TARGET(kHost);
#elif defined(LITE_WITH_XPU)
place = TARGET(kXPU);
#else
......
......@@ -16,13 +16,14 @@
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
#include "lite/tests/utils/fill_data.h"
namespace paddle {
namespace lite {
class ShapeComputeTester : public arena::TestCase {
protected:
// common attributes for this op.
std::string x_ = "Input";
std::string input_ = "Input";
std::string out_ = "Out";
DDim dims_;
......@@ -31,7 +32,7 @@ class ShapeComputeTester : public arena::TestCase {
: TestCase(place, alias), dims_(dims) {}
void RunBaseline(Scope* scope) override {
const auto* input = scope->FindTensor(x_);
const auto* input = scope->FindTensor(input_);
CHECK(input);
auto* out = scope->NewTensor(out_);
CHECK(out);
......@@ -45,42 +46,46 @@ class ShapeComputeTester : public arena::TestCase {
void PrepareOpDesc(cpp::OpDesc* op_desc) {
op_desc->SetType("shape");
op_desc->SetInput("Input", {x_});
op_desc->SetInput("Input", {input_});
op_desc->SetOutput("Out", {out_});
}
void PrepareData() override {
std::vector<float> in_data(dims_.production());
for (int i = 0; i < dims_.production(); ++i) {
in_data[i] = i;
}
SetCommonTensor(x_, dims_, in_data.data());
std::vector<float> din(dims_.production());
fill_data_rand(din.data(), -1.f, 1.f, dims_.production());
SetCommonTensor(input_, dims_, din.data());
}
};
void test_shape(Place place) {
for (int N : {1, 2, 3, 4}) {
for (int C : {1, 2, 3, 4}) {
for (int H : {1, 2, 3, 4}) {
for (int W : {1, 2, 3, 4}) {
std::unique_ptr<arena::TestCase> tester(
new ShapeComputeTester(place, "def", DDim({N, C, H, W})));
arena::Arena arena(std::move(tester), place, 2e-5);
arena.TestPrecision();
}
}
}
}
void TestShapeHelper(Place place,
float abs_error,
std::vector<int64_t> x_dims) {
std::unique_ptr<arena::TestCase> tester(
new ShapeComputeTester(place, "def", DDim(x_dims)));
arena::Arena arena(std::move(tester), place, abs_error);
arena.TestPrecision();
}
void test_shape(Place place, float abs_error) {
TestShapeHelper(place, abs_error, {2, 3, 4, 5});
TestShapeHelper(place, abs_error, {3, 4, 5});
TestShapeHelper(place, abs_error, {4, 5});
TestShapeHelper(place, abs_error, {5});
}
TEST(shape, precision) {
#ifdef LITE_WITH_X86
Place place(TARGET(kX86));
#endif
#ifdef LITE_WITH_ARM
Place place(TARGET(kARM));
test_shape(place);
Place place;
float abs_error = 1e-5;
#if defined(LITE_WITH_NPU)
place = TARGET(kNPU);
abs_error = 1e-2;
#elif defined(LITE_WITH_ARM)
place = TARGET(kHost);
#else
return;
#endif
test_shape(place, abs_error);
}
} // namespace lite
......