diff --git a/lite/kernels/arm/CMakeLists.txt b/lite/kernels/arm/CMakeLists.txt
index 31996dab195c7d5d7e99d917b8d251b35f477f8f..c4b03b03072b36ff10d53f7da9a90b8ea5607818 100644
--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
@@ -41,7 +41,6 @@ add_kernel(slice_compute_arm ARM basic SRCS slice_compute.cc DEPS ${lite_kernel_
 add_kernel(cast_compute_arm ARM basic SRCS cast_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(squeeze_compute_arm ARM basic SRCS squeeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(unsqueeze_compute_arm ARM basic SRCS unsqueeze_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(expand_compute_arm ARM basic SRCS expand_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(reduce_mean_compute_arm ARM basic SRCS reduce_mean_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(stack_compute_arm ARM basic SRCS stack_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(affine_channel_compute_arm ARM basic SRCS affine_channel_compute.cc DEPS ${lite_kernel_deps} math_arm)
diff --git a/lite/kernels/host/CMakeLists.txt b/lite/kernels/host/CMakeLists.txt
index a0085e6d6c5e65667e96393c42a1608c8dd24d0c..078fad7aa0221a0e60b1f4dd928136b38f198dcb 100644
--- a/lite/kernels/host/CMakeLists.txt
+++ b/lite/kernels/host/CMakeLists.txt
@@ -4,6 +4,7 @@ add_kernel(feed_compute_host Host basic SRCS feed_compute.cc DEPS ${lite_kernel_
 add_kernel(fetch_compute_host Host basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(reshape_compute_host Host basic SRCS reshape_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(multiclass_nms_compute_host Host basic SRCS multiclass_nms_compute.cc DEPS ${lite_kernel_deps})
+add_kernel(expand_compute_host Host basic SRCS expand_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(shape_compute_host Host extra SRCS shape_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(is_empty_compute_host Host extra SRCS is_empty_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(crf_decoding_compute_host Host extra SRCS crf_decoding_compute.cc DEPS ${lite_kernel_deps})
diff --git a/lite/kernels/arm/expand_compute.cc b/lite/kernels/host/expand_compute.cc
similarity index 63%
rename from lite/kernels/arm/expand_compute.cc
rename to lite/kernels/host/expand_compute.cc
index 73bcae909e7016b6b3cf9d2b0091299b44cea3db..cb7241a47371b4793b1bcd24353c7f09669d6f8e 100644
--- a/lite/kernels/arm/expand_compute.cc
+++ b/lite/kernels/host/expand_compute.cc
@@ -12,24 +12,23 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "lite/kernels/arm/expand_compute.h"
+#include "lite/kernels/host/expand_compute.h"
 #include <vector>
-#include "lite/core/op_registry.h"
-#include "lite/core/type_system.h"
 
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
+namespace host {
 
-void ExpandCompute::Run() {
-  auto& param = Param<operators::ExpandParam>();
+template <typename T, PrecisionType PType>
+void ExpandCompute<T, PType>::Run() {
+  auto& param = this->template Param<operators::ExpandParam>();
   const auto* x = param.X;
   auto* out = param.Out;
   std::vector<int> expand_times = param.expand_times;
-  const float* src = x->data<float>();
-  float* dst = out->mutable_data<float>();
+  const T* src = x->template data<T>();
+  T* dst = out->template mutable_data<T>();
 
   int dims = expand_times.size();
   DDim in_shape = x->dims();
 
@@ -42,7 +41,7 @@
       for (int k = 0; k < expand_times[i]; ++k) {
         memcpy(dst + (j * expand_times[i] + k) * inner_num,
                src + j * inner_num,
-               sizeof(float) * inner_num);
+               sizeof(T) * inner_num);
       }
     }
     inner_num *= expand_times[i];
@@ -53,20 +52,27 @@
       for (int k = expand_times[i] - 1; k >= 0; --k) {
         memcpy(dst + (j * expand_times[i] + k) * inner_num,
                dst + j * inner_num,
-               sizeof(float) * inner_num);
+               sizeof(T) * inner_num);
       }
     }
     inner_num *= expand_times[i];
   }
 }
 
-}  // namespace arm
+}  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
 
-REGISTER_LITE_KERNEL(
-    expand, kARM, kFloat, kNCHW, paddle::lite::kernels::arm::ExpandCompute, def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+using expand_float =
+    paddle::lite::kernels::host::ExpandCompute<float, PRECISION(kFloat)>;
+REGISTER_LITE_KERNEL(expand, kHost, kFloat, kAny, expand_float, def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(TARGET(kHost),
+                                      PRECISION(kFloat),
+                                      DATALAYOUT(kAny))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(TARGET(kHost),
+                                       PRECISION(kFloat),
+                                       DATALAYOUT(kAny))})
     .Finalize();
diff --git a/lite/kernels/arm/expand_compute.h b/lite/kernels/host/expand_compute.h
similarity index 84%
rename from lite/kernels/arm/expand_compute.h
rename to lite/kernels/host/expand_compute.h
index d872c2a60b613bb05ee36698cb31ceef0d5eed3e..8bb9422501fa4ffb77472a5c898a838d3b6cc7e1 100644
--- a/lite/kernels/arm/expand_compute.h
+++ b/lite/kernels/host/expand_compute.h
@@ -19,16 +19,18 @@
 namespace paddle {
 namespace lite {
 namespace kernels {
-namespace arm {
+namespace host {
 
-class ExpandCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
+template <typename T, PrecisionType PType>
+class ExpandCompute
+    : public KernelLite<TARGET(kHost), PType, DATALAYOUT(kAny)> {
  public:
   void Run() override;
 
   virtual ~ExpandCompute() = default;
 };
 
-}  // namespace arm
+}  // namespace host
 }  // namespace kernels
 }  // namespace lite
 }  // namespace paddle
diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt
index f2974bf6103da4e8470926b4cc1ef07e5530fd2c..5157f47867160cf4f705306ca37cfad962373386 100644
--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ b/lite/kernels/npu/bridges/CMakeLists.txt
@@ -49,6 +49,7 @@ lite_cc_library(subgraph_bridge_fill_constant_op_npu SRCS fill_constant_op.cc DE
 lite_cc_library(subgraph_bridge_fill_constant_batch_size_like_op_npu SRCS fill_constant_batch_size_like_op.cc DEPS ${npu_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_increment_op_npu SRCS increment_op.cc DEPS ${npu_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_compare_op_npu SRCS compare_op.cc DEPS ${npu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_expand_op_npu SRCS expand_op.cc DEPS ${npu_subgraph_bridge_deps})
 #lite_cc_library(subgraph_bridge_shape_op_npu SRCS shape_op.cc DEPS ${npu_subgraph_bridge_deps})
 
@@ -87,6 +88,7 @@ set(npu_subgraph_bridges
         subgraph_bridge_fill_constant_batch_size_like_op_npu
         subgraph_bridge_increment_op_npu
         subgraph_bridge_compare_op_npu
+        subgraph_bridge_expand_op_npu
         CACHE INTERNAL "npu_subgraph_bridges")
 
 message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}")
diff --git a/lite/kernels/npu/bridges/expand_op.cc b/lite/kernels/npu/bridges/expand_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..62501ab76c46c714af6be95c68b85d22e1e044c9
--- /dev/null
+++ b/lite/kernels/npu/bridges/expand_op.cc
@@ -0,0 +1,69 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/npu/bridges/graph.h"
+#include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"
+
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace npu {
+
+int ExpandConverter(void* ctx, OpLite* op, KernelBase* kernel) {
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";
+
+  // Get input, output and op attributes
+  auto x_name = op_info->Input("X").front();
+  auto x = scope->FindTensor(x_name);
+  auto x_dims = x->dims();
+
+  auto out_name = op_info->Output("Out").front();
+
+  auto expand_times = op_info->GetAttr<std::vector<int>>("expand_times");
+
+  // x node
+  std::shared_ptr<Node> x_node = nullptr;
+  if (graph->Has(x_name)) {
+    x_node = graph->Get(x_name);
+  } else {
+    x_node = graph->Add(x_name, *x);
+  }
+
+  // w node
+  std::shared_ptr<Node> w_node = graph->Add(out_name + "/w", expand_times);
+
+  // expand node
+  auto expand_node = graph->Add<ge::op::Expand>(out_name);
+  auto expand_op = expand_node->data<ge::op::Expand>();
+  expand_op->set_input_x(*x_node->data());
+  expand_op->set_input_w(*w_node->data());
+
+  return REBUILD_WHEN_SHAPE_CHANGED;
+}
+
+}  // namespace npu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_SUBGRAPH_BRIDGE(expand,
+                         kNPU,
+                         paddle::lite::subgraph::npu::ExpandConverter);
diff --git a/lite/kernels/npu/bridges/paddle_use_bridges.h b/lite/kernels/npu/bridges/paddle_use_bridges.h
index 3965bd64ff534eb1bc2506527e80cca8a19056b2..b6ce66fe34963d8c3bc9c2bccc0f3a294ab16290 100644
--- a/lite/kernels/npu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/npu/bridges/paddle_use_bridges.h
@@ -38,6 +38,7 @@ USE_SUBGRAPH_BRIDGE(elementwise_add, kNPU);
 USE_SUBGRAPH_BRIDGE(elementwise_sub, kNPU);
 USE_SUBGRAPH_BRIDGE(elementwise_mul, kNPU);
 USE_SUBGRAPH_BRIDGE(elementwise_div, kNPU);
+USE_SUBGRAPH_BRIDGE(expand, kNPU);
 USE_SUBGRAPH_BRIDGE(fusion_elementwise_add_activation, kNPU);
 USE_SUBGRAPH_BRIDGE(fusion_elementwise_sub_activation, kNPU);
 USE_SUBGRAPH_BRIDGE(fusion_elementwise_mul_activation, kNPU);
diff --git a/lite/tests/kernels/expand_compute_test.cc b/lite/tests/kernels/expand_compute_test.cc
index 4ab1c15a5e78f562bc4270cd57c5f0dd3600bbe2..75d5aa65f5a7eba179f5da23e2497434f9cdb1dc 100644
--- a/lite/tests/kernels/expand_compute_test.cc
+++ b/lite/tests/kernels/expand_compute_test.cc
@@ -84,7 +84,7 @@ class ExpandComputeTester : public arena::TestCase {
   }
 };
 
-void test_expand_3dim(Place place) {
+void test_expand_3dim(Place place, float abs_error) {
   for (std::vector<int> expand_times : {std::vector<int>({2, 3, 1}),
                                         std::vector<int>({2, 2, 2}),
                                         std::vector<int>({3, 1, 2})}) {
@@ -93,7 +93,7 @@
         for (int W : {4}) {
           std::unique_ptr<arena::TestCase> tester(new ExpandComputeTester(
               place, "def", expand_times, DDim({C, H, W})));
-          arena::Arena arena(std::move(tester), place, 2e-5);
+          arena::Arena arena(std::move(tester), place, abs_error);
           arena.TestPrecision();
         }
       }
@@ -101,7 +101,7 @@
   }
 }
 
-void test_expand_4dim(Place place) {
+void test_expand_4dim(Place place, float abs_error) {
   for (std::vector<int> expand_times : {std::vector<int>({2, 3, 1, 4}),
                                         std::vector<int>({2, 2, 2, 2}),
                                         std::vector<int>({3, 1, 2, 1})}) {
@@ -111,7 +111,7 @@
           for (int W : {4}) {
             std::unique_ptr<arena::TestCase> tester(new ExpandComputeTester(
                 place, "def", expand_times, DDim({N, C, H, W})));
-            arena::Arena arena(std::move(tester), place, 2e-5);
+            arena::Arena arena(std::move(tester), place, abs_error);
             arena.TestPrecision();
           }
         }
@@ -121,14 +121,19 @@
 }
 
 TEST(Expand, precision) {
-#ifdef LITE_WITH_X86
-  Place place(TARGET(kX86));
-#endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_expand_3dim(place);
-  test_expand_4dim(place);
+  float abs_error = 1e-5;
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kHost);
+#else
+  return;
 #endif
+
+  test_expand_3dim(place, abs_error);
+  test_expand_4dim(place, abs_error);
 }
 
 }  // namespace lite