diff --git a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc
index 95339d6175c98f22d542db24f02d6d714ccbe2a8..1afb54c692592ca42d8b120dcf1a91922e19149c 100644
--- a/lite/core/mir/subgraph/generate_npu_program_pass_test.cc
+++ b/lite/core/mir/subgraph/generate_npu_program_pass_test.cc
@@ -160,8 +160,8 @@ TEST(NPUSubgraph, compare) {
   TestModel(FLAGS_model_dir,
             FLAGS_model_file,
             FLAGS_params_file,
-            {lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
-             lite_api::Place{TARGET(kNPU), PRECISION(kFloat)}},
+            {lite_api::Place{TARGET(kNPU), PRECISION(kFloat)},
+             lite_api::Place{TARGET(kARM), PRECISION(kFloat)}},
             input_tensor_shape,
             FLAGS_optimized_model_dir + "/NPU");
   // verify results
diff --git a/lite/kernels/npu/bridges/CMakeLists.txt b/lite/kernels/npu/bridges/CMakeLists.txt
index d79fdeb3c16cae2fcc0964697bb31f3746164a9f..79d1bf2fd5fa694d4888d474c321a43d279bab76 100644
--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ b/lite/kernels/npu/bridges/CMakeLists.txt
@@ -21,6 +21,7 @@ lite_cc_library(npu_bridge_shuffle_channel_op SRCS shuffle_channel_op.cc DEPS ${npu_bridge_deps})
 lite_cc_library(npu_bridge_pad2d_op SRCS pad2d_op.cc DEPS ${npu_bridge_deps})
 lite_cc_library(npu_bridge_square_op SRCS square_op.cc DEPS ${npu_bridge_deps})
 lite_cc_library(npu_bridge_sqrt_op SRCS sqrt_op.cc DEPS ${npu_bridge_deps})
+lite_cc_library(npu_bridge_reduce_mean_op SRCS reduce_mean_op.cc DEPS ${npu_bridge_deps})
 
 set(npu_bridges
     npu_bridge_registry
@@ -43,6 +44,7 @@ set(npu_bridges
     npu_bridge_pad2d_op
     npu_bridge_square_op
     npu_bridge_sqrt_op
+    npu_bridge_reduce_mean_op
     CACHE INTERNAL "npu_bridges")
 
 set(npu_bridge_test_deps ${npu_bridges} ${npu_kernels} ${ops})
@@ -66,5 +68,6 @@ lite_cc_test(test_npu_bridge_shuffle_channel_op SRCS shuffle_channel_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
 lite_cc_test(test_npu_bridge_pad2d_op SRCS pad2d_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
 lite_cc_test(test_npu_bridge_square_op SRCS square_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
 lite_cc_test(test_npu_bridge_sqrt_op SRCS sqrt_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
+lite_cc_test(test_npu_bridge_reduce_mean_op SRCS reduce_mean_op_test.cc test_helper.cc DEPS ${npu_bridge_test_deps})
 
 message(STATUS "+++++ npu_bridges: ${npu_bridges}")
diff --git a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h
index 8b4252de06e8934affe7592fc8ea521ad7d20025..8cdfff1fb4365f9a855c5490ddc4677c6954830a 100644
--- a/lite/kernels/npu/bridges/paddle_use_npu_bridges.h
+++ b/lite/kernels/npu/bridges/paddle_use_npu_bridges.h
@@ -24,6 +24,9 @@ USE_NPU_BRIDGE(pool2d);
 USE_NPU_BRIDGE(relu);
 USE_NPU_BRIDGE(elementwise_add);
 USE_NPU_BRIDGE(fusion_elementwise_add_activation);
+USE_NPU_BRIDGE(elementwise_sub);
+USE_NPU_BRIDGE(elementwise_mul);
+USE_NPU_BRIDGE(elementwise_div);
 USE_NPU_BRIDGE(scale);
 USE_NPU_BRIDGE(softmax);
 USE_NPU_BRIDGE(concat);
@@ -36,3 +39,8 @@ USE_NPU_BRIDGE(bilinear_interp);
 USE_NPU_BRIDGE(conv2d_transpose);
 USE_NPU_BRIDGE(reshape);
 USE_NPU_BRIDGE(reshape2);
+USE_NPU_BRIDGE(sqrt);
+USE_NPU_BRIDGE(square);
+USE_NPU_BRIDGE(reduce_mean);
+USE_NPU_BRIDGE(tanh);
+USE_NPU_BRIDGE(nearest_interp);
diff --git a/lite/kernels/npu/bridges/reduce_mean_op.cc b/lite/kernels/npu/bridges/reduce_mean_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4725bdfb0e17c4f99dfd2359ff34c96f9e5af6e5
--- /dev/null
+++ b/lite/kernels/npu/bridges/reduce_mean_op.cc
@@ -0,0 +1,111 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/backends/npu/builder.h"
+#include "lite/kernels/npu/bridges/registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace npu {
+namespace bridges {
+
+node_map_type ReduceMeanConverter(
+    const std::shared_ptr<lite::OpLite> reduce_mean_op,
+    const node_map_type& inputs_map) {
+  auto scope = reduce_mean_op->scope();
+  auto op_info = reduce_mean_op->op_info();
+  auto op_type = op_info->Type();
+  auto unique_op_type = lite::npu::UniqueName(op_type);
+  LOG(INFO) << "[NPU] Converting " + op_type + "...";
+
+  // get input and op attributes
+  auto x_var_name = op_info->Input("X").front();
+  auto x_dims = scope->FindTensor(x_var_name)->dims();
+  auto keep_dim = op_info->GetAttr<bool>("keep_dim");
+  auto dim = op_info->GetAttr<std::vector<int>>("dim");
+  CHECK(!dim.empty()) << "\"dim\" of reduce_mean should not be empty.";
+  for (size_t i = 0; i < dim.size(); i++) {
+    if (dim[i] < 0) {
+      dim[i] += x_dims.size();
+    }
+  }
+  std::sort(dim.begin(), dim.end());
+
+  // create reduce_mean (reduce_sum + scale) nodes and set the input node
+  // from inputs_map
+  // create the reduce_sum node
+  auto unique_reduce_sum = lite::npu::UniqueName("reduce_sum");
+  auto reduce_sum_node =
+      std::make_shared<ge::op::ReduceSum>(unique_reduce_sum);
+  CHECK(inputs_map.count(x_var_name));
+  reduce_sum_node->set_input_x(*inputs_map.at(x_var_name));
+  lite::npu::OpList::Global().add(inputs_map.at(x_var_name));
+  lite::npu::OpList::Global().add(reduce_sum_node);
+
+  auto dim_const_node =
+      std::make_shared<ge::op::Const>(unique_reduce_sum + "/dim");
+  dim_const_node->set_attr_value(lite::npu::CreateTensorAndFillData(dim));
+  reduce_sum_node->set_input_w(*dim_const_node);
+  lite::npu::OpList::Global().add(dim_const_node);
+
+  reduce_sum_node->set_attr_keep_dims(keep_dim);
+
+  // create the scale node
+  auto unique_scale = lite::npu::UniqueName("scale");
+  auto scale_node = std::make_shared<ge::op::Scale>(unique_scale);
+  scale_node->set_input_x(*reduce_sum_node);
+  lite::npu::OpList::Global().add(scale_node);
+
+  float scale = 1;
+  for (size_t i = 0; i < dim.size(); i++) {
+    scale /= x_dims[dim[i]];
+  }
+
+  std::vector<int64_t> scale_bias_shape = x_dims.Vectorize();
+  if (keep_dim) {
+    for (size_t i = 0; i < dim.size(); i++) {
+      scale_bias_shape[dim[i]] = 1;
+    }
+  } else {
+    const int64_t kDelFlag = -2;
+    for (size_t i = 0; i < dim.size(); ++i) {
+      scale_bias_shape[dim[i]] = kDelFlag;
+    }
+    scale_bias_shape.erase(
+        remove(scale_bias_shape.begin(), scale_bias_shape.end(), kDelFlag),
+        scale_bias_shape.end());
+  }
+
+  auto filter_const_node =
+      std::make_shared<ge::op::Const>(unique_scale + "/filter");
+  filter_const_node->set_attr_value(
+      lite::npu::CreateTensorAndFillData(scale, scale_bias_shape));
+  scale_node->set_input_filter(*filter_const_node);
+  lite::npu::OpList::Global().add(filter_const_node);
+
+  scale_node->set_attr_axis(1);
+
+  node_map_type outputs_map;
+  outputs_map[op_info->Output("Out").front()] = scale_node;
+  return outputs_map;
+}
+
+}  // namespace bridges
+}  // namespace npu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+REGISTER_NPU_BRIDGE(reduce_mean,
+                    paddle::lite::kernels::npu::bridges::ReduceMeanConverter);
diff --git a/lite/kernels/npu/bridges/reduce_mean_op_test.cc b/lite/kernels/npu/bridges/reduce_mean_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8646ce5c25b367cf3c9055f1ed13a225149a9cc7
--- /dev/null
+++ b/lite/kernels/npu/bridges/reduce_mean_op_test.cc
@@ -0,0 +1,347 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/operators/reduce_mean_op.h"
+#include <algorithm>
+#include <gtest/gtest.h>
+#include <vector>
+#include "lite/core/op_registry.h"
+#include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/test_helper.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace npu {
+namespace bridges {
+
+void reduce_mean_n(const float* src,
+                   float* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int hw_size = height_in * width_in;
+  int chw_size = channel_in * hw_size;
+  int data_index, src_index;
+  for (int c = 0; c < channel_in; ++c) {
+    for (int h = 0; h < height_in; ++h) {
+      for (int w = 0; w < width_in; ++w) {
+        data_index = c * hw_size + h * width_in + w;
+        dst[data_index] = 0.0;
+        for (int n = 0; n < num_in; ++n) {
+          src_index = n * chw_size + data_index;
+          dst[data_index] += static_cast<float>(src[src_index]) / num_in;
+        }
+      }
+    }
+  }
+}
+
+void reduce_mean_c(const float* src,
+                   float* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int hw_size = height_in * width_in;
+  int chw_size = hw_size * channel_in;
+  int data_index, src_index0, src_index;
+  for (int n = 0; n < num_in; ++n) {
+    for (int h = 0; h < height_in; ++h) {
+      for (int w = 0; w < width_in; ++w) {
+        data_index = n * hw_size + h * width_in + w;
+        src_index0 = n * chw_size + h * width_in + w;
+        dst[data_index] = 0.0;
+        for (int c = 0; c < channel_in; ++c) {
+          src_index = src_index0 + c * hw_size;
+          dst[data_index] += static_cast<float>(src[src_index]) / channel_in;
+        }
+      }
+    }
+  }
+}
+
+void reduce_mean_h(const float* src,
+                   float* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int cw_size = channel_in * width_in;
+  int chw_size = cw_size * height_in;
+  int hw_size = height_in * width_in;
+  int data_index, src_index, src_index0;
+  for (int n = 0; n < num_in; ++n) {
+    for (int c = 0; c < channel_in; ++c) {
+      for (int w = 0; w < width_in; ++w) {
+        data_index = n * cw_size + c * width_in + w;
+        src_index0 = n * chw_size + c * hw_size + w;
+        dst[data_index] = 0.0;
+        for (int h = 0; h < height_in; ++h) {
+          src_index = src_index0 + h * width_in;
+          dst[data_index] += static_cast<float>(src[src_index]) / height_in;
+        }
+      }
+    }
+  }
+}
+
+void reduce_mean_w(const float* src,
+                   float* dst,
+                   int num_in,
+                   int channel_in,
+                   int height_in,
+                   int width_in) {
+  int ch_size = channel_in * height_in;
+  int hw_size = height_in * width_in;
+  int chw_size = ch_size * width_in;
+  int data_index = 0;
+  int src_index0 = 0;
+  int src_index = 0;
+  for (int n = 0; n < num_in; ++n) {
+    for (int c = 0; c < channel_in; ++c) {
+      for (int h = 0; h < height_in; ++h) {
+        data_index = n * ch_size + c * height_in + h;
+        src_index0 = n * chw_size + c * hw_size + h * width_in;
+        dst[data_index] = 0.0;
+        for (int w = 0; w < width_in; ++w) {
+          src_index = src_index0 + w;
+          dst[data_index] += static_cast<float>(src[src_index]) / width_in;
+        }
+      }
+    }
+  }
+}
+
+void reduce_mean_all(const float* src,
+                     float* dst,
+                     int num_in,
+                     int channel_in,
+                     int height_in,
+                     int width_in) {
+  float mean = 0.0;
+  int src_index;
+  int n_id, c_id;
+  int all = num_in * channel_in * height_in * width_in;
+  for (int n = 0; n < num_in; ++n) {
+    n_id = n * channel_in * height_in * width_in;
+    for (int c = 0; c < channel_in; ++c) {
+      c_id = c * height_in * width_in;
+      for (int h = 0; h < height_in; ++h) {
+        for (int w = 0; w < width_in; ++w) {
+          src_index = n_id + c_id + h * width_in + w;
+          // accumulate the mean over every element
+          mean += src[src_index] / all;
+        }
+      }
+    }
+  }
+  dst[0] = mean;
+}
+
+void reduce_mean_nc(const float* src,
+                    float* dst,
+                    int num_in,
+                    int channel_in,
+                    int height_in,
+                    int width_in) {
+  // reduce n first.
+  DDimLite ddimA({1, channel_in, height_in, width_in});
+  lite::Tensor tensor_tmp;
+  tensor_tmp.Resize(ddimA);
+  float* tmp_out = tensor_tmp.mutable_data<float>();
+  reduce_mean_n(src, tmp_out, num_in, channel_in, height_in, width_in);
+  reduce_mean_c(tmp_out, dst, 1, channel_in, height_in, width_in);
+}
+
+void reduce_mean_ch(const float* src,
+                    float* dst,
+                    int num_in,
+                    int channel_in,
+                    int height_in,
+                    int width_in) {
+  // reduce c first
+  DDimLite ddimA({num_in, 1, height_in, width_in});
+  lite::Tensor tensor_tmp;
+  tensor_tmp.Resize(ddimA);
+  float* tmp_out = tensor_tmp.mutable_data<float>();
+  reduce_mean_c(src, tmp_out, num_in, channel_in, height_in, width_in);
+  reduce_mean_h(tmp_out, dst, num_in, 1, height_in, width_in);
+}
+
+void reduce_mean_hw(const float* src,
+                    float* dst,
+                    int num_in,
+                    int channel_in,
+                    int height_in,
+                    int width_in) {
+  // reduce h first
+  DDimLite ddimA({num_in, channel_in, 1, width_in});
+  lite::Tensor tensor_tmp;
+  tensor_tmp.Resize(ddimA);
+  float* tmp_out = tensor_tmp.mutable_data<float>();
+  reduce_mean_h(src, tmp_out, num_in, channel_in, height_in, width_in);
+  reduce_mean_w(tmp_out, dst, num_in, channel_in, 1, width_in);
+}
+
+void reduce_mean_ref(const std::shared_ptr<operators::ReduceMeanOp> op) {
+  Scope* scope = op->scope();
+  const OpInfo* op_info = op->op_info();
+
+  auto x = scope->FindTensor("x");
+  auto x_dims = x->dims();
+  auto x_data = x->data<float>();
+  auto out = scope->FindMutableTensor("out_ref");
+
+  auto dim = op_info->GetAttr<std::vector<int>>("dim");
+  auto keep_dim = op_info->GetAttr<bool>("keep_dim");
+
+  auto x_rank = x_dims.size();
+  if (!dim.empty()) {
+    for (size_t i = 0; i < dim.size(); i++) {
+      if (dim[i] < 0) {
+        dim[i] += x_rank;
+      }
+    }
+  }
+
+  bool reduce_all = false;
+  std::sort(dim.begin(), dim.end());
+  if (dim.size() == 0) {
+    reduce_all = true;
+  }
+
+  std::vector<int64_t> out_dims;
+  if (reduce_all) {
+    if (keep_dim) {
+      for (size_t i = 0; i < x_dims.size(); i++) {
+        out_dims.push_back(1);
+      }
+    } else {
+      out_dims.push_back(1);
+    }
+  } else {
+    for (size_t i = 0; i < x_dims.size(); i++) {
+      out_dims.push_back(x_dims[i]);
+    }
+    if (keep_dim) {
+      for (size_t i = 0; i < dim.size(); ++i) {
+        out_dims[dim[i]] = 1L;
+      }
+    } else {
+      const int64_t kDelFlag = -2;
+      for (size_t i = 0; i < dim.size(); ++i) {
+        out_dims[dim[i]] = kDelFlag;
+      }
+      out_dims.erase(remove(out_dims.begin(), out_dims.end(), kDelFlag),
+                     out_dims.end());
+    }
+    out->Resize(DDim(out_dims));
+  }
+
+  auto out_data = out->mutable_data<float>();
+  int in_n = x_dims[0];
+  int in_c = x_dims[1];
+  int in_h = x_dims[2];
+  int in_w = x_dims[3];
+
+  if (dim.size() == 0) {
+    reduce_mean_all(x_data, out_data, in_n, in_c, in_h, in_w);
+  } else if (dim.size() == 1) {
+    switch (dim[0]) {
+      case 0:
+        reduce_mean_n(x_data, out_data, in_n, in_c, in_h, in_w);
+        break;
+      case 1:
+        reduce_mean_c(x_data, out_data, in_n, in_c, in_h, in_w);
+        break;
+      case 2:
+        reduce_mean_h(x_data, out_data, in_n, in_c, in_h, in_w);
+        break;
+      case 3:
+        reduce_mean_w(x_data, out_data, in_n, in_c, in_h, in_w);
+        break;
+      default:
+        LOG(FATAL) << "invalid dim: " << dim[0];
+    }
+  } else if (dim.size() == 2) {
+    if (dim[0] == 0 && dim[1] == 1) {
+      reduce_mean_nc(x_data, out_data, in_n, in_c, in_h, in_w);
+    } else if (dim[0] == 1 && dim[1] == 2) {
+      reduce_mean_ch(x_data, out_data, in_n, in_c, in_h, in_w);
+    } else if (dim[0] == 2 && dim[1] == 3) {
+      reduce_mean_hw(x_data, out_data, in_n, in_c, in_h, in_w);
+    } else {
+      LOG(FATAL) << "invalid dim combination!";
+    }
+  }
+}
+
+void test_reduce_mean(const std::vector<int64_t>& input_shape,
+                      std::vector<int> dim,
+                      bool keep_dim) {
+  // prepare input&output variables
+  Scope scope;
+  std::string x_var_name("x");
+  std::string out_var_name("out");
+  std::string out_ref_var_name("out_ref");
+  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
+  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
+  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
+  x->Resize(input_shape);
+
+  // initialize input&output data
+  FillTensor<float>(x);
+
+  // initialize op desc
+  cpp::OpDesc opdesc;
+  opdesc.SetType("reduce_mean");
+  opdesc.SetInput("X", {x_var_name});
+  opdesc.SetOutput("Out", {out_var_name});
+  opdesc.SetAttr("dim", dim);
+  opdesc.SetAttr("keep_dim", keep_dim);
+
+  // create and convert op to NPU model, then run it on NPU
+  auto op = CreateOp<operators::ReduceMeanOp>(opdesc, &scope);
+  LauchOp(op, {x_var_name}, {out_var_name});
+
+  // execute reference implementation and save to output tensor
+  reduce_mean_ref(op);
+
+  // compare results
+  auto* out_data = out->mutable_data<float>();
+  auto* out_ref_data = out_ref->mutable_data<float>();
+  for (int i = 0; i < out->dims().production(); i++) {
+    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
+  }
+}
+
+TEST(NPUBridges, reduce_mean) {
+  std::vector<std::vector<int>> reduce_dim{
+      {0}, {1}, {2}, {3}, {0, 1}, {1, 2}, {2, 3}, {-2, -1}};
+  for (auto dim : reduce_dim) {
+    for (auto keep_dim : {true, false}) {
+      test_reduce_mean({1, 2, 3, 4}, dim, keep_dim);
+    }
+  }
+}
+
+}  // namespace bridges
+}  // namespace npu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+USE_LITE_OP(reduce_mean);
+USE_NPU_BRIDGE(reduce_mean);
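
A minimal standalone sketch of the arithmetic the converter relies on, not part of the patch: reduce_mean over a set of axes equals reduce_sum over those axes multiplied by 1/prod(reduced extents), which is the constant the bridge feeds to the Scale node's filter input. The helper names below (compute_scale_factor, reduced_shape) are illustrative only and do not exist in the patch.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // 1 / product of the reduced extents -- the value broadcast through the
    // Scale node's constant filter (e.g. 1/12 for x_dims {1, 2, 3, 4}, dim {2, 3}).
    float compute_scale_factor(const std::vector<int64_t>& x_dims,
                               const std::vector<int>& dim) {
      float scale = 1.0f;
      for (int d : dim) scale /= x_dims[d];
      return scale;
    }

    // Output shape under the keep_dim attribute: reduced axes collapse to 1 when
    // keep_dim is true and are erased otherwise, mirroring the kDelFlag sentinel
    // trick used in both the converter and reduce_mean_ref above.
    std::vector<int64_t> reduced_shape(std::vector<int64_t> shape,
                                       const std::vector<int>& dim,
                                       bool keep_dim) {
      const int64_t kDelFlag = -2;
      for (int d : dim) shape[d] = keep_dim ? 1 : kDelFlag;
      shape.erase(std::remove(shape.begin(), shape.end(), kDelFlag), shape.end());
      return shape;
    }

For the test's {1, 2, 3, 4} input with dim {2, 3}, this gives a scale of 1/12 and an output shape of {1, 2} (or {1, 2, 1, 1} with keep_dim set).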
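The {-2, -1} case in the test only works because both the converter and reduce_mean_ref wrap negative axes by the tensor rank before sorting, making it equivalent to {2, 3}. A sketch of that normalization step (normalize_axes is an illustrative name, assuming a non-negative rank):

    #include <algorithm>
    #include <vector>

    // Wrap negative axes by the tensor rank, then sort ascending,
    // mirroring the loop at the top of ReduceMeanConverter.
    std::vector<int> normalize_axes(std::vector<int> dim, int rank) {
      for (int& d : dim) {
        if (d < 0) d += rank;  // e.g. -1 -> 3 for a 4-D tensor
      }
      std::sort(dim.begin(), dim.end());
      return dim;
    }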