[NPU] enhance unittest for shuffle_channel, unsqueeze, pool (#2730)

* [NPU] enhance unittest for shuffle_channel, unsqueeze, pool test=develop

[NPU] enhance unittest for shuffle_channel, unsqueeze, pool (#2730)
* [NPU] enhance unittest for shuffle_channel, unsqueeze, pool test=develop
08afd3aa · zhupengyang · GitHub · 8e7906d0 · 08afd3aa · 8e7906d0
9 changed files
--- a/lite/kernels/npu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/npu/bridges/paddle_use_bridges.h
@@ -51,3 +51,5 @@ USE_SUBGRAPH_BRIDGE(sqrt, kNPU);
 USE_SUBGRAPH_BRIDGE(square, kNPU);
 USE_SUBGRAPH_BRIDGE(transpose, kNPU);
 USE_SUBGRAPH_BRIDGE(transpose2, kNPU);
+USE_SUBGRAPH_BRIDGE(unsqueeze, kNPU);
+USE_SUBGRAPH_BRIDGE(unsqueeze2, kNPU);
--- a/lite/kernels/npu/bridges/pool_op_test.cc
+++ b/lite/kernels/npu/bridges/pool_op_test.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/operators/pool_op.h"
-#include <gtest/gtest.h>
-#include <random>
-#include "lite/core/op_registry.h"
-#include "lite/kernels/npu/bridges/registry.h"
-#include "lite/kernels/npu/bridges/test_helper.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace npu {
-namespace bridges {
-
-void pool_ref(const std::shared_ptr<operators::PoolOpLite> op) {
-  Scope* scope = op->scope();
-  const OpInfo* op_info = op->op_info();
-  auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
-  auto out =
-      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
-  auto& in_dims = x->dims();
-  auto& out_dims = out->dims();
-
-  const float* src_ptr = x->data<const float>();
-  float* dst_ptr = out->mutable_data<float>();
-
-  std::vector<int> ksize = op_info->GetAttr<std::vector<int>>("ksize");
-  std::vector<int> strides = op_info->GetAttr<std::vector<int>>("strides");
-  std::vector<int> paddings = op_info->GetAttr<std::vector<int>>("paddings");
-  bool exclusive = op_info->GetAttr<bool>("exclusive");
-  std::string pooling_type = op_info->GetAttr<std::string>("pooling_type");
-  bool global_pooling = op_info->GetAttr<bool>("global_pooling");
-
-  int in_n = in_dims[0];
-  int in_c = in_dims[1];
-  int in_h = in_dims[2];
-  int in_w = in_dims[3];
-  int size_in_n = in_c * in_h * in_w;
-  int size_in_c = in_h * in_w;
-
-  int out_h = out_dims[2];
-  int out_w = out_dims[3];
-  int size_out_n = in_c * out_h * out_w;
-  int size_out_c = out_h * out_w;
-
-  int window_h = ksize[0];
-  int window_w = ksize[1];
-  int stride_h = strides[0];
-  int stride_w = strides[1];
-  int pad_h = paddings[0];
-  int pad_w = paddings[2];
-
-  if (global_pooling == true) {
-    for (int n = 0; n < in_n; ++n) {
-      for (int c = 0; c < in_c; ++c) {
-        const float* src = src_ptr + n * size_in_n + c * size_in_c;
-        float res = src[0];
-        if (pooling_type == "max") {
-          for (int i = 1; i < size_in_c; ++i) {
-            float cur_val = src[i];
-            res = cur_val > res ? cur_val : res;
-          }
-        } else if (pooling_type == "avg") {
-          for (int i = 1; i < size_in_c; ++i) {
-            float cur_val = src[i];
-            res += cur_val;
-          }
-          res /= size_in_c;
-        }
-        dst_ptr[n * size_out_n + c] = res;
-      }
-    }
-  } else {
-    for (int n = 0; n < in_n; ++n) {
-      for (int c = 0; c < in_c; ++c) {
-        for (int h = 0; h < out_h; ++h) {
-          int sh = h * stride_h;
-          int eh = sh + window_h;
-          sh = (sh - pad_h) < 0 ? 0 : sh - pad_h;
-          eh = (eh - pad_h) > in_h ? in_h : eh - pad_h;
-          for (int w = 0; w < out_w; ++w) {
-            int sw = w * stride_w;
-            int ew = sw + window_w;
-            sw = (sw - pad_w) < 0 ? 0 : sw - pad_w;
-            ew = (ew - pad_w) > in_w ? in_w : ew - pad_w;
-            int pooling_size = (ew - sw) * (eh - sh);
-            if (pooling_size == 0) continue;
-            float res = 0.f;
-            for (int kh = sh; kh < eh; ++kh) {
-              for (int kw = sw; kw < ew; ++kw) {
-                int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw;
-                if (kh == sh && kw == sw) {
-                  res = src_ptr[src_idx];
-                } else {
-                  if (pooling_type == "max") {
-                    res = res >= src_ptr[src_idx] ? res : src_ptr[src_idx];
-                  }
-                  if (pooling_type == "avg") {
-                    res += src_ptr[src_idx];
-                  }
-                }
-              }
-            }
-            if (pooling_type == "avg") {
-              if (exclusive) {
-                res /= pooling_size;
-              } else {
-                res /= window_h * window_w;
-              }
-            }
-            dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res;
-          }
-        }
-      }
-    }
-  }
-}
-
-void test_pool(int bs,
-               int ic,
-               int ih,
-               int iw,
-               std::string pooling_type,
-               bool ceil_mode,
-               bool global_pooling,
-               bool exclusive,
-               int ksize,
-               int stride,
-               int padding) {
-  // prepare input&output variables
-  Scope scope;
-  std::string x_var_name = "x";
-  std::string out_var_name = "out";
-  std::string out_ref_var_name = "out_ref";
-  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
-  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
-  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
-  x->Resize({bs, ic, ih, iw});
-
-  // initialize input&output data
-  FillTensor<float>(x);
-
-  // initialize op desc
-  cpp::OpDesc opdesc;
-  opdesc.SetType("pool2d");
-  opdesc.SetInput("X", {x_var_name});
-  opdesc.SetOutput("Out", {out_var_name});
-  opdesc.SetAttr("pooling_type", pooling_type);
-  opdesc.SetAttr("ksize", std::vector<int>({ksize, ksize}));
-  opdesc.SetAttr("global_pooling", global_pooling);
-  opdesc.SetAttr("exclusive", exclusive);
-  opdesc.SetAttr("strides", std::vector<int>({stride, stride}));
-  opdesc.SetAttr("paddings",
-                 std::vector<int>({padding, padding, padding, padding}));
-
-  // create and convert op to NPU model, then run it on NPU
-  auto op = CreateOp<operators::PoolOpLite>(opdesc, &scope);
-  LauchOp(op, {x_var_name}, {out_var_name});
-  out_ref->CopyDataFrom(*out);
-
-  // execute reference implementation and save to output tensor
-  pool_ref(op);
-
-  // compare results
-  auto* out_data = out->mutable_data<float>();
-  auto* out_ref_data = out_ref->mutable_data<float>();
-  for (int i = 0; i < out->dims().production(); i++) {
-    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
-  }
-}
-
-TEST(NPUBridges, pool) {
-  for (auto pooling_type : {"max", "avg"}) {
-    for (auto ceil_mode : {true, false}) {
-      for (auto global_pooling : {/*true, */ false}) {
-        for (auto exclusive : {true /*, false*/}) {
-          for (auto ksize : {2, 3}) {
-            for (auto stride : {1, 2}) {
-              for (auto padding : {0, 1}) {
-                for (auto bs : {1, 3}) {
-                  for (auto ic : {1, 3}) {
-                    for (auto ih : {3, 7}) {
-                      for (auto iw : {3, 7}) {
-                        test_pool(bs,
-                                  ic,
-                                  ih,
-                                  iw,
-                                  pooling_type,
-                                  ceil_mode,
-                                  global_pooling,
-                                  exclusive,
-                                  ksize,
-                                  stride,
-                                  padding);
-                      }
-                    }
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-  for (auto pooling_type : {"max", "avg"}) {
-    for (auto ceil_mode : {true, false}) {
-      bool global_pooling = true;
-      bool exclusive = true;
-      int ksize = 2;
-      int stride = 1;
-      int padding = 0;
-      int bs = 6;
-      int ic = 6;
-      int ih = 6;
-      int iw = 6;
-      test_pool(bs,
-                ic,
-                ih,
-                iw,
-                pooling_type,
-                ceil_mode,
-                global_pooling,
-                exclusive,
-                ksize,
-                stride,
-                padding);
-    }
-  }
-}
-
-}  // namespace bridges
-}  // namespace npu
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(pool2d);
-USE_NPU_BRIDGE(pool2d);
--- a/lite/kernels/npu/bridges/shuffle_channel_op_test.cc
+++ b/lite/kernels/npu/bridges/shuffle_channel_op_test.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/operators/shuffle_channel_op.h"
-#include <gtest/gtest.h>
-#include "lite/core/op_registry.h"
-#include "lite/kernels/npu/bridges/registry.h"
-#include "lite/kernels/npu/bridges/test_helper.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace npu {
-namespace bridges {
-
-void shuffle_channel_ref(
-    const std::shared_ptr<operators::ShuffleChannelOpLite> op) {
-  Scope* scope = op->scope();
-  const OpInfo* op_info = op->op_info();
-  auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
-  auto out =
-      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
-  auto x_data = x->mutable_data<float>();
-  auto out_data = out->mutable_data<float>();
-  int group = op_info->GetAttr<int>("group");
-  auto x_dims = x->dims();
-
-  int n_size = x_dims.production() / x_dims[0];
-  int c_size = n_size / x_dims[1];
-  for (int n = 0; n < x_dims[0]; n++) {
-    int g_num = x_dims[1] / group;
-    auto tmp_out_data = out_data;
-    for (int g = 0; g < g_num; g++) {
-      auto tmp_x_data = x_data + g * c_size;
-      for (int i = 0; i < group; i++) {
-        std::memcpy(tmp_out_data,
-                    tmp_x_data + i * g_num * c_size,
-                    c_size * sizeof(float));
-        tmp_out_data += c_size;
-      }
-    }
-    x_data += n_size;
-    out_data += n_size;
-  }
-}
-
-void test_shuffle_channel(int bs, int ic, int ih, int iw, int group) {
-  // prepare input&output variables
-  Scope scope;
-  std::string x_var_name = "x";
-  std::string out_var_name = "out";
-  std::string out_ref_var_name = "out_ref";
-  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
-  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
-  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
-  x->Resize({bs, ic, ih, iw});
-
-  // initialize input&output data
-  FillTensor<float>(x);
-
-  // initialize op desc
-  cpp::OpDesc opdesc;
-  opdesc.SetType("shuffle_channel");
-  opdesc.SetInput("X", {x_var_name});
-  opdesc.SetOutput("Out", {out_var_name});
-  opdesc.SetAttr("group", group);
-
-  // create and convert op to NPU model, then run it on NPU
-  auto op = CreateOp<operators::ShuffleChannelOpLite>(opdesc, &scope);
-  LauchOp(op, {x_var_name}, {out_var_name});
-  out_ref->CopyDataFrom(*out);
-
-  // execute reference implementation and save to output tensor
-  shuffle_channel_ref(op);
-
-  // compare results
-  auto* out_data = out->mutable_data<float>();
-  auto* out_ref_data = out_ref->mutable_data<float>();
-  for (int i = 0; i < out->dims().production(); i++) {
-    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
-  }
-}
-
-TEST(NPUBridges, softmax) {
-  for (auto bs : {1, 4}) {
-    for (auto ic : {1, 24, 35}) {
-      for (auto ih : {1, 4}) {
-        for (auto iw : {1, 4}) {
-          for (auto group : {1, 3, 7, 24, 35}) {
-            if (ic % group != 0) continue;
-            test_shuffle_channel(bs, ic, ih, iw, group);
-          }
-        }
-      }
-    }
-  }
-}
-
-}  // namespace bridges
-}  // namespace npu
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(shuffle_channel);
-USE_NPU_BRIDGE(shuffle_channel);
--- a/lite/kernels/npu/bridges/unsqueeze_op.cc
+++ b/lite/kernels/npu/bridges/unsqueeze_op.cc
@@ -32,13 +32,12 @@ int UnsqueezeConverter(void* ctx, OpLite* op, KernelBase* kernel) {

  auto x_name = op_info->Input("X").front();
  auto x_type = kernel->GetInputDeclType("X");
-  CHECK(x_type->precision() == PRECISION(kFloat));
  CHECK(x_type->layout() == DATALAYOUT(kNCHW));
  auto x = scope->FindMutableTensor(x_name);
  auto x_dims = x->dims();
+
  auto out_name = op_info->Output("Out").front();
  auto out_type = kernel->GetOutputDeclType("Out");
-  CHECK(out_type->precision() == PRECISION(kFloat));
  CHECK(out_type->layout() == DATALAYOUT(kNCHW));
  auto out_shape = scope->FindTensor(out_name)->dims().Vectorize();
  CHECK(op_info->HasAttr("axes"))

--- a/lite/kernels/npu/bridges/unsqueeze_op_test.cc
+++ b/lite/kernels/npu/bridges/unsqueeze_op_test.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/operators/unsqueeze_op.h"
-#include <gtest/gtest.h>
-#include <cmath>
-#include "lite/core/op_registry.h"
-#include "lite/kernels/npu/bridges/registry.h"
-#include "lite/kernels/npu/bridges/test_helper.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace npu {
-namespace bridges {
-
-static DDim GetOutputShape(const std::vector<int>& unsqz_dims,
-                           const DDim& in_dims) {
-  int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size());
-  int cur_output_size = in_dims.size();
-  std::vector<int64_t> output_shape(output_size, 0);
-
-  // Validate Check: rank range.
-  CHECK_LE(output_size, 6) << "The output tensor's rank should be less than 6.";
-
-  for (int axis : unsqz_dims) {
-    int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
-    // Validate Check: the axis bound
-    CHECK((cur >= 0) && (cur <= cur_output_size))
-        << "The unsqueeze dims must be within range of current rank.";
-    // Move old axis, and insert new axis
-    for (int i = cur_output_size; i >= cur; --i) {
-      if (output_shape[i] == 1) {
-        // Move axis
-        output_shape[i + 1] = 1;
-        output_shape[i] = 0;
-      }
-    }
-
-    output_shape[cur] = 1;
-    // Add the output size.
-    cur_output_size++;
-  }
-
-  // Make output shape
-  for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) {
-    if (output_shape[out_idx] == 0) {
-      output_shape[out_idx] = in_dims[in_idx++];
-    }
-  }
-
-  return DDim(output_shape);
-}
-
-template <typename dtype>
-void unsqueeze_ref(const std::shared_ptr<operators::UnsqueezeOp> op) {
-  Scope* scope = op->scope();
-  const OpInfo* op_info = op->op_info();
-
-  auto x = scope->FindTensor("x");
-  auto out = scope->FindMutableTensor("out_ref");
-  auto axes = op_info->GetAttr<std::vector<int>>("axes");
-  auto y_dims = GetOutputShape(axes, x->dims());
-  out->Resize(y_dims);
-
-  auto x_data = x->data<dtype>();
-  auto out_data = out->mutable_data<dtype>();
-
-  memcpy(out_data, x_data, x->numel() * sizeof(float));
-}
-
-void test_unsqueeze(const std::vector<int64_t>& input_shape,
-                    std::vector<int> axes) {
-  // prepare input&output variables
-  Scope scope;
-  std::string x_var_name = "x";
-  std::string out_var_name = "out";
-  std::string out_ref_var_name = "out_ref";
-  auto* x = scope.NewTensor(x_var_name);
-  auto* out = scope.NewTensor(out_var_name);
-  auto* out_ref = scope.NewTensor(out_ref_var_name);
-  x->Resize(input_shape);
-
-  // initialize input&output data
-  FillTensor<float>(x);
-
-  // initialize op desc
-  cpp::OpDesc opdesc;
-  opdesc.SetType("unsqueeze");
-  opdesc.SetInput("X", {x_var_name});
-  opdesc.SetOutput("Out", {out_var_name});
-  opdesc.SetAttr("axes", axes);
-
-  // create and convert op to NPU model, then run it on NPU
-  auto op = CreateOp<operators::UnsqueezeOp>(opdesc, &scope);
-  LauchOp(op, {x_var_name}, {out_var_name});
-
-  // execute reference implementation and save to output tensor
-  unsqueeze_ref<float>(op);
-
-  // compare results
-  CHECK_EQ(out->dims().size(), out_ref->dims().size());
-  for (int i = 0; i < out->dims().size(); i++) {
-    CHECK_EQ(out->dims()[i], out_ref->dims()[i]);
-  }
-
-  auto* out_data = out->mutable_data<float>();
-  auto* out_ref_data = out_ref->mutable_data<float>();
-  for (int i = 0; i < out->dims().production(); i++) {
-    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
-  }
-}
-
-TEST(NPUBridges, unsqueeze) {
-  test_unsqueeze({2}, {0, 2});
-  test_unsqueeze({2, 3}, {1, 3});
-  test_unsqueeze({1, 2, 3}, {3});
-  test_unsqueeze({5, 6, 7}, {1});
-}
-
-}  // namespace bridges
-}  // namespace npu
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(unsqueeze);
-USE_NPU_BRIDGE(unsqueeze);
--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
 if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH_ARM))
    lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
@@ -25,19 +25,20 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA) AND (LITE_WITH_X86 OR LITE_WITH
    #lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${npu_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${npu_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})

 if(LITE_BUILD_EXTRA)
    lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})

--- a/lite/tests/kernels/pool_compute_test.cc
+++ b/lite/tests/kernels/pool_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include "lite/api/paddle_use_kernels.h"
+#include "lite/api/paddle_use_ops.h"
+#include "lite/core/arena/framework.h"
+#include "lite/tests/utils/fill_data.h"
+
+namespace paddle {
+namespace lite {
+
+class PoolComputeTest : public arena::TestCase {
+ protected:
+  // common attributes for this op.
+  std::string op_type_ = "pool2d";
+  std::string x_ = "x";
+  std::string out_ = "out";
+  DDim dims_{{1, 2, 3, 4}};
+  std::string pooling_type_ = "max";
+  bool global_pooling_ = false;
+  std::vector<int> strides_{1, 1};
+  std::vector<int> paddings_{0, 0};
+  std::vector<int> ksize_{2, 2};
+  bool exclusive_ = true;
+  bool ceil_mode_ = false;
+  bool adaptive_ = false;
+  std::string padding_algorithm_;
+
+ public:
+  PoolComputeTest(const Place& place,
+                  const std::string& alias,
+                  DDim dims,
+                  std::string pooling_type,
+                  bool global_pooling,
+                  std::vector<int> strides = {1, 1},
+                  std::vector<int> paddings = {0, 0},
+                  std::vector<int> ksize = {2, 2},
+                  bool exclusive = true,
+                  bool ceil_mode = false,
+                  bool adaptive = false,
+                  std::string padding_algorithm = "")
+      : TestCase(place, alias),
+        dims_(dims),
+        pooling_type_(pooling_type),
+        global_pooling_(global_pooling),
+        strides_(strides),
+        paddings_(paddings),
+        ksize_(ksize),
+        exclusive_(exclusive),
+        ceil_mode_(ceil_mode),
+        adaptive_(adaptive) {}
+
+  void RunBaseline(Scope* scope) override {
+    std::vector<int> paddings_new{paddings_};
+    if (paddings_new.size() == 1L) {
+      paddings_new = std::vector<int>(4, paddings_new[0]);
+    } else if (paddings_new.size() == 2L) {
+      paddings_new.insert(paddings_new.begin(), paddings_new[0]);
+      paddings_new.insert(paddings_new.begin() + 2, paddings_new[2]);
+    }
+    CHECK_EQ(paddings_new.size(), 4L);
+    if (padding_algorithm_ == "SAME") {
+      for (int i = 0; i < strides_.size(); ++i) {
+        int out_size = (dims_[i + 2] + strides_[i] - 1) / strides_[i];
+        int pad_sum =
+            std::max((out_size - 1) * strides_[i] + ksize_[i] - dims_[i + 2],
+                     (int64_t)0);
+        int pad_0 = pad_sum / 2;
+        int pad_1 = pad_sum - pad_0;
+        *(paddings_new.begin() + i * 2) = pad_0;
+        *(paddings_new.begin() + i * 2 + 1) = pad_1;
+      }
+    }
+    if (padding_algorithm_ == "VALID" || global_pooling_ || adaptive_) {
+      for (size_t i = 0; i < paddings_new.size(); i++) {
+        paddings_new[i] = 0;
+      }
+    }
+
+    std::vector<int> ksize_new{ksize_};
+    if (global_pooling_) {
+      ksize_new.clear();
+      ksize_new.push_back(dims_[2]);
+      ksize_new.push_back(dims_[3]);
+    }
+
+    std::vector<int64_t> out_shape{dims_[0], dims_[1]};
+    if (adaptive_) {
+      out_shape.insert(out_shape.end(), ksize_new.begin(), ksize_new.end());
+    } else {
+      for (size_t i = 0; i < ksize_new.size(); ++i) {
+        int out_size;
+        if (!ceil_mode_) {
+          out_size = (dims_[i + 2] - ksize_new[i] + paddings_new[2 * i] +
+                      paddings_new[2 * i + 1]) /
+                         strides_[i] +
+                     1;
+        } else {
+          out_size = (dims_[i + 2] - ksize_new[i] + paddings_new[2 * i] +
+                      paddings_new[2 * i + 1] + strides_[i] - 1) /
+                         strides_[i] +
+                     1;
+        }
+        out_shape.push_back(out_size);
+      }
+    }
+
+    auto out = scope->NewTensor(out_);
+    CHECK(out);
+    out->Resize(DDim(out_shape));
+    auto out_dims = out->dims();
+    auto dst_ptr = out->mutable_data<float>();
+
+    auto x = scope->FindTensor(x_);
+    auto src_ptr = x->data<float>();
+
+    int in_n = dims_[0];
+    int in_c = dims_[1];
+    int in_h = dims_[2];
+    int in_w = dims_[3];
+    int size_in_n = in_c * in_h * in_w;
+    int size_in_c = in_h * in_w;
+
+    int out_h = out_dims[2];
+    int out_w = out_dims[3];
+    int size_out_n = in_c * out_h * out_w;
+    int size_out_c = out_h * out_w;
+
+    int window_h = ksize_new[0];
+    int window_w = ksize_new[1];
+    int stride_h = strides_[0];
+    int stride_w = strides_[1];
+    int pad_t = paddings_new[0];
+    int pad_l = paddings_new[2];
+
+    if (global_pooling_) {
+      for (int n = 0; n < in_n; ++n) {
+        for (int c = 0; c < in_c; ++c) {
+          const float* src = src_ptr + n * size_in_n + c * size_in_c;
+          float res = src[0];
+          if (pooling_type_ == "max") {
+            for (int i = 1; i < size_in_c; ++i) {
+              float cur_val = src[i];
+              res = cur_val > res ? cur_val : res;
+            }
+          } else if (pooling_type_ == "avg") {
+            for (int i = 1; i < size_in_c; ++i) {
+              float cur_val = src[i];
+              res += cur_val;
+            }
+            res /= size_in_c;
+          }
+          dst_ptr[n * size_out_n + c] = res;
+        }
+      }
+    } else {
+      for (int n = 0; n < in_n; ++n) {
+        for (int c = 0; c < in_c; ++c) {
+          for (int h = 0; h < out_h; ++h) {
+            int sh = h * stride_h;
+            int eh = sh + window_h;
+            sh = (sh - pad_t) < 0 ? 0 : sh - pad_t;
+            eh = (eh - pad_t) > in_h ? in_h : eh - pad_t;
+            for (int w = 0; w < out_w; ++w) {
+              int sw = w * stride_w;
+              int ew = sw + window_w;
+              sw = (sw - pad_l) < 0 ? 0 : sw - pad_l;
+              ew = (ew - pad_l) > in_w ? in_w : ew - pad_l;
+              int pooling_size = (ew - sw) * (eh - sh);
+              if (pooling_size == 0) continue;
+              float res = 0.f;
+              for (int kh = sh; kh < eh; ++kh) {
+                for (int kw = sw; kw < ew; ++kw) {
+                  int src_idx = n * size_in_n + c * size_in_c + kh * in_w + kw;
+                  if (kh == sh && kw == sw) {
+                    res = src_ptr[src_idx];
+                  } else {
+                    if (pooling_type_ == "max") {
+                      res = res >= src_ptr[src_idx] ? res : src_ptr[src_idx];
+                    }
+                    if (pooling_type_ == "avg") {
+                      res += src_ptr[src_idx];
+                    }
+                  }
+                }
+              }
+              if (pooling_type_ == "avg") {
+                if (exclusive_) {
+                  res /= pooling_size;
+                } else {
+                  res /= window_h * window_w;
+                }
+              }
+              dst_ptr[n * size_out_n + c * size_out_c + h * out_w + w] = res;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  void PrepareOpDesc(cpp::OpDesc* op_desc) {  // Writes this case's op type, X/Out names and pooling attributes into op_desc.
+    op_desc->SetType(op_type_);
+    op_desc->SetInput("X", {x_});
+    op_desc->SetOutput("Out", {out_});
+    op_desc->SetAttr("pooling_type", pooling_type_);
+    op_desc->SetAttr("global_pooling", global_pooling_);
+    op_desc->SetAttr("strides", strides_);
+    op_desc->SetAttr("paddings", paddings_);
+    op_desc->SetAttr("ksize", ksize_);
+    op_desc->SetAttr("exclusive", exclusive_);
+    op_desc->SetAttr("ceil_mode", ceil_mode_);
+    op_desc->SetAttr("adaptive", adaptive_);
+    if (!padding_algorithm_.empty()) {  // "padding_algorithm" is left unset when the string is empty.
+      op_desc->SetAttr("padding_algorithm", padding_algorithm_);
+    }
+  }
+
+  void PrepareData() override {  // Creates the input tensor x_ with shape dims_ from a freshly filled host buffer.
+    std::vector<float> din(dims_.production());
+    fill_data_rand(din.data(), -1.f, 1.f, dims_.production());  // presumably random values within [-1, 1]; helper not shown here - TODO confirm
+    SetCommonTensor(x_, dims_, din.data());
+  }
+};
+
+void TestPoolGlobal(Place place, float abs_error = 2e-5) {  // Precision run for "max" and "avg" on a {2, 3, 4, 5} input; abs_error is handed to arena::Arena.
+  for (auto dims : std::vector<std::vector<int64_t>>{{2, 3, 4, 5}}) {
+    for (std::string pooling_type : {"max", "avg"}) {
+      std::unique_ptr<arena::TestCase> tester(
+          new PoolComputeTest(place, "def", DDim(dims), pooling_type, true));  // trailing `true` is presumably global_pooling (ctor not shown) - TODO confirm
+      arena::Arena arena(std::move(tester), place, abs_error);
+      arena.TestPrecision();
+    }
+  }
+}
+
+void TestPoolAlgorithm(Place place, float abs_error = 2e-5) {  // Precision run over every pooling_type x padding_algorithm pair listed below.
+  for (auto dims : std::vector<std::vector<int64_t>>{{2, 3, 4, 5}}) {
+    for (auto pooling_type : {"max", "avg"}) {
+      for (auto padding_algorithm : {"SAME", "VALID"}) {
+        std::unique_ptr<arena::TestCase> tester(
+            new PoolComputeTest(place,
+                                "def",
+                                DDim(dims),
+                                pooling_type,
+                                false,
+                                {2, 2},  // strides (same argument slot TestPoolHelper uses for strides)
+                                {0, 0},  // paddings
+                                {2, 2},  // ksize
+                                true,  // this and the next two flags are presumably exclusive, ceil_mode, adaptive - TODO confirm against the ctor
+                                false,
+                                false,
+                                padding_algorithm));
+        arena::Arena arena(std::move(tester), place, abs_error);
+        arena.TestPrecision();
+      }
+    }
+  }
+}
+
+void TestPoolHelper(Place place,  // Builds one PoolComputeTest from the given arguments and runs a single precision check.
+                    float abs_error,  // tolerance forwarded to arena::Arena
+                    std::vector<int64_t> dims,  // input shape, wrapped in DDim below
+                    std::string pooling_type,
+                    std::vector<int> strides,
+                    std::vector<int> paddings,
+                    std::vector<int> ksize) {
+  std::unique_ptr<arena::TestCase> tester(new PoolComputeTest(
+      place, "def", DDim(dims), pooling_type, false, strides, paddings, ksize));  // `false` is presumably global_pooling - TODO confirm against the ctor
+  arena::Arena arena(std::move(tester), place, abs_error);
+  arena.TestPrecision();
+}
+
+void TestPoolStrides(Place place, float abs_error = 2e-5) {  // Varies only the strides argument; paddings stay {0, 0} and ksize stays {2, 2}.
+  for (auto pooling_type : {"max", "avg"}) {
+    TestPoolHelper(
+        place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 1}, {0, 0}, {2, 2});  // strides {1, 1}
+    TestPoolHelper(
+        place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 2}, {0, 0}, {2, 2});  // strides {1, 2}
+    TestPoolHelper(
+        place, abs_error, {2, 3, 6, 7}, pooling_type, {2, 2}, {0, 0}, {2, 2});  // strides {2, 2}
+  }
+}
+
+void TestPoolPaddings(Place place, float abs_error = 2e-5) {  // Varies only the paddings argument; strides stay {1, 1} and ksize stays {2, 2}.
+  for (auto pooling_type : {"max", "avg"}) {
+    TestPoolHelper(
+        place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 1}, {0, 0}, {2, 2});  // paddings {0, 0}; same arguments as the first TestPoolStrides case
+    TestPoolHelper(
+        place, abs_error, {2, 3, 6, 7}, pooling_type, {1, 1}, {1, 1}, {2, 2});  // paddings {1, 1}
+    TestPoolHelper(place,
+                   abs_error,
+                   {2, 3, 6, 7},
+                   pooling_type,
+                   {1, 1},
+                   {0, 0, 1, 1},  // 4-element paddings; per-side order is defined by PoolComputeTest (not shown) - TODO confirm
+                   {2, 2});
+    TestPoolHelper(place,
+                   abs_error,
+                   {2, 3, 6, 7},
+                   pooling_type,
+                   {1, 1},
+                   {1, 0, 1, 0},  // 4-element paddings, second pattern
+                   {2, 2});
+    TestPoolHelper(place,
+                   abs_error,
+                   {2, 3, 6, 7},
+                   pooling_type,
+                   {1, 1},
+                   {1, 0, 0, 1},  // 4-element paddings, third pattern
+                   {2, 2});
+  }
+}
+
+void TestPoolKsize(Place place, float abs_error = 2e-5) {  // Runs square windows {ksize, ksize} for ksize 2 and 3, each under two stride/padding setups.
+  for (auto pooling_type : {"max", "avg"}) {
+    for (auto ksize : {2, 3}) {
+      TestPoolHelper(place,
+                     abs_error,
+                     {2, 3, 6, 7},
+                     pooling_type,
+                     {1, 1},  // strides
+                     {0, 0},  // paddings
+                     {ksize, ksize});
+      TestPoolHelper(place,
+                     abs_error,
+                     {2, 3, 6, 7},
+                     pooling_type,
+                     {2, 2},  // strides
+                     {1, 1},  // paddings
+                     {ksize, ksize});
+    }
+  }
+}
+
+TEST(Pool, precision) {  // Entry point: picks the target place and tolerance, then runs every pool test group below.
+  LOG(INFO) << "test pool op";
+  float abs_error = 2e-5;  // overwritten in the NPU branch below
+  Place place;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#else
+  return;  // without LITE_WITH_NPU no pool case is executed
+#endif
+
+  TestPoolGlobal(place, abs_error);
+  TestPoolAlgorithm(place, abs_error);
+  TestPoolStrides(place, abs_error);
+  TestPoolPaddings(place, abs_error);
+  TestPoolKsize(place, abs_error);
+}
+
+}  // namespace lite
+}  // namespace paddle
--- a/lite/tests/kernels/shuffle_channel_compute_test.cc
+++ b/lite/tests/kernels/shuffle_channel_compute_test.cc
@@ -12,12 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

-// TODO(FrostML): shaffle_channel cannot pass on CI, but ok in local machine.
-// Open this.
-/*#include <gtest/gtest.h>
+#include <gtest/gtest.h>
 #include "lite/api/paddle_use_kernels.h"
 #include "lite/api/paddle_use_ops.h"
 #include "lite/core/arena/framework.h"
+#include "lite/tests/utils/fill_data.h"

 namespace paddle {
 namespace lite {
@@ -40,28 +39,29 @@ class ShuffleChannelComputeTester : public arena::TestCase {
    auto* out = scope->NewTensor(output_);
    CHECK(out);
    out->Resize(dims_);
-    auto* outputs = out->mutable_data<float>();
+    auto* out_data = out->mutable_data<float>();
+
    auto* x = scope->FindTensor(input_);
-    const auto* inputs = x->data<float>();
-    DDim x_dims = x->dims();
-    int num = x->dims()[0];
-    int channel = x->dims()[1];
-    int height = x->dims()[2];
-    int width = x->dims()[3];
-    int fea_size = channel * height * width;
+    const auto* in_data = x->data<float>();
+
+    int num = dims_[0];
+    int channel = dims_[1];
+    int height = dims_[2];
+    int width = dims_[3];
+    int feather_size = channel * height * width;
    int spatial_size = height * width;
-    int group_row = group_;
-    int group_col = channel / group_;
-    for (int k = 0; k < num; ++k) {
-      inputs += k * fea_size;
-      outputs += k * fea_size;
-      for (int i = 0; i < group_row; ++i) {
-        for (int j = 0; j < group_col; ++j) {
-          const float* p_i = inputs + (i * group_col + j) * spatial_size;
-          float* p_o = outputs + (j * group_row + i) * spatial_size;
+    int group_num = group_;
+    int group_size = channel / group_;
+    for (int n = 0; n < num; n++) {
+      for (int i = 0; i < group_num; ++i) {
+        for (int j = 0; j < group_size; ++j) {
+          const float* p_i = in_data + (i * group_size + j) * spatial_size;
+          float* p_o = out_data + (j * group_num + i) * spatial_size;
          memcpy(p_o, p_i, spatial_size * sizeof(float));
        }
      }
+      in_data += feather_size;
+      out_data += feather_size;
    }
  }

@@ -73,35 +73,33 @@ class ShuffleChannelComputeTester : public arena::TestCase {
  }

  void PrepareData() override {
-    std::vector<float> data(dims_.production());
-
-    for (int i = 0; i < dims_.production(); i++) {
-      data[i] = i * 1.1;
-    }
-
-    SetCommonTensor(input_, dims_, data.data());
+    std::vector<float> din(dims_.production());  // host buffer holding one float per element of dims_
+    fill_data_rand(din.data(), -1.f, 1.f, dims_.production());  // replaces the removed i * 1.1 ramp; presumably random values within [-1, 1] - TODO confirm
+    SetCommonTensor(input_, dims_, din.data());
  }
 };

-void test_shuffle_channel(Place place) {
-  for (int group : {4}) {
+void test_shuffle_channel(Place place, float abs_error = 2e-5) {  // Runs one precision check per group value; abs_error is handed to arena::Arena.
+  for (int group : {2, 4, 8}) {  // assumes the tester's channel count is divisible by each group (baseline uses channel / group_) - TODO confirm
    std::unique_ptr<arena::TestCase> tester(
        new ShuffleChannelComputeTester(place, "def", group));
-    arena::Arena arena(std::move(tester), place, 2e-5);
+    arena::Arena arena(std::move(tester), place, abs_error);
    arena.TestPrecision();
  }
 }

 TEST(ShuffleChannel, precision) {
-// #ifdef LITE_WITH_X86
-//   Place place(TARGET(kX86));
-// #endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_shuffle_channel(place);
+  Place place;
+  float abs_error = 2e-5;  // overwritten in the NPU branch below
+#ifdef LITE_WITH_NPU
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#else
+  return;  // without LITE_WITH_NPU nothing runs; the former LITE_WITH_ARM branch is removed by this change
 #endif
+
+  test_shuffle_channel(place, abs_error);
 }

 }  // namespace lite
 }  // namespace paddle
-*/
--- a/lite/tests/kernels/unsqueeze_compute_test.cc
+++ b/lite/tests/kernels/unsqueeze_compute_test.cc
@@ -223,67 +223,73 @@ class Unsqueeze2ComputeTester : public arena::TestCase {
  }
 };

-void test_unsqueeze(Place place) {
+void test_unsqueeze(Place place, float abs_error = 2e-5) {  // Precision checks over every axes x dims x input_axes_flag combination listed below.
  for (std::vector<int> axes : {std::vector<int>({1}),
                                std::vector<int>({0, 2}),
                                std::vector<int>({0, -2})}) {
-    for (int N : {1}) {
-      for (int C : {3}) {
-        for (int H : {1}) {
-          for (int W : {5}) {
-            for (int input_axes_flag : {1, 2, 3}) {
-              LOG(INFO) << N << " " << C << " " << H << " " << W << " "
-                        << input_axes_flag;
-              std::unique_ptr<arena::TestCase> tester(
-                  new UnsqueezeComputeTester(
-                      place, "def", axes, DDim({N, C, H, W}), input_axes_flag));
-              arena::Arena arena(std::move(tester), place, 2e-5);
-              arena.TestPrecision();
-            }
-          }
-        }
+    for (auto dims : std::vector<std::vector<int64_t>>{{3}, {3, 5}, {3, 5, 7}})
+      for (int input_axes_flag : {1, 2, 3}) {  // sole statement of the brace-less dims loop above
+#ifdef LITE_WITH_NPU
+        if (input_axes_flag != 1) continue;  // NPU builds only run input_axes_flag == 1
+        if (dims.size() + axes.size() > 4) continue;  // NPU builds skip cases where input rank plus axis count exceeds 4
+#endif
+        std::unique_ptr<arena::TestCase> tester(new UnsqueezeComputeTester(
+            place, "def", axes, DDim(dims), input_axes_flag));
+        arena::Arena arena(std::move(tester), place, abs_error);
+        arena.TestPrecision();
      }
-    }
  }
 }

-void test_unsqueeze2(Place place) {
+void test_unsqueeze2(Place place,  // Precision checks over every axes x dims combination listed below.
+                     float abs_error = 2e-5,  // tolerance handed to arena::Arena
+                     std::vector<std::string> ignored_outs = {}) {  // forwarded unchanged to TestPrecision
  for (std::vector<int> axes : {std::vector<int>({0}),
                                std::vector<int>({0, 2}),
                                std::vector<int>({0, -2})}) {
-    for (int N : {1}) {
-      for (int C : {3}) {
-        for (int H : {1}) {
-          for (int W : {5}) {
-            std::unique_ptr<arena::TestCase> tester(new Unsqueeze2ComputeTester(
-                place, "def", axes, DDim({N, C, H, W})));
-            arena::Arena arena(std::move(tester), place, 2e-5);
-            arena.TestPrecision();
-          }
-        }
-      }
+    for (auto dims :
+         std::vector<std::vector<int64_t>>{{3}, {3, 5}, {3, 5, 7}}) {
+#ifdef LITE_WITH_NPU
+      if (dims.size() + axes.size() > 4) continue;  // NPU builds skip cases where input rank plus axis count exceeds 4
+#endif
+      std::unique_ptr<arena::TestCase> tester(
+          new Unsqueeze2ComputeTester(place, "def", axes, DDim(dims)));
+      arena::Arena arena(std::move(tester), place, abs_error);
+      arena.TestPrecision(ignored_outs);
    }
  }
 }

 TEST(squeeze, precision) {
-#ifdef LITE_WITH_X86
-  Place place(TARGET(kX86));
-#endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_unsqueeze(place);
+  Place place;
+  float abs_error = 2e-5;  // overwritten in the NPU branch below
+#ifdef LITE_WITH_NPU
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // Using fp16 in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;  // neither NPU nor ARM build: nothing is tested
 #endif
+
+  test_unsqueeze(place, abs_error);  // NOTE(review): the suite is named "squeeze" but this exercises unsqueeze
 }

 TEST(squeeze2, precision) {
-#ifdef LITE_WITH_X86
-  Place place(TARGET(kX86));
-#endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_unsqueeze2(place);
+  Place place;
+  float abs_error = 2e-5;  // overwritten in the NPU branch below
+  std::vector<std::string> ignored_outs = {};  // stays empty unless the NPU branch adds "XShape"
+#ifdef LITE_WITH_NPU
+  place = TARGET(kNPU);
+  abs_error = 1e-2;                  // Using fp16 in NPU
+  ignored_outs.push_back("XShape");  // not supported out in NPU
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#else
+  return;  // neither NPU nor ARM build: nothing is tested
 #endif
+
+  test_unsqueeze2(place, abs_error, ignored_outs);  // NOTE(review): the suite is named "squeeze2" but this exercises unsqueeze2
 }

 }  // namespace lite