[NPU] clean code (#2798)

69ad4b80 · zhupengyang · hong19860320 · 124c43a0 · 124c43a0 · 124c43a0
4 changed files
--- a/lite/kernels/npu/bridges/fc_op_test.cc
+++ b/lite/kernels/npu/bridges/fc_op_test.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/operators/fc_op.h"
-#include <gtest/gtest.h>
-#include "lite/core/op_registry.h"
-#include "lite/kernels/npu/bridges/registry.h"
-#include "lite/kernels/npu/bridges/test_helper.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace npu {
-namespace bridges {
-
-void fc_ref(const std::shared_ptr<operators::FcOpLite> op) {
-  Scope* scope = op->scope();
-  const OpInfo* op_info = op->op_info();
-  auto input =
-      scope->FindVar(op_info->Input("Input").front())->GetMutable<Tensor>();
-  auto w = scope->FindVar(op_info->Input("W").front())->GetMutable<Tensor>();
-  auto out =
-      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
-  int32_t in_num_col_dims = op_info->GetAttr<int32_t>("in_num_col_dims");
-  Tensor* bias = nullptr;
-  float* bias_data = nullptr;
-  if (op_info->HasInput("Bias")) {
-    auto bias_var_names = op_info->Input("Bias");
-    if (bias_var_names.size() > 0) {
-      auto bias_var_name = bias_var_names.front();
-      bias = scope->FindVar(bias_var_name)->GetMutable<lite::Tensor>();
-      bias_data = bias->mutable_data<float>();
-    }
-  }
-  auto input_data = input->data<float>();
-  auto w_data = w->mutable_data<float>();
-  auto out_data = out->mutable_data<float>();
-  auto in_mat_dims = input->dims().Flatten2D(in_num_col_dims);
-  int out_num_classes = w->dims()[1];
-  const int M = in_mat_dims[0];
-  const int K = in_mat_dims[1];
-  const int N = out_num_classes;
-  for (int m = 0; m < M; ++m) {
-    for (int n = 0; n < N; ++n) {
-      out_data[m * N + n] = 0;
-      for (int k = 0; k < K; ++k) {
-        out_data[m * N + n] += input_data[m * K + k] * w_data[k * N + n];
-      }
-    }
-  }
-  if (bias_data != nullptr) {
-    for (int m = 0; m < M; ++m) {
-      for (int n = 0; n < N; ++n) {
-        out_data[m * N + n] += bias_data[n];
-      }
-    }
-  }
-}
-
-void test_fc(const std::vector<int64_t>& input_shape,
-             const std::vector<int64_t>& w_shape,
-             int in_num_col_dims,
-             bool has_bias) {
-  CHECK_EQ(w_shape.size(), 2UL);
-
-  const auto& bridges = lite::kernels::npu::bridges::Factory::Instance();
-  const auto& supported_lists = bridges.AllFunctions();
-  CHECK(bridges.HasType("fc"));
-
-  Scope scope;
-  std::string input_var_name("Input");
-  std::string w_var_name("W");
-  std::string bias_var_name("Bias");
-  std::string out_var_name("Out");
-  std::string out_ref_var_name("out_ref");
-  auto* input = scope.Var(input_var_name)->GetMutable<Tensor>();
-  auto* w = scope.Var(w_var_name)->GetMutable<Tensor>();
-  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
-  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
-  input->Resize(input_shape);
-  w->Resize(w_shape);
-
-  FillTensor<float, int>(input);
-  FillTensor<float, int>(w);
-
-  // create fc op
-  cpp::OpDesc fc_op_desc;
-  fc_op_desc.SetType("fc");
-  fc_op_desc.SetInput("Input", {input_var_name});
-  fc_op_desc.SetInput("W", {w_var_name});
-  fc_op_desc.SetOutput("Out", {out_var_name});
-  fc_op_desc.SetAttr("in_num_col_dims", static_cast<int>(in_num_col_dims));
-  if (has_bias) {
-    auto* bias = scope.Var(bias_var_name)->GetMutable<Tensor>();
-    bias->Resize({w_shape[1]});
-    FillTensor<float, int>(bias);
-    fc_op_desc.SetInput("Bias", {bias_var_name});
-  }
-
-  auto fc_op = CreateOp<operators::FcOpLite>(fc_op_desc, &scope);
-  LauchOp(fc_op, {input_var_name}, {out_var_name});
-  out_ref->CopyDataFrom(*out);
-
-  // compare results
-  fc_ref(fc_op);
-  auto* out_data = out->mutable_data<float>();
-  auto* out_ref_data = out_ref->mutable_data<float>();
-  for (int i = 0; i < out->dims().production(); i++) {
-    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
-  }
-}
-
-TEST(NPUBridges, fc) {
-  for (bool use_bias : {true, false}) {
-    test_fc({1, 8, 8, 1}, {8, 4}, 2, use_bias);
-    test_fc({1, 5, 5, 1}, {5, 7}, 2, use_bias);
-    test_fc({1, 4, 1, 1}, {4, 8}, 1, use_bias);
-    test_fc({1, 1024, 1, 1}, {1024, 1000}, 1, use_bias);
-  }
-}
-
-}  // namespace bridges
-}  // namespace npu
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(fc);
-USE_NPU_BRIDGE(fc);
--- a/lite/kernels/npu/bridges/reshape_op_test.cc
+++ b/lite/kernels/npu/bridges/reshape_op_test.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/operators/reshape_op.h"
-#include <gtest/gtest.h>
-#include <random>
-#include "lite/core/op_registry.h"
-#include "lite/kernels/npu/bridges/registry.h"
-#include "lite/kernels/npu/bridges/test_helper.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace npu {
-namespace bridges {
-
-void reshape_ref(const std::shared_ptr<lite::OpLite> op) {
-  auto scope = op->scope();
-  auto op_info = op->op_info();
-  auto op_type = op_info->Type();
-  auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
-  auto out =
-      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
-  auto x_dims = x->dims();
-  auto shape = op_info->GetAttr<std::vector<int>>("shape");
-  auto inplace = op_info->GetAttr<bool>("inplace");
-  if (op_info->HasInput("Shape")) {
-    auto actual_shape_var_names = op_info->Input("Shape");
-    if (actual_shape_var_names.size() > 0) {
-      auto actual_shape = scope->FindVar(actual_shape_var_names.front())
-                              ->GetMutable<lite::Tensor>();
-      auto actual_shape_dims = actual_shape->dims();
-      auto* actual_shape_data = actual_shape->data<int>();
-      shape =
-          std::vector<int>(actual_shape_data,
-                           actual_shape_data + actual_shape_dims.production());
-    }
-  }
-  if (inplace) {
-    out->ShareDataWith(*x);
-  } else {
-    out->CopyDataFrom(*x);
-  }
-  auto out_dims = operators::ValidateShape(shape, x_dims);
-  out->Resize(out_dims);
-}
-
-void test_reshape(const std::vector<int64_t>& x_shape,
-                  const std::vector<int>& shape,
-                  const std::vector<int>& act_shape,
-                  bool inplace,
-                  bool reshape2) {
-  // prepare input&output variables
-  Scope scope;
-  std::string x_var_name("x");
-  std::string actual_shape_var_name("actual_shape");
-  std::string out_var_name("out");
-  std::string out_ref_var_name("out_ref");
-  std::string xshape_var_name("xshape");
-  std::string xshape_ref_var_name("xshape_ref");
-  auto x = scope.Var(x_var_name)->GetMutable<Tensor>();
-  auto actual_shape = scope.Var(actual_shape_var_name)->GetMutable<Tensor>();
-  auto out = scope.Var(out_var_name)->GetMutable<Tensor>();
-  auto out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
-  auto xshape = scope.Var(xshape_var_name)->GetMutable<Tensor>();
-  auto xshape_ref = scope.Var(xshape_ref_var_name)->GetMutable<Tensor>();
-
-  x->Resize(x_shape);
-
-  // initialize input&output data
-  FillTensor<float, int>(x);
-
-  // initialize op desc
-  cpp::OpDesc opdesc;
-  opdesc.SetType(reshape2 ? "reshape2" : "reshape");
-  opdesc.SetInput("X", {x_var_name});
-  opdesc.SetOutput("Out", {out_var_name});
-  opdesc.SetAttr("shape", shape);
-  opdesc.SetAttr("inplace", inplace);
-  if (!act_shape.empty()) {
-    int64_t act_shape_size = act_shape.size();
-    actual_shape->Resize({act_shape_size});
-    memcpy(actual_shape->mutable_data<int>(),
-           act_shape.data(),
-           act_shape_size * sizeof(int));
-    opdesc.SetInput("Shape", {actual_shape_var_name});
-  }
-  if (reshape2) {
-    opdesc.SetOutput("XShape", {xshape_var_name});
-  }
-
-  // create op and execute reference implementation
-  auto op = reshape2 ? CreateOp<operators::Reshape2Op>(opdesc, &scope)
-                     : CreateOp<operators::ReshapeOp>(opdesc, &scope);
-  reshape_ref(op);
-  out_ref->CopyDataFrom(*out);
-  if (reshape2) {
-    xshape_ref->CopyDataFrom(*xshape);
-  }
-
-  // convert op to NPU model, then run it on NPU
-  LauchOp(op,
-          {x_var_name},
-          {out_var_name});  // TODO(hong19860320) support XShape for reshape2
-
-  // compare results
-  auto out_dims = out->dims();
-  auto out_ref_dims = out_ref->dims();
-  CHECK_EQ(out_dims.size(), out_ref_dims.size());
-  for (int i = 0; i < out_dims.size(); i++) {
-    CHECK_EQ(out_dims[i], out_ref_dims[i]);
-  }
-  auto out_data = out->mutable_data<float>();
-  auto out_ref_data = out_ref->mutable_data<float>();
-  for (int i = 0; i < out->dims().production(); i++) {
-    VLOG(5) << i;
-    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
-  }
-  // if (reshape2) {
-  //   auto xshape_dims = xshape->dims();
-  //   auto xshape_ref_dims = xshape_ref->dims();
-  //   CHECK_EQ(xshape_dims.size(), xshape_ref_dims.size());
-  //   for (size_t i = 0; i < xshape_dims.size(); i++) {
-  //     CHECK_EQ(xshape_dims[i], xshape_ref_dims[i]);
-  //   }
-  // }
-}
-
-TEST(NPUBridges, reshape) {
-#if 1
-  std::map<std::vector<int64_t>, std::vector<std::vector<int>>> tests = {
-      {{1, 2, 4, 6},
-       {{},
-        {-1},
-        {48},
-        {-1, 48},
-        {1, 48},
-        {0, 48},
-        {48, -1},
-        {48, 1},
-        {-1, 24},
-        {2, 24},
-        {24, 0},
-        {-1, 0, 3, 2},
-        {4, 2, 3, 2},
-        {0, -1, 3, 2},
-        {1, 8, 3, 2}}}};
-  for (auto& i : tests) {
-    for (auto& shape : i.second) {
-      if (shape.empty()) {
-        continue;
-      }
-      for (auto& act_shape : i.second) {
-        for (auto& inplace : {true, false}) {
-          for (auto& reshape2 : {true, false}) {
-            std::stringstream ss;
-            ss << "x:{ ";
-            for (auto s : i.first) {
-              ss << s << " ";
-            }
-            ss << "} shape:{ ";
-            for (auto s : shape) {
-              ss << s << " ";
-            }
-            ss << "} act_shape:{ ";
-            for (auto s : act_shape) {
-              ss << s << " ";
-            }
-            VLOG(3) << ss.str() << "} inplace:" << inplace
-                    << " reshape2:" << reshape2;
-            test_reshape(i.first, shape, act_shape, inplace, reshape2);
-          }
-        }
-      }
-    }
-  }
-#else
-  test_reshape({2, 4, 6}, {-1, 0, 4, 3}, {}, true, true);
-  test_reshape({1, 232, 14, 14}, {-1, 2, 116, 14, 14}, {}, true, true);
-#endif
-}
-
-}  // namespace bridges
-}  // namespace npu
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(reshape);
-USE_NPU_BRIDGE(reshape);
-
-USE_LITE_OP(reshape2);
-USE_NPU_BRIDGE(reshape2);
--- a/lite/kernels/npu/bridges/softmax_op_test.cc
+++ b/lite/kernels/npu/bridges/softmax_op_test.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "lite/operators/softmax_op.h"
-#include <gtest/gtest.h>
-#include "lite/core/op_registry.h"
-#include "lite/kernels/npu/bridges/registry.h"
-#include "lite/kernels/npu/bridges/test_helper.h"
-
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace npu {
-namespace bridges {
-
-template <typename dtype>
-void softmax_ref(const std::shared_ptr<operators::SoftmaxOp> op) {
-  Scope* scope = op->scope();
-  const OpInfo* op_info = op->op_info();
-  auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
-  auto out =
-      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
-  auto x_data = x->data<dtype>();
-  auto out_data = out->mutable_data<dtype>();
-  DDim x_dims = x->dims();
-
-  auto x_rank = x_dims.size();
-  int axis = op_info->GetAttr<int>("axis");
-  if (axis < 0) {
-    axis += x_rank;
-  }
-  int axis_size = x_dims[axis];
-  int outer_num = x_dims.Slice(0, axis).production();
-  int inner_num = x_dims.Slice(axis + 1, x_rank).production();
-  int compute_size = outer_num * inner_num;
-  for (int i = 0; i < compute_size; i++) {
-    int idx_inner = i % inner_num;
-    int idx_outer = (i / inner_num) * axis_size;
-    int start = idx_outer * inner_num + idx_inner;
-    int offset;
-
-    offset = start;
-    dtype max_data = std::numeric_limits<dtype>::lowest();
-    for (int j = 0; j < axis_size; j++) {
-      max_data = x_data[offset] > max_data ? x_data[offset] : max_data;
-      offset += inner_num;
-    }
-
-    offset = start;
-    dtype sum_data = (dtype)0;
-    for (int j = 0; j < axis_size; j++) {
-      out_data[offset] = exp(x_data[offset] - max_data);
-      sum_data += out_data[offset];
-      offset += inner_num;
-    }
-
-    offset = start;
-    for (int j = 0; j < axis_size; j++) {
-      out_data[offset] /= sum_data;
-      offset += inner_num;
-    }
-  }
-}
-
-void test_softmax(const std::vector<int64_t>& input_shape, int axis) {
-  // prepare input&output variables
-  Scope scope;
-  std::string x_var_name = "x";
-  std::string out_var_name = "out";
-  std::string out_ref_var_name = "out_ref";
-  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
-  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
-  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
-  x->Resize(input_shape);
-
-  // initialize input&output data
-  FillTensor<float>(x);
-
-  // initialize op desc
-  cpp::OpDesc opdesc;
-  opdesc.SetType("softmax");
-  opdesc.SetInput("X", {x_var_name});
-  opdesc.SetOutput("Out", {out_var_name});
-  opdesc.SetAttr("axis", axis);
-
-  // create and convert op to NPU model, then run it on NPU
-  auto op = CreateOp<operators::SoftmaxOp>(opdesc, &scope);
-  LauchOp(op, {x_var_name}, {out_var_name});
-  out_ref->CopyDataFrom(*out);
-
-  // execute reference implementation and save to output tensor
-  softmax_ref<float>(op);
-
-  // compare results
-  auto* out_data = out->mutable_data<float>();
-  auto* out_ref_data = out_ref->mutable_data<float>();
-  for (int i = 0; i < out->dims().production(); i++) {
-    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2);
-  }
-}
-
-TEST(NPUBridges, softmax) {
-  test_softmax({1, 4}, -1);
-  // Bug exists in HiAI DDK when the number of items > 16500
-  // test_softmax({1, 16500}, -1);
-  test_softmax({1, 4}, 0);
-  test_softmax({1, 4}, 1);
-  test_softmax({3, 4}, -1);
-  test_softmax({3, 4}, 0);
-  test_softmax({3, 4}, 1);
-  test_softmax({1, 4, 7}, -1);
-  test_softmax({1, 4, 7}, 0);
-  // Bug exists in HiAI DDK when axis is 1 and iw > 1
-  // test_softmax({1, 4, 7}, 1);
-  test_softmax({1, 4, 1}, 1);
-  test_softmax({1, 4, 7}, 2);
-  test_softmax({3, 4, 7}, -1);
-  test_softmax({3, 4, 7}, 0);
-  test_softmax({3, 4, 1}, 1);
-  test_softmax({3, 4, 7}, 2);
-  test_softmax({1, 4, 7, 9}, -1);
-  test_softmax({1, 4, 7, 9}, 0);
-  test_softmax({1, 4, 7, 9}, 1);
-  // Bug exists in HiAI DDK when axis is 2 and iw > 1
-  // test_softmax({1, 4, 7, 9}, 2);
-  test_softmax({1, 4, 7, 1}, 2);
-  test_softmax({1, 4, 7, 9}, 3);
-  test_softmax({3, 4, 7, 9}, -1);
-  test_softmax({3, 4, 7, 9}, 0);
-  test_softmax({3, 4, 7, 9}, 1);
-  test_softmax({3, 4, 7, 1}, 2);
-  test_softmax({3, 4, 7, 9}, 3);
-}
-
-}  // namespace bridges
-}  // namespace npu
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-
-USE_LITE_OP(softmax);
-USE_NPU_BRIDGE(softmax);
--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
@@ -2,21 +2,21 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_
    lite_cc_test(test_kernel_conv_compute SRCS conv_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_conv_transpose_compute SRCS conv_transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_scale_compute SRCS scale_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_power_compute SRCS power_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_shuffle_channel_compute SRCS shuffle_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_yolo_box_compute SRCS yolo_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_fc_compute SRCS fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_elementwise_compute SRCS elementwise_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${npu_kernels} ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_lrn_compute SRCS lrn_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_decode_bboxes_compute SRCS decode_bboxes_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_box_coder_compute SRCS box_coder_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_activation_compute SRCS activation_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_argmax_compute SRCS argmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_axpy_compute SRCS axpy_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_norm_compute SRCS norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_cast_compute SRCS cast_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_instance_norm_compute SRCS instance_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
@@ -31,28 +31,28 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_
    lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_dropout_compute SRCS dropout_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_softmax_compute SRCS softmax_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_mul_compute SRCS mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_batch_norm_compute SRCS batch_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_pool_compute SRCS pool_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})

 if(LITE_BUILD_EXTRA)
-    lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS ${bm_kernels} arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_gru_unit SRCS gru_unit_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_sequence_pool_compute SRCS sequence_pool_compute_test.cc DEPS ${bm_kernels} arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_reduce_max_compute SRCS reduce_max_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_unsqueeze_compute SRCS unsqueeze_compute_test.cc DEPS arena_framework ${bm_kernels} ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_assign_compute SRCS assign_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_assign_value_compute SRCS assign_value_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_box_clip_compute SRCS box_clip_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_reduce_mean_compute SRCS reduce_mean_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_reduce_prod_compute SRCS reduce_prod_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_stack_compute SRCS stack_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_range_compute SRCS range_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_affine_channel_compute SRCS affine_channel_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_anchor_generator_compute SRCS anchor_generator_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_generate_proposals_compute SRCS generate_proposals_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_roi_align_compute SRCS roi_align_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_search_aligned_mat_mul_compute SRCS search_aligned_mat_mul_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_search_seq_fc_compute SRCS search_seq_fc_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${bm_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_lookup_table_compute SRCS lookup_table_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_gather_compute SRCS gather_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 endif()