[NPU] support increment, less than op bridge (#3147)

* [NPU] subgraph's precision register to kAny

[NPU] support increment, less than op bridge (#3147)
* [NPU] subgraph's precision register to kAny
9edfecaa · zhupengyang · GitHub · ff332144 · 9edfecaa · 9edfecaa
25 changed files
--- a/lite/core/arena/framework.cc
+++ b/lite/core/arena/framework.cc
@@ -59,6 +59,8 @@ void TestCase::CreateInstruction() {
  CHECK(it != kernels.end()) << "failed to create the kernel in "
                             << place_.DebugString()
                             << " with alias: " << alias_;
+  // reset final place
+  place_ = (*it)->place();
  // prepare context
  (*it)->SetContext(std::move(ctx_));
  instruction_.reset(new Instruction(op, std::move(*it)));

--- a/lite/core/op_lite.cc
+++ b/lite/core/op_lite.cc
@@ -47,18 +47,19 @@ std::vector<std::unique_ptr<KernelBase>> OpLite::CreateKernels(
    return kernels;
  }
-  std::set<Place> place_set;
+  std::set<Place> expanded_places(places.begin(), places.end());
-  for (auto place : places) {
+  for (auto &place : places) {
-    place_set.insert(place);
+    // Pick kernels that support any Precision and any DataLayout. For example:
-    // Pick kernels those support any Precision and any DataLayout
+    // kARM,kFloat,kNCHW -> kARM,kFloat,kAny; kARM,kAny,kNCHW; kARM,kAny,kAny
-    place.precision = PRECISION(kAny);
+    expanded_places.insert(
-    place_set.insert(place);
+        Place(place.target, place.precision, DATALAYOUT(kAny)));
-    place.layout = DATALAYOUT(kAny);
+    expanded_places.insert(Place(place.target, PRECISION(kAny), place.layout));
-    place_set.insert(place);
+    expanded_places.insert(
+        Place(place.target, PRECISION(kAny), DATALAYOUT(kAny)));
  }
  std::set<TargetType> targets;
-  for (auto place : place_set) {
+  for (auto place : expanded_places) {
    pick_kernel(place);
    targets.insert(place.target);
  }

--- a/lite/kernels/arm/CMakeLists.txt
+++ b/lite/kernels/arm/CMakeLists.txt
@@ -91,7 +91,6 @@ add_kernel(lookup_table_compute_arm ARM extra SRCS lookup_table_compute.cc DEPS
 add_kernel(lookup_table_dequant_compute_arm ARM extra SRCS lookup_table_dequant_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(logical_compute_arm ARM extra SRCS logical_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(sequence_softmax_compute_arm ARM extra SRCS sequence_softmax_compute.cc DEPS ${lite_kernel_deps} math_arm)
-add_kernel(less_than_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(while_compute_arm ARM extra SRCS while_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(compare_compute_arm ARM extra SRCS compare_compute.cc DEPS ${lite_kernel_deps} math_arm)
 add_kernel(topk_compute_arm ARM extra SRCS topk_compute.cc DEPS ${lite_kernel_deps} math_arm)

--- a/lite/kernels/arm/compare_compute.cc
+++ b/lite/kernels/arm/compare_compute.cc
@@ -73,8 +73,6 @@ inline void get_mid_dims(const lite::DDim &x_dims,
    (*post) *= x_dims[i];
  }
 }
-template <template <typename T> class Functor>
-void CompareCompute<Functor>::PrepareForRun() {}
 template <template <typename T> class Functor>
 void CompareCompute<Functor>::Run() {
@@ -177,7 +175,6 @@ void CompareCompute_int64<Functor>::Run() {
        for (int inner_id = 0; inner_id < inner_num; ++inner_id) {
          int index = (outer_id * mid_num + mid_id) * inner_num + inner_id;
          z[index] = CompareFunctor()(x[index], y_data);
-          // z[index] = x[index] < y_data;
        }
      }
    }
@@ -189,50 +186,78 @@ void CompareCompute_int64<Functor>::Run() {
 }  // namespace lite
 }  // namespace paddle
-REGISTER_LITE_KERNEL(less_than,
+REGISTER_LITE_KERNEL(equal,
                     kARM,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::arm::CompareCompute<
-                         paddle::lite::kernels::arm::_LessThanFunctor>,
+                         paddle::lite::kernels::arm::_EqualFunctor>,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
    .Finalize();
-REGISTER_LITE_KERNEL(less_than,
+REGISTER_LITE_KERNEL(equal,
                     kARM,
-                     kInt64,
+                     kInt32,
                     kNCHW,
-                     paddle::lite::kernels::arm::CompareCompute_int64<
+                     paddle::lite::kernels::arm::CompareCompute_int32<
-                         paddle::lite::kernels::arm::_LessThanFunctor>,
+                         paddle::lite::kernels::arm::_EqualFunctor>,
                     def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
-    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
    .Finalize();
-REGISTER_LITE_KERNEL(equal,
+REGISTER_LITE_KERNEL(not_equal,
                     kARM,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::arm::CompareCompute<
-                         paddle::lite::kernels::arm::_EqualFunctor>,
+                         paddle::lite::kernels::arm::_NotEqualFunctor>,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
    .Finalize();
-REGISTER_LITE_KERNEL(not_equal,
+REGISTER_LITE_KERNEL(less_than,
                     kARM,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::arm::CompareCompute<
-                         paddle::lite::kernels::arm::_NotEqualFunctor>,
+                         paddle::lite::kernels::arm::_LessThanFunctor>,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
    .Finalize();
+REGISTER_LITE_KERNEL(less_than,
+                     kARM,
+                     kInt32,
+                     kNCHW,
+                     paddle::lite::kernels::arm::CompareCompute_int32<
+                         paddle::lite::kernels::arm::_LessThanFunctor>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
+    .Finalize();
+REGISTER_LITE_KERNEL(less_than,
+                     kARM,
+                     kInt64,
+                     kNCHW,
+                     paddle::lite::kernels::arm::CompareCompute_int64<
+                         paddle::lite::kernels::arm::_LessThanFunctor>,
+                     def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
+    .Finalize();
 REGISTER_LITE_KERNEL(less_equal,
                     kARM,
                     kFloat,
@@ -244,6 +269,7 @@ REGISTER_LITE_KERNEL(less_equal,
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
    .Finalize();
 REGISTER_LITE_KERNEL(greater_than,
                     kARM,
                     kFloat,
@@ -255,6 +281,7 @@ REGISTER_LITE_KERNEL(greater_than,
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
    .Finalize();
 REGISTER_LITE_KERNEL(greater_equal,
                     kARM,
                     kFloat,
@@ -266,27 +293,3 @@ REGISTER_LITE_KERNEL(greater_equal,
    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
    .Finalize();
-REGISTER_LITE_KERNEL(less_than,
-                     kARM,
-                     kInt32,
-                     kNCHW,
-                     paddle::lite::kernels::arm::CompareCompute_int32<
-                         paddle::lite::kernels::arm::_LessThanFunctor>,
-                     def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
-    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
-    .Finalize();
-REGISTER_LITE_KERNEL(equal,
-                     kARM,
-                     kInt32,
-                     kNCHW,
-                     paddle::lite::kernels::arm::CompareCompute_int32<
-                         paddle::lite::kernels::arm::_EqualFunctor>,
-                     def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
-    .BindInput("Y", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kInt32))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kBool))})
-    .Finalize();
--- a/lite/kernels/arm/compare_compute.h
+++ b/lite/kernels/arm/compare_compute.h
@@ -26,10 +26,6 @@ namespace arm {
 template <template <typename T> class Functor>
 class CompareCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
 public:
-  using param_t = operators::LogicalParam;
-  void PrepareForRun() override;
  void Run() override;
  ~CompareCompute() {}
@@ -39,8 +35,6 @@ template <template <typename T> class Functor>
 class CompareCompute_int32
    : public KernelLite<TARGET(kARM), PRECISION(kInt32)> {
 public:
-  using param_t = operators::LogicalParam;
  void Run() override;
  ~CompareCompute_int32() {}
@@ -50,8 +44,6 @@ template <template <typename T> class Functor>
 class CompareCompute_int64
    : public KernelLite<TARGET(kARM), PRECISION(kInt64)> {
 public:
-  using param_t = operators::LogicalParam;
  void Run() override;
  ~CompareCompute_int64() {}

--- a/lite/kernels/npu/bridges/CMakeLists.txt
+++ b/lite/kernels/npu/bridges/CMakeLists.txt
@@ -46,6 +46,9 @@ lite_cc_library(subgraph_bridge_dropout_op_npu SRCS dropout_op.cc DEPS ${npu_sub
 lite_cc_library(subgraph_bridge_layer_norm_op_npu SRCS layer_norm_op.cc DEPS ${npu_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_fill_constant_op_npu SRCS fill_constant_op.cc DEPS ${npu_subgraph_bridge_deps})
 lite_cc_library(subgraph_bridge_fill_constant_batch_size_like_op_npu SRCS fill_constant_batch_size_like_op.cc DEPS ${npu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_increment_op_npu SRCS increment_op.cc DEPS ${npu_subgraph_bridge_deps})
+lite_cc_library(subgraph_bridge_compare_op_npu SRCS compare_op.cc DEPS ${npu_subgraph_bridge_deps})
 set(npu_subgraph_bridges
        subgraph_bridge_registry
@@ -79,6 +82,8 @@ set(npu_subgraph_bridges
        subgraph_bridge_layer_norm_op_npu
        subgraph_bridge_fill_constant_op_npu
        subgraph_bridge_fill_constant_batch_size_like_op_npu
+        subgraph_bridge_increment_op_npu
+        subgraph_bridge_compare_op_npu
        CACHE INTERNAL "npu_subgraph_bridges")
 message(STATUS "+++++ npu_subgraph_bridges: ${npu_subgraph_bridges}")
--- a/lite/kernels/npu/bridges/compare_op.cc
+++ b/lite/kernels/npu/bridges/compare_op.cc
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "lite/kernels/npu/bridges/graph.h"
+#include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace npu {
+int LessThanConverter(void* ctx, OpLite* op, KernelBase* kernel) {  // Bridge: converts a less_than op into a ge::op::Less node in the Graph passed via ctx; `kernel` is not used here.
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);  // ctx is interpreted as the Graph that nodes are added to
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";
+  // Resolve the first "X" and "Y" input tensors from the op's scope and the first "Out" output name (no attributes are read)
+  auto x_name = op_info->Input("X").front();
+  auto x = scope->FindTensor(x_name);
+  auto x_dims = x->dims();  // NOTE(review): x_dims is not referenced again in this function
+  auto y_name = op_info->Input("Y").front();
+  auto y = scope->FindTensor(y_name);
+  auto y_dims = y->dims();  // NOTE(review): y_dims is not referenced again in this function
+  auto out_name = op_info->Output("Out").front();
+  // X node: reuse the node already registered under x_name, otherwise add the X tensor to the graph
+  std::shared_ptr<Node> x_node = nullptr;
+  if (graph->Has(x_name)) {
+    x_node = graph->Get(x_name);
+  } else {
+    x_node = graph->Add(x_name, *x);
+  }
+  // Y node: same reuse-or-add handling as X, keyed by y_name
+  std::shared_ptr<Node> y_node = nullptr;
+  if (graph->Has(y_name)) {
+    y_node = graph->Get(y_name);
+  } else {
+    y_node = graph->Add(y_name, *y);
+  }
+  // Add the Less node under the output name with kBool precision, then wire X as x1 and Y as x2
+  auto less_than_node = graph->Add<ge::op::Less>(out_name, PRECISION(kBool));
+  auto less_than_op = less_than_node->data<ge::op::Less>();
+  less_than_op->set_input_x1(*x_node->data());
+  less_than_op->set_input_x2(*y_node->data());
+  return REBUILD_WHEN_SHAPE_CHANGED;  // status returned on every path of this converter
+}
+}  // namespace npu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+REGISTER_SUBGRAPH_BRIDGE(less_than,
+                         kNPU,
+                         paddle::lite::subgraph::npu::LessThanConverter);
--- a/lite/kernels/npu/bridges/increment_op.cc
+++ b/lite/kernels/npu/bridges/increment_op.cc
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "lite/kernels/npu/bridges/graph.h"
+#include "lite/kernels/npu/bridges/registry.h"
+#include "lite/kernels/npu/bridges/utility.h"
+namespace paddle {
+namespace lite {
+namespace subgraph {
+namespace npu {
+int IncrementConverter(void* ctx, OpLite* op, KernelBase* kernel) {  // Bridge: converts an increment op into a ge::op::Add of X and a constant holding `step`; `kernel` is not used here.
+  CHECK(ctx != nullptr);
+  CHECK(op != nullptr);
+  auto graph = static_cast<Graph*>(ctx);  // ctx is interpreted as the Graph that nodes are added to
+  auto op_info = op->op_info();
+  auto op_type = op_info->Type();
+  auto scope = op->scope();
+  VLOG(3) << "[NPU] Converting " + op_type + "...";
+  // Resolve the first "X" input tensor from the op's scope, the first "Out" output name, and the "step" attribute
+  auto x_name = op_info->Input("X").front();
+  auto x = scope->FindTensor(x_name);
+  auto x_dims = x->dims();
+  auto out_name = op_info->Output("Out").front();
+  float step = op_info->GetAttr<float>("step");
+  // X node: reuse the node already registered under x_name, otherwise add the X tensor with its converted shape
+  std::shared_ptr<Node> x_node = nullptr;
+  if (graph->Has(x_name)) {
+    x_node = graph->Get(x_name);
+  } else {
+    x_node = graph->Add(x_name, *x, CvtShape(x_dims));
+  }
+  // Y node: a one-element float tensor holding `step`, marked persistable and added under "<out_name>/y"
+  Tensor y;
+  y.Resize({1});
+  auto y_data = y.mutable_data<float>();  // NOTE(review): always float, independent of X's precision; confirm Add accepts this when X is an integer tensor
+  y_data[0] = step;
+  y.set_persistable(true);
+  auto y_node = graph->Add(out_name + "/y", y);
+  // Add the Add node under the output name, then wire X as x1 and the step constant as x2
+  auto increment_node = graph->Add<ge::op::Add>(out_name);
+  auto increment_op = increment_node->data<ge::op::Add>();
+  increment_op->set_input_x1(*x_node->data());
+  increment_op->set_input_x2(*y_node->data());
+  return REBUILD_WHEN_SHAPE_CHANGED;  // status returned on every path of this converter
+}
+}  // namespace npu
+}  // namespace subgraph
+}  // namespace lite
+}  // namespace paddle
+REGISTER_SUBGRAPH_BRIDGE(increment,
+                         kNPU,
+                         paddle::lite::subgraph::npu::IncrementConverter);
--- a/lite/kernels/npu/bridges/paddle_use_bridges.h
+++ b/lite/kernels/npu/bridges/paddle_use_bridges.h
@@ -23,6 +23,7 @@ USE_SUBGRAPH_BRIDGE(softsign, kNPU);
 USE_SUBGRAPH_BRIDGE(hard_sigmoid, kNPU);
 USE_SUBGRAPH_BRIDGE(batch_norm, kNPU);
+USE_SUBGRAPH_BRIDGE(less_than, kNPU);
 USE_SUBGRAPH_BRIDGE(concat, kNPU);
 USE_SUBGRAPH_BRIDGE(conv2d, kNPU);
 USE_SUBGRAPH_BRIDGE(depthwise_conv2d, kNPU);
@@ -40,9 +41,12 @@ USE_SUBGRAPH_BRIDGE(fusion_elementwise_div_activation, kNPU);
 USE_SUBGRAPH_BRIDGE(fill_constant, kNPU)
 USE_SUBGRAPH_BRIDGE(fill_constant_batch_size_like, kNPU)
+USE_SUBGRAPH_BRIDGE(increment, kNPU);
+USE_SUBGRAPH_BRIDGE(instance_norm, kNPU);
 USE_SUBGRAPH_BRIDGE(fc, kNPU);
 USE_SUBGRAPH_BRIDGE(bilinear_interp, kNPU);
 USE_SUBGRAPH_BRIDGE(nearest_interp, kNPU);
+USE_SUBGRAPH_BRIDGE(layer_norm, kNPU);
 USE_SUBGRAPH_BRIDGE(matmul, kNPU);
 USE_SUBGRAPH_BRIDGE(mul, kNPU);
 USE_SUBGRAPH_BRIDGE(pad2d, kNPU);
@@ -60,5 +64,3 @@ USE_SUBGRAPH_BRIDGE(transpose, kNPU);
 USE_SUBGRAPH_BRIDGE(transpose2, kNPU);
 USE_SUBGRAPH_BRIDGE(unsqueeze, kNPU);
 USE_SUBGRAPH_BRIDGE(unsqueeze2, kNPU);
-USE_SUBGRAPH_BRIDGE(instance_norm, kNPU);
-USE_SUBGRAPH_BRIDGE(layer_norm, kNPU);
--- a/lite/kernels/npu/bridges/scale_op_test.cc
+++ b/lite/kernels/npu/bridges/scale_op_test.cc
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "lite/operators/scale_op.h"
-#include <gtest/gtest.h>
-#include <random>
-#include "lite/core/op_registry.h"
-#include "lite/kernels/npu/bridges/registry.h"
-#include "lite/kernels/npu/bridges/test_helper.h"
-namespace paddle {
-namespace lite {
-namespace kernels {
-namespace npu {
-namespace bridges {
-void scale_ref(const std::shared_ptr<operators::ScaleOp> op) {
-  Scope* scope = op->scope();
-  const OpInfo* op_info = op->op_info();
-  auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
-  auto out =
-      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
-  float scale = op_info->GetAttr<float>("scale");
-  float bias = op_info->GetAttr<float>("bias");
-  bool bias_after_scale = op_info->GetAttr<bool>("bias_after_scale");
-  if (!bias_after_scale) {
-    bias *= scale;
-  }
-  auto x_data = x->data<float>();
-  auto out_data = out->mutable_data<float>();
-  DDim x_dims = x->dims();
-  DDim out_dims = out->dims();
-  CHECK_EQ(x_dims.production(), out_dims.production());
-  for (int i = 0; i < out_dims.production(); i++) {
-    out_data[i] = x_data[i] * scale + bias;
-  }
-}
-void test_scale(int bs,
-                int ic,
-                int ih,
-                int iw,
-                bool bias_after_scale,
-                float scale,
-                float bias) {
-  // prepare input&output variables
-  Scope scope;
-  std::string x_var_name("x");
-  std::string out_var_name("out");
-  std::string out_ref_var_name("out_ref");
-  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
-  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
-  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
-  x->Resize({bs, ic, ih, iw});
-  // initialize input&output data
-  FillTensor<float, int>(x);
-  // initialize op desc
-  cpp::OpDesc opdesc;
-  opdesc.SetType("scale");
-  opdesc.SetInput("X", {x_var_name});
-  opdesc.SetOutput("Out", {out_var_name});
-  opdesc.SetAttr("bias_after_scale", bias_after_scale);
-  opdesc.SetAttr("scale", scale);
-  opdesc.SetAttr("bias", bias);
-  // create and convert op to NPU model, then run it on NPU
-  auto op = CreateOp<operators::ScaleOp>(opdesc, &scope);
-  LauchOp(op, {x_var_name}, {out_var_name});
-  out_ref->CopyDataFrom(*out);
-  // execute reference implementation and save to output tensor('out')
-  scale_ref(op);
-  // compare results
-  auto* out_data = out->mutable_data<float>();
-  auto* out_ref_data = out_ref->mutable_data<float>();
-  for (int i = 0; i < out->dims().production(); i++) {
-    VLOG(5) << i;
-    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-5);
-  }
-}
-TEST(NPUBridges, scale) {
-  for (auto bs : {1, 3}) {
-    for (auto ic : {1, 3}) {
-      for (auto ih : {3, 4}) {
-        for (auto iw : {4, 3}) {
-          for (auto bias_after_scale : {true, false}) {
-            for (auto scale : {-1.0f, 5.0f}) {
-              for (auto bias : {-2.0f, 30.0f}) {
-                VLOG(3) << "bs: " << bs << " ic: " << ic << " ih: " << ih
-                        << " iw: " << iw
-                        << " bias_after_scale: " << bias_after_scale
-                        << " scale: " << scale << " bias: " << bias;
-                test_scale(bs, ic, ih, iw, bias_after_scale, scale, bias);
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-}
-}  // namespace bridges
-}  // namespace npu
-}  // namespace kernels
-}  // namespace lite
-}  // namespace paddle
-USE_LITE_OP(scale);
-USE_NPU_BRIDGE(scale);
--- a/lite/kernels/npu/subgraph_compute.cc
+++ b/lite/kernels/npu/subgraph_compute.cc
@@ -149,6 +149,9 @@ int SubgraphEngine::BuildDeviceProgram() {
      case PRECISION(kFloat):
        origin_otensors_[i]->mutable_data<float>();
        break;
+      case PRECISION(kBool):
+        origin_otensors_[i]->mutable_data<bool>();
+        break;
      case PRECISION(kInt8):
        origin_otensors_[i]->mutable_data<int8_t>();
        break;
@@ -231,10 +234,12 @@ void SubgraphCompute::Run() {
 REGISTER_LITE_KERNEL(subgraph,
                     kNPU,
-                     kFloat,
+                     kAny,
                     kNCHW,
                     paddle::lite::kernels::npu::SubgraphCompute,
                     def)
-    .BindInput("Inputs", {LiteType::GetTensorTy(TARGET(kHost))})
+    .BindInput("Inputs",
-    .BindOutput("Outputs", {LiteType::GetTensorTy(TARGET(kHost))})
+               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
+    .BindOutput("Outputs",
+                {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kAny))})
    .Finalize();
--- a/lite/kernels/npu/subgraph_compute.h
+++ b/lite/kernels/npu/subgraph_compute.h
@@ -51,7 +51,7 @@ class SubgraphEngine : public subgraph::Engine {
  std::unique_ptr<hiai::AiModelMngerClient> device_program_{nullptr};
 };
-class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kFloat)> {
+class SubgraphCompute : public KernelLite<TARGET(kNPU), PRECISION(kAny)> {
 public:
  using param_t = operators::SubgraphParam;

--- a/lite/tests/kernels/CMakeLists.txt
+++ b/lite/tests/kernels/CMakeLists.txt
@@ -19,7 +19,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_
    lite_cc_test(test_kernel_grid_sampler_compute SRCS grid_sampler_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_sequence_softmax_compute SRCS sequence_softmax_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_im2sequence_compute SRCS im2sequence_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-    #lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+    lite_cc_test(test_kernel_compare_compute SRCS compare_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_logical_xor_compute SRCS logical_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    #lite_cc_test(test_kernel_topk_compute SRCS topk_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
    lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})

--- a/lite/tests/kernels/assign_compute_test.cc
+++ b/lite/tests/kernels/assign_compute_test.cc
@@ -69,7 +69,7 @@ void TestAssign(const Place& place) {
 TEST(Assign, precision) {
  Place place;
 #ifdef LITE_WITH_ARM
-  place = {TARGET(kARM), PRECISION(kAny)};
+  place = TARGET(kARM);
 #else
  return;
 #endif

--- a/lite/tests/kernels/assign_value_compute_test.cc
+++ b/lite/tests/kernels/assign_value_compute_test.cc
@@ -97,7 +97,7 @@ class AssignValueComputeTester : public arena::TestCase {
 TEST(AssignValue, precision) {
  Place place;
 #ifdef LITE_WITH_ARM
-  place = {TARGET(kARM), PRECISION(kAny)};
+  place = TARGET(kARM);
 #else
  return;
 #endif

--- a/lite/tests/kernels/cast_compute_test.cc
+++ b/lite/tests/kernels/cast_compute_test.cc
@@ -134,7 +134,7 @@ TEST(Cast, precision) {
  Place place;
  float abs_error = 2e-5;
 #if defined(LITE_WITH_ARM)
-  place = {TARGET(kARM), PRECISION(kAny)};
+  place = TARGET(kARM);
 #elif defined(LITE_WITH_XPU)
  place = TARGET(kXPU);
 #else

--- a/lite/tests/kernels/compare_compute_test.cc
+++ b/lite/tests/kernels/compare_compute_test.cc
@@ -16,12 +16,14 @@
 #include "lite/api/paddle_use_kernels.h"
 #include "lite/api/paddle_use_ops.h"
 #include "lite/core/arena/framework.h"
+#include "lite/tests/utils/fill_data.h"
 namespace paddle {
 namespace lite {
 #define COMPARE_FUNCTOR(name, op)                                           \
  template <typename T>                                                     \
-  struct _##name##Functor {                                                 \
+  struct name##Functor {                                                    \
    inline bool operator()(const T& a, const T& b) const { return a op b; } \
  };
@@ -33,7 +35,7 @@ COMPARE_FUNCTOR(GreaterThan, >);
 COMPARE_FUNCTOR(GreaterEqual, >=);
 template <>
-struct _EqualFunctor<float> {
+struct EqualFunctor<float> {
  inline bool operator()(const float& a, const float& b) const {
    // It is safe to cast a and b to double.
    return fabs(static_cast<double>(a - b)) < 1e-8;
@@ -41,59 +43,56 @@ struct _EqualFunctor<float> {
 };
 template <>
-struct _NotEqualFunctor<float> {
+struct NotEqualFunctor<float> {
  inline bool operator()(const float& a, const float& b) const {
-    return !_EqualFunctor<float>()(a, b);
+    return !EqualFunctor<float>()(a, b);
  }
 };
-template <template <typename T> class Functor>
+template <typename T, template <typename U> class Functor>
-class LessThanTester : public arena::TestCase {
+class CompareComputeTester : public arena::TestCase {
 protected:
-  std::string input_x_ = "x";
+  std::string x_ = "x";
-  std::string input_y_ = "y";
+  std::string y_ = "y";
-  std::string output_ = "out";
+  std::string out_ = "out";
-  int axis_ = 1;
+  std::string op_ = "less_than";
-  bool force_cpu_ = 0;
  DDim x_dims_{{3, 5, 4, 4}};
  DDim y_dims_{{4}};
-  std::string opname_ = "less_than";
+  int axis_ = -1;
+  bool force_cpu_ = false;
 public:
-  LessThanTester(const Place& place,
+  CompareComputeTester(const Place& place,
                       const std::string& alias,
-                 bool force_cpu,
+                       const std::string op,
-                 int axis,
                       DDim x_dims,
                       DDim y_dims,
-                 const std::string& opname)
+                       int axis = -1)
      : TestCase(place, alias),
-        axis_(axis),
+        op_(op),
-        force_cpu_(force_cpu),
        x_dims_(x_dims),
        y_dims_(y_dims),
-        opname_(opname) {}
+        axis_(axis) {}
  void RunBaseline(Scope* scope) override {
-    auto* out = scope->NewTensor(output_);
+    auto* out = scope->NewTensor(out_);
    CHECK(out);
    out->Resize(x_dims_);
    auto* out_data = out->mutable_data<bool>();
    auto axis = axis_;
-    auto* x = scope->FindTensor(input_x_);
+    auto* x = scope->FindTensor(x_);
-    const auto* x_data = x->data<float>();
+    const auto* x_data = x->data<T>();
-    auto* y = scope->FindTensor(input_y_);
+    auto* y = scope->FindTensor(y_);
-    auto* y_data_in = y->data<float>();
+    auto* y_data_in = y->data<T>();
-    using CompareFunc = Functor<float>;
+    using CompareFunc = Functor<T>;
    if (x_dims_.size() == y_dims_.size()) {
      for (int i = 0; i < x_dims_.production(); i++) {
-        // out_data[i] = x_data[i] < y_data[i];
        out_data[i] = CompareFunc()(x_data[i], y_data_in[i]);
      }
    } else {
-      auto* y_data = reinterpret_cast<float*>(
+      auto* y_data =
-          malloc(x_dims_.production() * sizeof(float)));
+          reinterpret_cast<T*>(malloc(x_dims_.production() * sizeof(T)));
      if (axis < 0) {
        axis = x_dims_.size() - y_dims_.size();
@@ -111,12 +110,12 @@ class LessThanTester : public arena::TestCase {
        num *= x_dims_[i];
      }
      int ysize = channels * num;
-      float* y_data_t = reinterpret_cast<float*>(y_data);
+      T* y_data_t = reinterpret_cast<T*>(y_data);
      if (num == 1) {
        for (int i = 0; i < batch; ++i) {
          memcpy(reinterpret_cast<void*>(y_data_t),
                 reinterpret_cast<const void*>(&y_data_in[0]),
-                 ysize * sizeof(float));
+                 ysize * sizeof(T));
          y_data_t += ysize;
        }
@@ -126,118 +125,118 @@ class LessThanTester : public arena::TestCase {
            y_data_t[i * num + j] = y_data_in[i];
          }
        }
-        float* tempptr = y_data_t;
+        T* tempptr = y_data_t;
        for (int i = 0; i < batch; ++i) {
-          memcpy(y_data_t, tempptr, ysize * sizeof(float));
+          memcpy(y_data_t, tempptr, ysize * sizeof(T));
          y_data_t += ysize;
        }
      }
      for (int i = 0; i < x_dims_.production(); i++) {
-        // out_data[i] = x_data[i] < y_data[i];
        out_data[i] = CompareFunc()(x_data[i], y_data[i]);
      }
    }
  }
  void PrepareOpDesc(cpp::OpDesc* op_desc) {
-    op_desc->SetType(opname_);
+    op_desc->SetType(op_);
-    op_desc->SetInput("X", {input_x_});
+    op_desc->SetInput("X", {x_});
-    op_desc->SetInput("Y", {input_y_});
+    op_desc->SetInput("Y", {y_});
-    op_desc->SetOutput("Out", {output_});
+    op_desc->SetOutput("Out", {out_});
    op_desc->SetAttr("axis", axis_);
    op_desc->SetAttr("force_cpu", force_cpu_);
  }
  void PrepareData() override {
-    std::vector<float> data(x_dims_.production());
+    std::vector<T> dx(x_dims_.production());
-    std::vector<float> datay(
+    std::vector<T> dy(y_dims_.production());
-        y_dims_.production());  // datay(dims_.production());
+    fill_data_rand<T>(dx.data(), -5, 5, x_dims_.production());
-    for (int i = 0; i < x_dims_.production(); i++) {
+    fill_data_rand<T>(dy.data(), -5, 5, y_dims_.production());
-      data[i] = 1.1;
+    SetCommonTensor(x_, x_dims_, dx.data());
-    }
+    SetCommonTensor(y_, y_dims_, dy.data());
-    for (int i = 0; i < y_dims_.production(); i++) {
-      datay[i] = i;
-    }
-    SetCommonTensor(input_x_, x_dims_, data.data());
-    SetCommonTensor(input_y_, y_dims_, datay.data());
  }
 };
-void test_compare(Place place) {
-  for (bool force_cpu : {0}) {
+template <typename T>
-    for (auto n : {1, 3, 4}) {
+void TestCompare(Place place,
-      for (auto c : {1, 3, 4}) {
+                 float abs_error,
-        for (auto h : {1, 3, 4}) {
+                 std::string op,
-          for (auto w : {1, 3, 4}) {
+                 std::vector<int64_t> x_dims,
-            for (auto axis : {-1, 0, 1, 3}) {
+                 std::vector<int64_t> y_dims,
-              for (auto yd : {std::vector<int64_t>({n}),
+                 int axis) {
-                              std::vector<int64_t>({c}),
+  if (typeid(T) == typeid(float)) {
-                              std::vector<int64_t>({h}),
+    place.precision = PRECISION(kFloat);
-                              std::vector<int64_t>({w}),
+  } else if (typeid(T) == typeid(int32_t)) {
-                              std::vector<int64_t>({n, c}),
+    place.precision = PRECISION(kInt32);
-                              std::vector<int64_t>({h, w}),
+  } else if (typeid(T) == typeid(int64_t)) {
-                              std::vector<int64_t>({n, c, h}),
+    place.precision = PRECISION(kInt64);
-                              std::vector<int64_t>({n, c, h, w})}) {
+  } else {
-                DDimLite x_dims = DDim(std::vector<int64_t>({n, c, h, w}));
+    LOG(FATAL) << "unsupported dtype";
-                DDimLite y_dims = DDim(yd);
-                int axis_t = axis < 0 ? x_dims.size() - y_dims.size() : axis;
-                if (axis_t + y_dims.size() > 4) continue;
-                bool flag = false;
-                for (int i = 0; i < y_dims.size(); i++) {
-                  if (x_dims[i + axis_t] != y_dims[i]) flag = true;
-                }
-                if (flag) continue;
-                std::unique_ptr<arena::TestCase> less_than_tester(
-                    new LessThanTester<paddle::lite::_LessThanFunctor>(
-                        place,
-                        "def",
-                        force_cpu,
-                        axis,
-                        x_dims,
-                        y_dims,
-                        "less_than"));
-                arena::Arena less_than_arena(
-                    std::move(less_than_tester), place, 0.001);
-                less_than_arena.TestPrecision();
-                std::unique_ptr<arena::TestCase> equal_tester(
-                    new LessThanTester<paddle::lite::_EqualFunctor>(place,
-                                                                    "def",
-                                                                    force_cpu,
-                                                                    axis,
-                                                                    x_dims,
-                                                                    y_dims,
-                                                                    "equal"));
-                arena::Arena equal_arena(std::move(equal_tester), place, 0.001);
-                equal_arena.TestPrecision();
-                std::unique_ptr<arena::TestCase> greater_than_tester(
-                    new LessThanTester<paddle::lite::_GreaterThanFunctor>(
-                        place,
-                        "def",
-                        force_cpu,
-                        axis,
-                        x_dims,
-                        y_dims,
-                        "greater_than"));
-                arena::Arena greater_than_arena(
-                    std::move(greater_than_tester), place, 0.001);
-                greater_than_arena.TestPrecision();
-              }
-            }
-          }
-        }
  }
+  std::unique_ptr<arena::TestCase> tester = nullptr;
+  if (op == "equal") {
+    tester = static_cast<std::unique_ptr<arena::TestCase>>(
+        new CompareComputeTester<T, EqualFunctor>(
+            place, "def", op, DDim(x_dims), DDim(y_dims), axis));
+  } else if (op == "not_equal") {
+    tester = static_cast<std::unique_ptr<arena::TestCase>>(
+        new CompareComputeTester<T, NotEqualFunctor>(
+            place, "def", op, DDim(x_dims), DDim(y_dims), axis));
+  } else if (op == "less_than") {
+    tester = static_cast<std::unique_ptr<arena::TestCase>>(
+        new CompareComputeTester<T, LessThanFunctor>(
+            place, "def", op, DDim(x_dims), DDim(y_dims), axis));
+  } else if (op == "less_equal") {
+    tester = static_cast<std::unique_ptr<arena::TestCase>>(
+        new CompareComputeTester<T, LessEqualFunctor>(
+            place, "def", op, DDim(x_dims), DDim(y_dims), axis));
+  } else if (op == "greater_than") {
+    tester = static_cast<std::unique_ptr<arena::TestCase>>(
+        new CompareComputeTester<T, GreaterThanFunctor>(
+            place, "def", op, DDim(x_dims), DDim(y_dims), axis));
+  } else if (op == "greater_equal") {
+    tester = static_cast<std::unique_ptr<arena::TestCase>>(
+        new CompareComputeTester<T, GreaterEqualFunctor>(
+            place, "def", op, DDim(x_dims), DDim(y_dims), axis));
+  } else {
+    LOG(FATAL) << "unsupported type";
  }
+  arena::Arena arena(std::move(tester), place, abs_error);
+  arena.TestPrecision();
+}
+#if defined(LITE_WITH_NPU)
+TEST(Compare_OP_NPU, precision) {
+  Place place{TARGET(kNPU)};
+  float abs_error = 1e-2;
+  TestCompare<float>(
+      place, abs_error, "less_than", {2, 3, 4, 5}, {2, 3, 4, 5}, -1);
+  TestCompare<float>(place, abs_error, "less_than", {2, 3, 4}, {2, 3, 4}, 0);
+}
+#elif defined(LITE_WITH_ARM)
+TEST(Compare_OP_ARM, precision) {
+  Place place{TARGET(kARM)};
+  float abs_error = 1e-5;
+  for (auto op : std::vector<std::string>{"equal",
+                                          "not_equal",
+                                          "less_than",
+                                          "less_equal",
+                                          "greater_than",
+                                          "greater_equal"}) {
+    TestCompare<float>(place, abs_error, op, {2, 3, 4, 5}, {2, 3, 4, 5}, -1);
+    TestCompare<float>(place, abs_error, op, {2, 3, 4}, {2, 3, 4}, 0);
  }
+  TestCompare<float>(place, abs_error, "equal", {2, 3, 4}, {3, 4}, 1);
+  TestCompare<float>(place, abs_error, "equal", {2, 3, 4, 5}, {3, 4}, 1);
+  TestCompare<float>(place, abs_error, "equal", {2, 3, 4}, {4}, 2);
+  TestCompare<float>(place, abs_error, "equal", {2, 3, 4, 5}, {5}, 3);
+  TestCompare<int32_t>(place, abs_error, "less_than", {3, 4}, {3, 4}, -1);
+  TestCompare<int64_t>(place, abs_error, "less_than", {3, 4}, {3, 4}, -1);
 }
-TEST(Compare_OP, precision) {
-// #ifdef LITE_WITH_X86
-// //   Place place(TARGET(kX86));
-// // #endif
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  test_compare(place);
 #endif
-}
 }  // namespace lite
 }  // namespace paddle
--- a/lite/tests/kernels/fill_constant_batch_size_like_compute_test.cc
+++ b/lite/tests/kernels/fill_constant_batch_size_like_compute_test.cc
@@ -136,7 +136,7 @@ TEST(fill_constant_batch_size_like, precision) {
  place = TARGET(kNPU);
  abs_error = 1e-2;  // use fp16 in npu
 #elif defined(LITE_WITH_ARM)
-  place = {TARGET(kARM), PRECISION(kAny)};
+  place = TARGET(kARM);
 #else
  return;
 #endif

--- a/lite/tests/kernels/fill_constant_compute_test.cc
+++ b/lite/tests/kernels/fill_constant_compute_test.cc
@@ -174,7 +174,7 @@ TEST(fill_constant, precision) {
  place = TARGET(kNPU);
  abs_error = 1e-2;  // use fp16 in npu
 #elif defined(LITE_WITH_ARM)
-  place = {TARGET(kARM), PRECISION(kAny)};
+  place = TARGET(kARM);
 #else
  return;
 #endif

--- a/lite/tests/kernels/gather_compute_test.cc
+++ b/lite/tests/kernels/gather_compute_test.cc
@@ -95,7 +95,7 @@ TEST(Gather, precision) {
  float abs_error = 2e-5;
  Place place;
 #if defined(LITE_WITH_ARM)
-  place = {TARGET(kARM), PRECISION(kAny)};
+  place = TARGET(kARM);
 #elif defined(LITE_WITH_XPU)
  place = TARGET(kXPU);
 #else

--- a/lite/tests/kernels/increment_compute_test.cc
+++ b/lite/tests/kernels/increment_compute_test.cc
@@ -66,12 +66,14 @@ class IncrementComputeTester : public arena::TestCase {
 };
 void test_increment(Place place, float abs_error) {
-  DDimLite dims_0{{3, 5, 4, 4}};
+  std::vector<std::vector<int64_t>> x_dims{{3, 5, 4, 4}, {3, 5}, {1}};
-  DDimLite dims_1{{3, 5}};
+  for (auto dims : x_dims) {
-  for (auto dims : {dims_0, dims_1}) {
    for (float step : {1, 2}) {
+#if LITE_WITH_NPU
+      if (dims.size() != 1) continue;
+#endif
      std::unique_ptr<arena::TestCase> tester(
-          new IncrementComputeTester(place, "def", step, dims));
+          new IncrementComputeTester(place, "def", step, DDim(dims)));
      arena::Arena arena(std::move(tester), place, abs_error);
      arena.TestPrecision();
    }
@@ -81,8 +83,11 @@ void test_increment(Place place, float abs_error) {
 TEST(Increment, precision) {
  Place place;
  float abs_error = 2e-5;
-#if defined(LITE_WITH_ARM)
+#if defined(LITE_WITH_NPU)
-  place = {TARGET(kARM), PRECISION(kAny)};
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // use fp16 in npu
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
 #else
  return;
 #endif

--- a/lite/tests/kernels/lookup_table_compute_test.cc
+++ b/lite/tests/kernels/lookup_table_compute_test.cc
@@ -112,7 +112,7 @@ TEST(LookupTable, precision) {
  float abs_error = 2e-5;
  Place place;
 #if defined(LITE_WITH_ARM)
-  place = {TARGET(kARM), PRECISION(kAny)};
+  place = TARGET(kARM);
 #elif defined(LITE_WITH_XPU)
  place = TARGET(kXPU);
 #else

--- a/lite/tests/kernels/lookup_table_dequant_compute_test.cc
+++ b/lite/tests/kernels/lookup_table_dequant_compute_test.cc
@@ -129,7 +129,7 @@ class LookupTableDequantComputeTest : public arena::TestCase {
 TEST(LookupTableDequant, precision) {
 #ifdef LITE_WITH_ARM
  float abs_error = 2e-5;
-  Place place = {TARGET(kARM), PRECISION(kAny)};
+  Place place = TARGET(kARM);
  for (auto ids_dims :
       std::vector<std::vector<int64_t>>{{5, 2, 3, 1}, {2, 3, 1}, {3, 1}}) {
    for (auto w_dims :

--- a/lite/tests/kernels/read_from_array_compute_test.cc
+++ b/lite/tests/kernels/read_from_array_compute_test.cc
@@ -88,7 +88,7 @@ TEST(ReadFromArray, precision) {
  Place place;
  float abs_error = 1e-5;
 #ifdef LITE_WITH_ARM
-  place = {TARGET(kARM), PRECISION(kAny)};
+  place = TARGET(kARM);
 #else
  return;
 #endif

--- a/lite/tests/kernels/write_to_array_compute_test.cc
+++ b/lite/tests/kernels/write_to_array_compute_test.cc
@@ -85,7 +85,7 @@ TEST(WriteToArray, precision) {
  Place place;
  float abs_error = 1e-5;
 #ifdef LITE_WITH_ARM
-  place = {TARGET(kARM), PRECISION(kAny)};
+  place = TARGET(kARM);
 #else
  return;
 #endif