PaddlePaddle / Paddle-Lite
Commit 5209b4b6
Authored Jan 14, 2020 by zhupengyang; committed by GitHub on Jan 14, 2020.
[NPU] enhance concat, nearest_interp, bilinear_interp ut (#2764)
- enhance interp InferShape
Parent: c4a87224
Showing 9 changed files, with 515 additions and 1225 deletions.
lite/kernels/npu/bridges/concat_op_test.cc          +0    -130
lite/kernels/npu/bridges/interpolate_op.cc          +14   -8
lite/kernels/npu/bridges/interpolate_op_test.cc     +0    -407
lite/operators/interpolate_op.cc                    +25   -25
lite/tests/kernels/CMakeLists.txt                   +2    -3
lite/tests/kernels/bilinear_interp_compute_test.cc  +0    -374
lite/tests/kernels/concat_compute_test.cc           +12   -18
lite/tests/kernels/interp_compute_test.cc           +462  -0
lite/tests/kernels/nearest_interp_compute_test.cc   +0    -260
lite/kernels/npu/bridges/concat_op_test.cc
deleted (100644 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/concat_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/test_helper.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace npu {
namespace bridges {

std::vector<size_t> stride_numel(const DDim& ddim) {
  std::vector<size_t> strides(ddim.size());
  strides[ddim.size() - 1] = ddim[ddim.size() - 1];
  for (int i = ddim.size() - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * ddim[i];
  }
  return strides;
}

void concat_ref(const std::shared_ptr<operators::ConcatOpLite> op) {
  Scope* scope = op->scope();
  const OpInfo* op_info = op->op_info();
  auto x = op_info->Input("X");
  std::vector<lite::Tensor*> inputs;
  for (auto var : x) {
    inputs.push_back(scope->FindVar(var)->GetMutable<lite::Tensor>());
  }
  auto out =
      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
  int axis = op_info->GetAttr<int>("axis");
  std::vector<lite::Tensor*> inputs_concat(inputs.size());
  for (int j = 0; j < inputs.size(); ++j) {
    inputs_concat[j] = inputs[j];
  }
  size_t num = inputs.size();
  int rows = 1;
  auto dim_0 = inputs[0]->dims();
  for (int i = 0; i < axis; ++i) {
    rows *= dim_0[i];
  }
  int out_rows = rows, out_cols = 0;
  std::vector<int64_t> inputs_cols(inputs.size());
  for (int i = 0; i < num; ++i) {
    int t_cols = inputs[i]->numel() / rows;
    out_cols += t_cols;
    inputs_cols[i] = t_cols;
  }
  for (int k = 0; k < out_rows; ++k) {
    float* dst_ptr = out->mutable_data<float>() + k * out_cols;
    int col_idx = 0;
    for (int j = 0; j < num; ++j) {
      int col_len = inputs_cols[j];
      const float* src_prt = inputs[j]->data<float>() + k * col_len;
      std::memcpy(dst_ptr + col_idx, src_prt, sizeof(float) * col_len);
      col_idx += col_len;
    }
  }
}

void test_concat(std::vector<std::vector<int64_t>> input, int axis) {
  std::string x_var_name = "x";
  std::string y_var_name = "y";
  std::string out_var_name = "out";
  std::string out_ref_var_name = "out_ref";

  // prepare input&output variables
  Scope scope;
  auto* x = scope.Var(x_var_name)->GetMutable<Tensor>();
  auto* y = scope.Var(y_var_name)->GetMutable<Tensor>();
  x->Resize(DDim(input[0]));
  y->Resize(DDim(input[1]));
  auto* out = scope.Var(out_var_name)->GetMutable<Tensor>();
  auto* out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
  CHECK_EQ(out->dims(), out_ref->dims());

  // initialize input&output data
  FillTensor<float>(x);
  FillTensor<float>(y);

  // initialize op desc
  cpp::OpDesc opdesc;
  opdesc.SetType("concat");
  opdesc.SetInput("X", {x_var_name, y_var_name});
  opdesc.SetOutput("Out", {out_var_name});
  opdesc.SetAttr("axis", axis);

  auto op = CreateOp<operators::ConcatOpLite>(opdesc, &scope);
  LauchOp(op, {x_var_name, y_var_name}, {out_var_name});
  out_ref->CopyDataFrom(*out);
  concat_ref(op);

  auto* out_data = out->mutable_data<float>();
  auto* out_ref_data = out_ref->mutable_data<float>();
  for (int i = 0; i < out->dims().production(); i++) {
    VLOG(5) << i;
    EXPECT_NEAR(out_data[i], out_ref_data[i], 5e-4);
  }
}

TEST(NPUBridges, concat) {
  test_concat({{3, 3, 5, 2}, {2, 3, 5, 2}}, 0);
  test_concat({{3, 5, 5, 2}, {3, 1, 5, 2}}, 1);
  test_concat({{3, 3, 2, 2}, {3, 3, 4, 2}}, 2);
  test_concat({{3, 3, 5, 2}, {3, 3, 5, 6}}, 3);
}

}  // namespace bridges
}  // namespace npu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

USE_LITE_OP(concat);
USE_NPU_BRIDGE(concat);
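The row/column flattening above is the heart of concat_ref: everything before `axis` collapses into rows, everything from `axis` on into per-input columns. A minimal standalone sketch (not part of the commit; plain std::vector shapes stand in for DDim) reproduces that arithmetic for the first test case:

// Standalone illustration of concat_ref's rows/cols bookkeeping.
#include <cstdio>
#include <vector>

int main() {
  // The two inputs of the first test case, concatenated along axis = 0.
  std::vector<std::vector<int>> shapes = {{3, 3, 5, 2}, {2, 3, 5, 2}};
  int axis = 0;
  // rows = product of dims before `axis` (taken from the first input).
  int rows = 1;
  for (int i = 0; i < axis; ++i) rows *= shapes[0][i];
  // Each input contributes numel / rows columns; the output gets their sum.
  int out_cols = 0;
  for (auto& s : shapes) {
    int numel = 1;
    for (int d : s) numel *= d;
    out_cols += numel / rows;
  }
  // axis = 0 -> rows = 1, out_cols = 90 + 60 = 150.
  std::printf("rows=%d out_cols=%d\n", rows, out_cols);
  return 0;
}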
lite/kernels/npu/bridges/interpolate_op.cc
@@ -48,11 +48,15 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) {
   auto out_w = op_info->GetAttr<int>("out_w");
   auto out_h = op_info->GetAttr<int>("out_h");
   auto align_corners = op_info->GetAttr<bool>("align_corners");
-  int align_mode = op_info->GetAttr<int>("align_mode");
+  int align_mode =
+      op_info->HasAttr("align_mode") ? op_info->GetAttr<int>("align_mode") : 1;
   auto interp_method = op_info->GetAttr<std::string>("interp_method");
-  CHECK(!(align_mode == 0 && !align_corners)) << "[NPU] align_mode = 0 && "
-                                                 "align_corners = false isn't "
-                                                 "supported in HiAI DDK";
+  if (align_mode == 0 && !align_corners) {
+    LOG(WARNING) << "[NPU] align_mode = 0 && "
+                    "align_corners = false isn't "
+                    "supported in HiAI DDK";
+    return FAILED;
+  }

   // X node
   std::shared_ptr<Node> x_node = nullptr;
@@ -93,10 +97,12 @@ int InterpolateConverter(void* ctx, OpLite* op, KernelBase* kernel) {
     if (interp_method == "bilinear") {
       const float largest_multiple = 7.0f;
       float multiple = static_cast<float>(x_h * x_w) / (out_h * out_w);
-      CHECK_LT(multiple, largest_multiple)
-          << "[NPU] multiple=(ih*iw)/(oh*ow)=" << multiple
-          << " is too large, should not exceed " << largest_multiple
-          << " in HiAI DDK";
+      if (multiple >= largest_multiple) {
+        LOG(WARNING) << "[NPU] multiple=(ih*iw)/(oh*ow)=" << multiple
+                     << " is too large, should not exceed " << largest_multiple
+                     << " in HiAI DDK";
+        return FAILED;
+      }
     }
     out_size_node =
         graph->Add(out_name + "/out_size", std::vector<int>({out_h, out_w}));
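As a concrete check of the bound enforced here: shrinking a 14×14 input to 7×7 gives multiple = 196/49 = 4, which the converter accepts, while 28×28 to 7×7 gives 784/49 = 16, which now makes the bridge fall back with FAILED instead of aborting the whole process via CHECK.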
lite/kernels/npu/bridges/interpolate_op_test.cc
deleted (100644 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/interpolate_op.h"
#include <gtest/gtest.h>
#include <random>
#include "lite/core/op_registry.h"
#include "lite/kernels/npu/bridges/registry.h"
#include "lite/kernels/npu/bridges/test_helper.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace npu {
namespace bridges {

template <typename DType>
void bilinear_interp_ref(const std::shared_ptr<operators::InterpolateOp> op) {
  auto scope = op->scope();
  auto op_info = op->op_info();
  auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
  auto out =
      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
  auto x_dims = x->dims();
  int batch_size = x_dims[0];
  int channel_size = x_dims[1];
  auto x_h = x_dims[2];
  auto x_w = x_dims[3];
  CHECK_EQ(x_dims.size(), 4);
  auto scale = op_info->GetAttr<float>("scale");
  auto out_w = op_info->GetAttr<int>("out_w");
  auto out_h = op_info->GetAttr<int>("out_h");
  auto align_corners = op_info->GetAttr<bool>("align_corners");
  int align_mode = op_info->GetAttr<int>("align_mode");
  auto interp_method = op_info->GetAttr<std::string>("interp_method");

  // calc real out_h and out_w
  if (scale > 0) {
    out_h = static_cast<int>(x_h * scale);
    out_w = static_cast<int>(x_w * scale);
  }
  if (op_info->HasInput("OutSize")) {
    auto out_size_var_names = op_info->Input("OutSize");
    if (out_size_var_names.size() > 0) {
      auto out_size_var_name = out_size_var_names.front();
      auto out_size =
          scope->FindVar(out_size_var_name)->GetMutable<lite::Tensor>();
      auto out_size_dims = out_size->dims();
      CHECK_EQ(out_size_dims.size(), 1);
      CHECK_EQ(out_size_dims.production(), 2);
      auto out_size_data = out_size->mutable_data<int>();
      out_h = out_size_data[0];
      out_w = out_size_data[1];
    }
  }
  CHECK_GT(out_h, 0);
  CHECK_GT(out_w, 0);
  out->Resize({batch_size, channel_size, out_h, out_w});

  // copy from x if no change
  if (x_h == out_h && x_w == out_w) {
    out->CopyDataFrom(*x);
    return;
  }

  float ratio_h = 0.f;
  float ratio_w = 0.f;
  if (out_h > 1) {
    ratio_h = (align_corners) ? static_cast<float>(x_h - 1) / (out_h - 1)
                              : static_cast<float>(x_h) / out_h;
  }
  if (out_w > 1) {
    ratio_w = (align_corners) ? static_cast<float>(x_w - 1) / (out_w - 1)
                              : static_cast<float>(x_w) / out_w;
  }

  // naive bilinear interpolation
  auto x_data = x->mutable_data<DType>();
  auto out_data = out->mutable_data<DType>();
  bool align_flag = (align_mode == 0 && !align_corners);

  std::vector<int> vy_n, vy_s;
  std::vector<float> vd_n, vd_s;
  vy_n.reserve(out_h);
  vy_s.reserve(out_h);
  vd_n.reserve(out_h);
  vd_s.reserve(out_h);
  for (int k = 0; k < out_h; k++) {
    int yn = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
                        : static_cast<int>(ratio_h * k);
    yn = (yn > 0) ? yn : 0;
    int ys = (yn + 1) < (x_h - 1) ? (yn + 1) : (x_h - 1);
    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
    idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
    float dn = align_flag ? idx_src_y - yn : ratio_h * k - yn;
    float ds = 1.f - dn;
    {
      vy_n[k] = yn;
      vy_s[k] = ys;
      vd_n[k] = dn;
      vd_s[k] = ds;
    }
  }

  std::vector<int> vx_w, vx_e;
  std::vector<float> vd_w, vd_e;
  vx_w.reserve(out_w);
  vx_e.reserve(out_w);
  vd_w.reserve(out_w);
  vd_e.reserve(out_w);
  for (int l = 0; l < out_w; l++) {
    int xw = (align_mode == 0 && !align_corners)
                 ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
                 : static_cast<int>(ratio_w * l);
    xw = (xw > 0) ? xw : 0;
    int xe = (xw + 1) < (x_w - 1) ? (xw + 1) : (x_w - 1);
    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
    idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
    float dw = align_flag ? idx_src_x - xw : ratio_w * l - xw;
    float de = 1.f - dw;
    {
      vx_w[l] = xw;
      vx_e[l] = xe;
      vd_w[l] = dw;
      vd_e[l] = de;
    }
  }

  std::vector<int64_t> x_strides(x_dims.size(), 1);
  for (int idx = x_strides.size() - 2; idx >= 0; idx--) {
    x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1];
  }
  for (int i = 0; i < batch_size; i++) {
    for (int j = 0; j < channel_size; j++) {
      for (int k = 0; k < out_h; k++) {
        for (int l = 0; l < out_w; l++) {
          DType x0 = x_data[i * x_strides[0] + j * x_strides[1] +
                            vy_n[k] * x_strides[2] + vx_w[l] * x_strides[3]];
          DType x1 = x_data[i * x_strides[0] + j * x_strides[1] +
                            vy_s[k] * x_strides[2] + vx_w[l] * x_strides[3]];
          DType x2 = x_data[i * x_strides[0] + j * x_strides[1] +
                            vy_n[k] * x_strides[2] + vx_e[l] * x_strides[3]];
          DType x3 = x_data[i * x_strides[0] + j * x_strides[1] +
                            vy_s[k] * x_strides[2] + vx_e[l] * x_strides[3]];
          *out_data = x0 * vd_s[k] * vd_e[l] + x1 * vd_n[k] * vd_e[l] +
                      x2 * vd_s[k] * vd_w[l] + x3 * vd_n[k] * vd_w[l];
          out_data++;
        }
      }
    }
  }
}

template <typename DType>
void nearest_interp_ref(const std::shared_ptr<operators::InterpolateOp> op) {
  auto scope = op->scope();
  auto op_info = op->op_info();
  auto x = scope->FindVar(op_info->Input("X").front())->GetMutable<Tensor>();
  auto out =
      scope->FindVar(op_info->Output("Out").front())->GetMutable<Tensor>();
  auto x_dims = x->dims();
  CHECK_EQ(x_dims.size(), 4);
  auto scale = op_info->GetAttr<float>("scale");
  auto out_w = op_info->GetAttr<int>("out_w");
  auto out_h = op_info->GetAttr<int>("out_h");
  auto align_corners = op_info->GetAttr<bool>("align_corners");
  // int align_mode = op_info->GetAttr<int>("align_mode");
  auto interp_method = op_info->GetAttr<std::string>("interp_method");
  CHECK_EQ(interp_method, "nearest");

  int x_h = x_dims[2];
  int x_w = x_dims[3];
  if (scale > 0) {
    out_h = static_cast<int>(x_h * scale);
    out_w = static_cast<int>(x_w * scale);
  }
  if (op_info->HasInput("OutSize")) {
    auto out_size_var_names = op_info->Input("OutSize");
    if (out_size_var_names.size() > 0) {
      auto out_size_var_name = out_size_var_names.front();
      auto out_size =
          scope->FindVar(out_size_var_name)->GetMutable<lite::Tensor>();
      CHECK_EQ(out_size->numel(), 2);
      auto out_size_data = out_size->mutable_data<int>();
      out_h = out_size_data[0];
      out_w = out_size_data[1];
    }
  }
  CHECK_GT(out_h, 0);
  CHECK_GT(out_w, 0);
  out->Resize({x_dims[0], x_dims[1], out_h, out_w});

  float ratio_h = 0.f;
  float ratio_w = 0.f;
  if (out_h > 1) {
    ratio_h = align_corners ? static_cast<float>(x_h - 1.0) / (out_h - 1.0)
                            : static_cast<float>(x_h) / out_h;
  }
  if (out_w > 1) {
    ratio_w = align_corners ? static_cast<float>(x_w - 1.0) / (out_w - 1.0)
                            : static_cast<float>(x_w) / out_w;
  }

  auto x_data = x->data<DType>();
  auto out_data = out->mutable_data<DType>();
  auto out_dims = out->dims();
  std::vector<int64_t> x_strides(x_dims.size(), 1);
  for (int idx = x_strides.size() - 2; idx >= 0; idx--) {
    x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1];
  }
  for (int n = 0; n < out_dims[0]; n++) {
    for (int c = 0; c < out_dims[1]; c++) {
      for (int h = 0; h < out_dims[2]; h++) {
        for (int w = 0; w < out_dims[3]; w++) {
          int in_i = ratio_h * h;
          int in_j = ratio_w * w;
          if (align_corners) {
            in_i = ratio_h * h + 0.5;
            in_j = ratio_w * w + 0.5;
          }
          *out_data = x_data[n * x_strides[0] + c * x_strides[1] +
                             in_i * x_strides[2] + in_j * x_strides[3]];
          out_data++;
        }
      }
    }
  }
}

void test_interpolate(int bs,
                      int ic,
                      int ih,
                      int iw,
                      int oh,
                      int ow,
                      float scale,
                      int out_size_h,
                      int out_size_w,
                      bool align_corners,
                      int align_mode,
                      std::string interp_method) {
  // prepare input&output variables
  Scope scope;
  std::string x_var_name("x");
  std::string out_size_var_name("out_size");
  std::string out_var_name("out");
  std::string out_ref_var_name("out_ref");
  auto x = scope.Var(x_var_name)->GetMutable<Tensor>();
  auto out_size = scope.Var(out_size_var_name)->GetMutable<Tensor>();
  auto out = scope.Var(out_var_name)->GetMutable<Tensor>();
  auto out_ref = scope.Var(out_ref_var_name)->GetMutable<Tensor>();
  x->Resize({bs, ic, ih, iw});
  out_size->Resize({2});

  // initialize input&output data
  FillTensor<float, int>(x);

  // initialize op desc
  cpp::OpDesc opdesc;
  opdesc.SetType(interp_method + "_interp");
  opdesc.SetInput("X", {x_var_name});
  opdesc.SetOutput("Out", {out_var_name});
  opdesc.SetAttr("out_h", oh);
  opdesc.SetAttr("out_w", ow);
  opdesc.SetAttr("scale", scale);
  opdesc.SetAttr("align_corners", static_cast<bool>(align_corners));
  opdesc.SetAttr("align_mode", static_cast<int>(align_mode));
  opdesc.SetAttr("interp_method", interp_method);
  if (out_size_h > 0 && out_size_w > 0) {
    auto out_size_dims = out_size->dims();
    CHECK_EQ(out_size_dims.size(), 1);
    CHECK_EQ(out_size_dims.production(), 2);
    auto out_size_data = out_size->mutable_data<int>();
    out_size_data[0] = out_size_h;
    out_size_data[1] = out_size_w;
    opdesc.SetInput("OutSize", {out_size_var_name});
  }

  // create op and execute reference implementation
  auto op = CreateOp<operators::InterpolateOp>(opdesc, &scope);
  if (interp_method == "bilinear") {
    bilinear_interp_ref<float>(op);
  } else {
    nearest_interp_ref<float>(op);
  }
  out_ref->CopyDataFrom(*out);

  // convert op to NPU model, then run it on NPU
  LauchOp(op, {x_var_name}, {out_var_name});

  // compare results
  auto out_dims = out->dims();
  auto out_ref_dims = out_ref->dims();
  CHECK_EQ(out_dims.size(), out_ref_dims.size());
  for (int i = 0; i < out_dims.size(); i++) {
    CHECK_EQ(out_dims[i], out_ref_dims[i]);
  }
  auto* out_data = out->mutable_data<float>();
  auto* out_ref_data = out_ref->mutable_data<float>();
  for (int i = 0; i < out->dims().production(); i++) {
    VLOG(5) << i;
    EXPECT_NEAR(out_data[i], out_ref_data[i], 1e-2f);
  }
}

TEST(NPUBridges, bilinear_interp) {
#if 1
  for (auto bs : {1, 3}) {
    for (auto ic : {3, 4}) {
      for (auto ih : {4, 5}) {
        for (auto iw : {3, 6}) {
          for (auto oh : {0, 3, 8}) {
            for (auto ow : {0, 4, 9}) {
              for (auto scale : {0.f, 0.5f, 0.6f, 2.0f, 2.2f}) {
                for (auto out_size_h : {0, 3, 11}) {
                  for (auto out_size_w : {0, 2, 12}) {
                    for (auto align_corners : {true, false}) {
                      for (auto align_mode : {0, 1}) {
                        for (auto interp_method : {"bilinear", "nearest"}) {
                          int act_oh = 0, act_ow = 0;
                          if (out_size_h > 0 && out_size_w > 0) {
                            act_oh = out_size_h;
                            act_ow = out_size_w;
                          } else if (scale > 1e-5) {
                            act_oh = static_cast<int>(ih * scale);
                            act_ow = static_cast<int>(iw * scale);
                          } else if (oh > 0 && ow > 0) {
                            act_oh = oh;
                            act_ow = ow;
                          }
                          if (act_oh <= 0 || act_ow <= 0) {
                            continue;
                          }
                          // TODO(hong19860320) multiple=(ih*iw)/(oh*ow) should
                          // not exceed 7.0 in NPU DDK, delete the following
                          // lines if the limination is removed.
                          const float largest_multiple = 7.0f;
                          float multiple =
                              static_cast<float>(ih * iw) / (act_oh * act_ow);
                          if (multiple > largest_multiple) {
                            continue;
                          }
                          if (align_mode == 0 && !align_corners) {
                            continue;
                          }
                          VLOG(3) << "bs: " << bs << " ic: " << ic
                                  << " ih: " << ih << " iw: " << iw
                                  << " oh: " << oh << " ow: " << ow
                                  << " scale: " << scale
                                  << " out_size: " << out_size_h << ","
                                  << out_size_w
                                  << " align_corners: " << align_corners
                                  << " align_mode: " << align_mode;
                          test_interpolate(bs,
                                           ic,
                                           ih,
                                           iw,
                                           oh,
                                           ow,
                                           scale,
                                           out_size_h,
                                           out_size_w,
                                           align_corners,
                                           align_mode,
                                           interp_method);
                        }
                      }
                    }
                  }
                }
              }
            }
          }
        }
      }
    }
  }
#else
  test_interpolate(1, 1, 4, 3, 0, 0, 1.f, 3, 6, false, 1, "nearest");
#endif
}

}  // namespace bridges
}  // namespace npu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

USE_LITE_OP(bilinear_interp);
USE_NPU_BRIDGE(bilinear_interp);
USE_LITE_OP(nearest_interp);
USE_NPU_BRIDGE(nearest_interp);
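For reference, the interpolation that bilinear_interp_ref computes can be written compactly. In the default align_mode = 1 case (the align_flag variants substitute r(k + 0.5) - 0.5 for rk), the output at index (k, l), with r_h and r_w the ratios above and indices clamped as in the code, is:

y_n = \lfloor r_h k \rfloor, \quad y_s = \min(y_n + 1,\, H_{in} - 1), \quad d_n = r_h k - y_n, \quad d_s = 1 - d_n
x_w = \lfloor r_w l \rfloor, \quad x_e = \min(x_w + 1,\, W_{in} - 1), \quad d_w = r_w l - x_w, \quad d_e = 1 - d_w
o(k, l) = x(y_n, x_w)\, d_s d_e + x(y_s, x_w)\, d_n d_e + x(y_n, x_e)\, d_s d_w + x(y_s, x_e)\, d_n d_w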
lite/operators/interpolate_op.cc
@@ -35,8 +35,7 @@ bool InterpolateOp::CheckShape() const {
 }

 bool InterpolateOp::InferShape() const {
-  auto* X = param_.X;
-  auto* OutSize = param_.OutSize;
+  auto X = param_.X;
   int n = X->dims()[0];
   int c = X->dims()[1];
@@ -46,39 +45,40 @@ bool InterpolateOp::InferShape() const {
   int out_w;
+  auto SizeTensor = param_.SizeTensor;
+  auto OutSize = param_.OutSize;
+  auto Scale = param_.Scale;
   if (!SizeTensor.empty()) {
-    CHECK(SizeTensor.size() == 2)
+    CHECK_EQ(SizeTensor.size(), 2)
         << "Input(SizeTensor)'size of Op(interpolate) must be 2. "
            "Attr(out_shape)'s length must be 2 for 4-D input tensor.";
     out_h = SizeTensor[0]->data<int>()[0];
     out_w = SizeTensor[1]->data<int>()[0];
   } else if (OutSize) {
     auto OutSize_dims = OutSize->dims();
     CHECK_EQ(OutSize_dims.size(), 1) << "Input(OutSize)'s dims size must be 1";
     CHECK_EQ(OutSize_dims[0], 2) << "OutSize's dim[0] must be 2";
     auto OutSize_data = OutSize->data<int>();
     out_h = OutSize_data[0];
     out_w = OutSize_data[1];
+  } else if (param_.out_h > 0 && param_.out_w > 0) {
+    out_h = param_.out_h;
+    out_w = param_.out_w;
+    param_.Out->Resize({n, c, out_h, out_w});
+    return true;
+  }
-  auto Scale = param_.Scale;
-  if (Scale) {
-    auto scale_dims = Scale->dims();
-    CHECK(scale_dims.size() == 1) << "Scale's dimension size must be 1.";
-    out_h = -1;
-    out_w = -1;
-  } else {
-    auto scale = param_.scale;
-    if (scale > 0) {
-      out_h = static_cast<int>(h * scale);
-      out_w = static_cast<int>(w * scale);
-      out_h = out_h > 0 ? out_h : -1;
-      out_w = out_w > 0 ? out_w : -1;
+  float scale = -1.f;
+  if (Scale) {
+    auto Scale_dims = Scale->dims();
+    CHECK_EQ(Scale_dims.size(), 1) << "Scale's dimension size must be 1.";
+    scale = Scale->data<float>()[0];
+  } else {
+    out_h = param_.out_h;
+    out_w = param_.out_w;
+    scale = param_.scale;
+  }
+  CHECK(scale > 0) << "scale must large than 0.";
+  out_h = static_cast<int>(h * scale);
+  out_w = static_cast<int>(w * scale);
   }
-  if (OutSize != nullptr) {
-    auto out_lod = param_.Out->mutable_lod();
-    *out_lod = param_.X->lod();
-  }
+  auto out_lod = param_.Out->mutable_lod();
+  *out_lod = param_.X->lod();
   param_.Out->Resize({n, c, out_h, out_w});
   return true;
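The net effect of the rewrite is a fixed resolution order for the output size: SizeTensor first, then OutSize, then positive out_h/out_w attributes, and only then a scale (taken from the Scale input if present, otherwise from the scale attribute, and CHECKed to be positive). A small self-contained sketch of that priority — hypothetical struct and names, not Paddle-Lite API:

// Sketch of the resolution order applied by the revised InferShape.
#include <cassert>

struct InterpShapeInputs {
  bool has_size_tensor;        // Input(SizeTensor): two 1-element tensors
  bool has_out_size;           // Input(OutSize): one 2-element tensor
  int attr_out_h, attr_out_w;  // Attr(out_h) / Attr(out_w)
  bool has_scale_tensor;       // Input(Scale): 1-element float tensor
  float attr_scale;            // Attr(scale)
};

enum class SizeSource { kSizeTensor, kOutSize, kAttrs, kScale };

SizeSource ResolveSizeSource(const InterpShapeInputs& in) {
  if (in.has_size_tensor) return SizeSource::kSizeTensor;  // highest priority
  if (in.has_out_size) return SizeSource::kOutSize;
  if (in.attr_out_h > 0 && in.attr_out_w > 0) return SizeSource::kAttrs;
  // Otherwise a scale must be available, from the Scale input if present,
  // else from the scale attribute; InferShape CHECKs that it is > 0.
  return SizeSource::kScale;
}

int main() {
  InterpShapeInputs in{false, false, 8, 12, false, 2.f};
  assert(ResolveSizeSource(in) == SizeSource::kAttrs);
  return 0;
}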
lite/tests/kernels/CMakeLists.txt
@@ -25,7 +25,7 @@ if((NOT LITE_WITH_OPENCL AND NOT LITE_WITH_FPGA AND NOT LITE_WITH_BM) AND (LITE_
#lite_cc_test(test_kernel_increment_compute SRCS increment_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_write_to_array_compute SRCS write_to_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
#lite_cc_test(test_kernel_read_from_array_compute SRCS read_from_array_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-lite_cc_test(test_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+lite_cc_test(test_kernel_concat_compute SRCS concat_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 lite_cc_test(test_kernel_transpose_compute SRCS transpose_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 lite_cc_test(test_kernel_reshape_compute SRCS reshape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 lite_cc_test(test_kernel_layer_norm_compute SRCS layer_norm_compute_test.cc DEPS arena_framework ${xpu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
@@ -59,8 +59,7 @@ endif()
 lite_cc_test(test_kernel_pad2d_compute SRCS pad2d_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 lite_cc_test(test_kernel_prior_box_compute SRCS prior_box_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 lite_cc_test(test_kernel_negative_compute SRCS negative_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-lite_cc_test(test_kernel_bilinear_interp_compute SRCS bilinear_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
-lite_cc_test(test_kernel_nearest_interp_compute SRCS nearest_interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
+lite_cc_test(test_kernel_interp_compute SRCS interp_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 lite_cc_test(test_kernel_shape_compute SRCS shape_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${bm_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 lite_cc_test(test_kernel_crop_compute SRCS crop_compute_test.cc DEPS arena_framework ${xpu_kernels} ${npu_kernels} ${x86_kernels} ${bm_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
 lite_cc_test(test_kernel_sequence_expand_compute SRCS sequence_expand_compute_test.cc DEPS arena_framework ${xpu_kernels} ${bm_kernels} ${npu_kernels} ${x86_kernels} ${cuda_kernels} ${arm_kernels} ${lite_ops} ${host_kernels})
lite/tests/kernels/bilinear_interp_compute_test.cc
deleted (100644 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <string>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {

inline std::vector<int> get_new_shape(
    std::vector<const lite::Tensor*> list_new_shape_tensor) {
  // get tensor from
  std::vector<int> vec_new_shape;
  for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
    auto tensor = list_new_shape_tensor[i];
    vec_new_shape.push_back(static_cast<int32_t>(*(tensor->data<int32_t>())));
  }
  return vec_new_shape;
}

template <typename T>
inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
  std::vector<T> vec_new_data;
  auto* new_data = new_data_tensor->data<T>();
  lite::Tensor cpu_starts_tensor;
  vec_new_data =
      std::vector<T>(new_data, new_data + new_data_tensor->dims().production());
  return vec_new_data;
}

template <typename dtype>
void resize_bilinear_align(std::vector<const lite::Tensor*> inputs,
                           lite::Tensor* output) {
  int hin = inputs[0]->dims()[2];
  int win = inputs[0]->dims()[3];
  int channels = inputs[0]->dims()[1];
  int num = inputs[0]->dims()[0];
  int hout = output->dims()[2];
  int wout = output->dims()[3];
  dtype scale_w = static_cast<dtype>(win - 1) / (wout - 1);
  dtype scale_h = static_cast<dtype>(hin - 1) / (hout - 1);
  const dtype* src = inputs[0]->data<dtype>();
  dtype* dst = output->mutable_data<dtype>();
  int dst_stride_w = 1;
  int dst_stride_h = wout;
  int dst_stride_c = wout * hout;
  int dst_stride_batch = wout * hout * channels;
  int src_stride_w = 1;
  int src_stride_h = win;
  int src_stride_c = win * hin;
  int src_stride_batch = win * hin * channels;
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < channels; ++c) {
      int src_index = n * src_stride_batch + c * src_stride_c;
      for (int h = 0; h < hout; ++h) {
        for (int w = 0; w < wout; ++w) {
          dtype fw = w * scale_w;
          dtype fh = h * scale_h;
          int w_start = static_cast<int>(fw);
          int w_id = w_start < win - 1 ? 1 : 0;
          int w_end = static_cast<int>(fw + w_id);
          int h_start = static_cast<int>(fh);
          int h_id = h_start < hin - 1 ? 1 : 0;
          int h_end = static_cast<int>(fh + h_id);
          fw -= w_start;
          fh -= h_start;
          const dtype w00 = (1.0 - fh) * (1.0 - fw);
          const dtype w01 = fw * (1.0 - fh);
          const dtype w10 = fh * (1.0 - fw);
          const dtype w11 = fw * fh;
          dtype tl =
              src[src_index + w_start * src_stride_w + h_start * src_stride_h];
          dtype tr =
              src[src_index + w_end * src_stride_w + h_start * src_stride_h];
          dtype bl =
              src[src_index + w_start * src_stride_w + h_end * src_stride_h];
          dtype br =
              src[src_index + w_end * src_stride_w + h_end * src_stride_h];
          int dst_index = n * dst_stride_batch + c * dst_stride_c +
                          h * dst_stride_h + w * dst_stride_w;
          dst[dst_index] =
              static_cast<dtype>(w00 * tl + w01 * tr + w10 * bl + w11 * br);
        }
      }
    }
  }
}

template <typename dtype>
void resize_bilinear_no_align(std::vector<const lite::Tensor*> inputs,
                              lite::Tensor* output) {
  int hin = inputs[0]->dims()[2];
  int win = inputs[0]->dims()[3];
  int channels = inputs[0]->dims()[1];
  int num = inputs[0]->dims()[0];
  int hout = output->dims()[2];
  int wout = output->dims()[3];
  dtype scale_w = static_cast<dtype>(win) / (wout);
  dtype scale_h = static_cast<dtype>(hin) / (hout);
  const dtype* src = inputs[0]->data<dtype>();
  dtype* dst = output->mutable_data<dtype>();
  int dst_stride_w = 1;
  int dst_stride_h = wout;
  int dst_stride_c = wout * hout;
  int dst_stride_batch = wout * hout * channels;
  int src_stride_w = 1;
  int src_stride_h = win;
  int src_stride_c = win * hin;
  int src_stride_batch = win * hin * channels;
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < channels; ++c) {
      int src_index = n * src_stride_batch + c * src_stride_c;
      for (int h = 0; h < hout; ++h) {
        for (int w = 0; w < wout; ++w) {
          dtype fw = scale_w * (w + 0.5f) - 0.5f;
          fw = (fw < 0) ? 0 : fw;
          dtype fh = scale_h * (h + 0.5f) - 0.5f;
          fh = (fh < 0) ? 0 : fh;
          int w_start = static_cast<int>(fw);
          int w_id = w_start < win - 1 ? 1 : 0;
          int w_end = static_cast<int>(fw + w_id);
          int h_start = static_cast<int>(fh);
          int h_id = h_start < hin - 1 ? 1 : 0;
          int h_end = static_cast<int>(fh + h_id);
          fw -= w_start;
          fh -= h_start;
          const dtype w00 = (1.0 - fh) * (1.0 - fw);
          const dtype w01 = fw * (1.0 - fh);
          const dtype w10 = fh * (1.0 - fw);
          const dtype w11 = fw * fh;
          dtype tl =
              src[src_index + w_start * src_stride_w + h_start * src_stride_h];
          dtype tr =
              src[src_index + w_end * src_stride_w + h_start * src_stride_h];
          dtype bl =
              src[src_index + w_start * src_stride_w + h_end * src_stride_h];
          dtype br =
              src[src_index + w_end * src_stride_w + h_end * src_stride_h];
          int dst_index = n * dst_stride_batch + c * dst_stride_c +
                          h * dst_stride_h + w * dst_stride_w;
          dst[dst_index] =
              static_cast<dtype>(w00 * tl + w01 * tr + w10 * bl + w11 * br);
        }
      }
    }
  }
}

class BilinearInterpComputeTester : public arena::TestCase {
 protected:
  // common attributes for this op.
  std::string input0_ = "X";
  std::string sizetensor0_ = "SizeTensor0";
  std::string sizetensor1_ = "SizeTensor1";
  std::string input_scale_ = "Scale";
  std::string input1_ = "OutSize";
  std::string output_ = "Out";
  float height_scale_ = 0.f;
  float width_scale_ = 0.f;
  int out_height_ = -1;
  int out_width_ = -1;
  int outsize_height_ = -1;
  int outsize_width_ = -1;
  bool align_corners_ = true;
  std::string interp_method_ = "Bilinear";
  DDim _dims0_{{1, 1, 16, 16}};
  DDim _dims1_{{2}};
  DDim sizetensor_dims_{{1}};
  DDim scale_dims_{{1}};

 public:
  BilinearInterpComputeTester(const Place& place,
                              const std::string& alias,
                              float scale,
                              int out_height,
                              int out_width,
                              int outsize_height,
                              int outsize_width,
                              bool align_corners,
                              std::string interp_method)
      : TestCase(place, alias),
        height_scale_(scale),
        width_scale_(scale),
        out_height_(out_height),
        out_width_(out_width),
        outsize_height_(outsize_height),
        outsize_width_(outsize_width),
        align_corners_(align_corners),
        interp_method_(interp_method) {}

  void RunBaseline(Scope* scope) override {
    width_scale_ = height_scale_;
    std::vector<const lite::Tensor*> inputs;
    inputs.emplace_back(scope->FindTensor(input0_));
    if (outsize_height_ > 0 && outsize_width_ > 0) {
      inputs.emplace_back(scope->FindTensor(input1_));
    }
    std::vector<const lite::Tensor*> SizeTensor;
    if (outsize_height_ > 0 && outsize_width_ > 0) {
      SizeTensor.emplace_back(scope->FindTensor(sizetensor0_));
      SizeTensor.emplace_back(scope->FindTensor(sizetensor1_));
    }
    const lite::Tensor* input_scale = scope->FindTensor(input_scale_);
    float scale = height_scale_;
    int in_h = inputs[0]->dims()[2];
    int in_w = inputs[0]->dims()[3];
    if (SizeTensor.size() > 0) {
      auto new_size = get_new_shape(SizeTensor);
      out_height_ = new_size[0];
      out_width_ = new_size[1];
    } else {
      auto scale_tensor = input_scale;
      if (scale_tensor != nullptr) {
        auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
        scale = scale_data[0];
      }
      if (scale > 0) {
        out_height_ = static_cast<int>(in_h * scale);
        out_width_ = static_cast<int>(in_w * scale);
      }
      if (inputs.size() > 1) {
        auto out_size = inputs[1];
        auto out_size_data = get_new_data_from_tensor<int>(out_size);
        out_height_ = out_size_data[0];
        out_width_ = out_size_data[1];
      }
    }
    height_scale_ = scale;
    width_scale_ = scale;

    if (out_width_ != -1 && out_height_ != -1) {
      height_scale_ = static_cast<float>(out_height_ / inputs[0]->dims()[2]);
      width_scale_ = static_cast<float>(out_width_ / inputs[0]->dims()[3]);
    }
    auto* outputs = scope->NewTensor(output_);
    CHECK(outputs);
    int num_cout = inputs[0]->dims()[0];
    int c_cout = inputs[0]->dims()[1];
    outputs->Resize({num_cout, c_cout, out_height_, out_width_});
    if (align_corners_) {
      resize_bilinear_align<float>(inputs, outputs);
    } else {
      resize_bilinear_no_align<float>(inputs, outputs);
    }
  }

  void PrepareOpDesc(cpp::OpDesc* op_desc) {
    op_desc->SetType("bilinear_interp");
    op_desc->SetInput("X", {input0_});
    if (outsize_height_ > 0 && outsize_width_ > 0) {
      op_desc->SetInput("OutSize", {input1_});
      op_desc->SetInput("SizeTensor", {sizetensor0_, sizetensor1_});
    }
    if (height_scale_ > 0) {
      op_desc->SetInput("Scale", {input_scale_});
    }
    op_desc->SetOutput("Out", {output_});
    op_desc->SetAttr("scale", height_scale_);
    op_desc->SetAttr("out_h", out_height_);
    op_desc->SetAttr("out_w", out_width_);
    op_desc->SetAttr("align_corners", align_corners_);
    op_desc->SetAttr("interp_method", interp_method_);
  }

  void PrepareData() override {
    std::vector<float> data0(_dims0_.production());
    for (int i = 0; i < _dims0_.production(); i++) {
      data0[i] = i * 1.1;
    }
    SetCommonTensor(input0_, _dims0_, data0.data());
    if (outsize_height_ > 0 && outsize_width_ > 0) {
      std::vector<int> data1(2);
      data1[0] = outsize_height_;
      data1[1] = outsize_width_;
      SetCommonTensor(input1_, _dims1_, data1.data());
      std::vector<int> sizetensor_data(1);
      sizetensor_data[0] = outsize_height_;
      SetCommonTensor(sizetensor0_, sizetensor_dims_, sizetensor_data.data());
      sizetensor_data[0] = outsize_width_;
      SetCommonTensor(sizetensor1_, sizetensor_dims_, sizetensor_data.data());
    }
    if (height_scale_ > 0) {
      std::vector<float> scale_data(1);
      scale_data[0] = height_scale_;
      SetCommonTensor(input_scale_, scale_dims_, scale_data.data());
    }
  }
};

void test_bilinear_interp(Place place) {
  std::string interp_method = "Bilinear";
  for (float scale : {2., 1., 0.3}) {
    for (bool align_corners : {true, false}) {
      std::unique_ptr<arena::TestCase> tester(new BilinearInterpComputeTester(
          place, "def", scale, -1, -1, -1, -1, align_corners, interp_method));
      arena::Arena arena(std::move(tester), place, 5e-5);
      arena.TestPrecision();
    }
  }
  for (int out_height : {8, 16, 24}) {
    for (int out_width : {8, 16, 24}) {
      for (bool align_corners : {true, false}) {
        std::unique_ptr<arena::TestCase> tester(
            new BilinearInterpComputeTester(place,
                                            "def",
                                            0,
                                            out_height,
                                            out_width,
                                            -1,
                                            -1,
                                            align_corners,
                                            interp_method));
        arena::Arena arena(std::move(tester), place, 5e-5);
        arena.TestPrecision();
      }
    }
  }
  for (int outsize_height : {8, 16, 24}) {
    for (int outsize_width : {8, 16, 24}) {
      for (bool align_corners : {true, false}) {
        std::unique_ptr<arena::TestCase> tester(
            new BilinearInterpComputeTester(place,
                                            "def",
                                            0,
                                            -1,
                                            -1,
                                            outsize_height,
                                            outsize_width,
                                            align_corners,
                                            interp_method));
        arena::Arena arena(std::move(tester), place, 5e-5);
        arena.TestPrecision();
      }
    }
  }
}

TEST(BilinearInterp, precision) {
  // #ifdef LITE_WITH_X86
  //   Place place(TARGET(kX86));
  // #endif
#ifdef LITE_WITH_ARM
  Place place(TARGET(kARM));
  test_bilinear_interp(place);
#endif
}

}  // namespace lite
}  // namespace paddle
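All of these reference kernels derive the input-to-output ratio the same way, and the align_corners flag is the only thing that changes it:

r = \begin{cases} \dfrac{H_{in} - 1}{H_{out} - 1}, & \text{align\_corners} = \text{true},\\[1ex] \dfrac{H_{in}}{H_{out}}, & \text{align\_corners} = \text{false}, \end{cases} \qquad \text{and likewise for } W.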
lite/tests/kernels/concat_compute_test.cc
@@ -142,35 +142,29 @@ class ConcateComputeTester : public arena::TestCase {
 TEST(Concat, precision) {
-  LOG(INFO) << "test concat op, kARM";
-#ifdef LITE_WITH_ARM
-  Place place(TARGET(kARM));
-  for (int axis : {1, 2}) {
-    for (bool is_use_axis_tensor : {false, true}) {
-      LOG(INFO) << "axis:" << axis
-                << ", is_use_axis_tensor:" << is_use_axis_tensor;
-      std::unique_ptr<arena::TestCase> tester(
-          new ConcateComputeTester(place, "def", axis, is_use_axis_tensor));
-      arena::Arena arena(std::move(tester), place, 2e-5);
-      arena.TestPrecision();
-    }
-  }
-#endif
-#ifdef LITE_WITH_X86
-  Place place(TARGET(kX86));
-  LOG(INFO) << "test concate op, x86";
+  Place place;
+  float abs_error = 2e-5;
+#if defined(LITE_WITH_NPU)
+  place = TARGET(kNPU);
+  abs_error = 1e-2;  // use fp16 in npu
+#elif defined(LITE_WITH_ARM)
+  place = TARGET(kARM);
+#elif defined(LITE_WITH_X86)
+  place = TARGET(kX86);
+#else
+  return;
+#endif
   for (int axis : {1, 2}) {
     for (bool is_use_axis_tensor : {false, true}) {
       LOG(INFO) << "axis:" << axis
                 << ", is_use_axis_tensor:" << is_use_axis_tensor;
       std::unique_ptr<arena::TestCase> tester(
           new ConcateComputeTester(place, "def", axis, is_use_axis_tensor));
-      arena::Arena arena(std::move(tester), place, 2e-5);
+      arena::Arena arena(std::move(tester), place, abs_error);
       arena.TestPrecision();
     }
   }
-#endif
 }

 }  // namespace lite
lite/tests/kernels/interp_compute_test.cc
new file (0 → 100644)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <string>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
#include "lite/core/tensor.h"
#include "lite/tests/utils/fill_data.h"
namespace paddle {
namespace lite {

template <typename dtype>
void ResizeNearestAlign(const lite::Tensor* x,
                        lite::Tensor* out,
                        bool with_align) {
  auto x_dims = x->dims();
  int num = x_dims[0];
  int channels = x_dims[1];
  int hin = x_dims[2];
  int win = x_dims[3];
  int hout = out->dims()[2];
  int wout = out->dims()[3];
  dtype scale_w = (with_align) ? (static_cast<float>(win - 1) / (wout - 1))
                               : (static_cast<float>(win) / (wout));
  dtype scale_h = (with_align) ? (static_cast<float>(hin - 1) / (hout - 1))
                               : (static_cast<float>(hin) / (hout));
  const dtype* src = x->data<dtype>();
  dtype* dst = out->mutable_data<dtype>();
  int dst_stride_w = 1;
  int dst_stride_h = wout;
  int dst_stride_c = wout * hout;
  int dst_stride_batch = wout * hout * channels;
  int src_stride_w = 1;
  int src_stride_h = win;
  int src_stride_c = win * hin;
  int src_stride_batch = win * hin * channels;
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < channels; ++c) {
      int src_index = n * src_stride_batch + c * src_stride_c;
      for (int h = 0; h < hout; ++h) {
        for (int w = 0; w < wout; ++w) {
          int fw = (with_align) ? static_cast<int>(scale_w * w + 0.5)
                                : static_cast<int>(scale_w * w);
          fw = (fw < 0) ? 0 : fw;
          int fh = (with_align) ? static_cast<int>(scale_h * h + 0.5)
                                : static_cast<int>(scale_h * h);
          fh = (fh < 0) ? 0 : fh;
          int w_start = static_cast<int>(fw);
          int h_start = static_cast<int>(fh);
          int dst_index = n * dst_stride_batch + c * dst_stride_c +
                          h * dst_stride_h + w * dst_stride_w;
          dst[dst_index] =
              src[src_index + w_start * src_stride_w + h_start * src_stride_h];
        }
      }
    }
  }
}

template <typename DType>
void BilinearInterpRef(const lite::Tensor* x,
                       lite::Tensor* out,
                       bool align_corners,
                       int align_mode) {
  auto x_dims = x->dims();
  int batch_size = x_dims[0];
  int channel_size = x_dims[1];
  auto x_h = x_dims[2];
  auto x_w = x_dims[3];
  CHECK_EQ(x_dims.size(), 4);
  auto out_dims = out->dims();
  int out_h = out_dims[2];
  int out_w = out_dims[3];

  // copy from x if no change
  if (x_h == out_h && x_w == out_w) {
    out->CopyDataFrom(*x);
    return;
  }

  float ratio_h = 0.f;
  float ratio_w = 0.f;
  if (out_h > 1) {
    ratio_h = (align_corners) ? static_cast<float>(x_h - 1) / (out_h - 1)
                              : static_cast<float>(x_h) / out_h;
  }
  if (out_w > 1) {
    ratio_w = (align_corners) ? static_cast<float>(x_w - 1) / (out_w - 1)
                              : static_cast<float>(x_w) / out_w;
  }

  // naive bilinear interpolation
  auto x_data = x->data<DType>();
  auto out_data = out->mutable_data<DType>();
  bool align_flag = (align_mode == 0 && !align_corners);

  std::vector<int> vy_n, vy_s;
  std::vector<float> vd_n, vd_s;
  vy_n.reserve(out_h);
  vy_s.reserve(out_h);
  vd_n.reserve(out_h);
  vd_s.reserve(out_h);
  for (int k = 0; k < out_h; k++) {
    int yn = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
                        : static_cast<int>(ratio_h * k);
    yn = (yn > 0) ? yn : 0;
    int ys = (yn + 1) < (x_h - 1) ? (yn + 1) : (x_h - 1);
    float idx_src_y = ratio_h * (k + 0.5) - 0.5;
    idx_src_y = (idx_src_y > 0) ? idx_src_y : 0;
    float dn = align_flag ? idx_src_y - yn : ratio_h * k - yn;
    float ds = 1.f - dn;
    {
      vy_n[k] = yn;
      vy_s[k] = ys;
      vd_n[k] = dn;
      vd_s[k] = ds;
    }
  }

  std::vector<int> vx_w, vx_e;
  std::vector<float> vd_w, vd_e;
  vx_w.reserve(out_w);
  vx_e.reserve(out_w);
  vd_w.reserve(out_w);
  vd_e.reserve(out_w);
  for (int l = 0; l < out_w; l++) {
    int xw = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
                        : static_cast<int>(ratio_w * l);
    xw = (xw > 0) ? xw : 0;
    int xe = (xw + 1) < (x_w - 1) ? (xw + 1) : (x_w - 1);
    float idx_src_x = ratio_w * (l + 0.5) - 0.5;
    idx_src_x = (idx_src_x > 0) ? idx_src_x : 0;
    float dw = align_flag ? idx_src_x - xw : ratio_w * l - xw;
    float de = 1.f - dw;
    {
      vx_w[l] = xw;
      vx_e[l] = xe;
      vd_w[l] = dw;
      vd_e[l] = de;
    }
  }

  std::vector<int64_t> x_strides(x_dims.size(), 1);
  for (int idx = x_strides.size() - 2; idx >= 0; idx--) {
    x_strides[idx] = x_strides[idx + 1] * x_dims[idx + 1];
  }
  for (int i = 0; i < batch_size; i++) {
    for (int j = 0; j < channel_size; j++) {
      for (int k = 0; k < out_h; k++) {
        for (int l = 0; l < out_w; l++) {
          DType x0 = x_data[i * x_strides[0] + j * x_strides[1] +
                            vy_n[k] * x_strides[2] + vx_w[l] * x_strides[3]];
          DType x1 = x_data[i * x_strides[0] + j * x_strides[1] +
                            vy_s[k] * x_strides[2] + vx_w[l] * x_strides[3]];
          DType x2 = x_data[i * x_strides[0] + j * x_strides[1] +
                            vy_n[k] * x_strides[2] + vx_e[l] * x_strides[3]];
          DType x3 = x_data[i * x_strides[0] + j * x_strides[1] +
                            vy_s[k] * x_strides[2] + vx_e[l] * x_strides[3]];
          *out_data = x0 * vd_s[k] * vd_e[l] + x1 * vd_n[k] * vd_e[l] +
                      x2 * vd_s[k] * vd_w[l] + x3 * vd_n[k] * vd_w[l];
          out_data++;
        }
      }
    }
  }
}

class NearestInterpComputeTester : public arena::TestCase {
 protected:
  // common attributes for this op.
  std::string x_ = "X";
  std::string sizetensor0_ = "SizeTensor0";
  std::string sizetensor1_ = "SizeTensor1";
  std::string input_scale_ = "Scale";
  std::string outsize_ = "OutSize";
  std::string out_ = "Out";
  DDim dims_{{1, 2, 3, 4}};
  std::string interp_method_ = "nearest";
  float scale_ = -1.f;
  int out_h_ = -1;
  int out_w_ = -1;
  bool align_corners_ = true;
  int align_mode_ = 1;
  bool use_sizetensor_ = false;
  bool use_input_scale_ = false;
  bool use_outsize_ = false;

 public:
  NearestInterpComputeTester(const Place& place,
                             const std::string& alias,
                             DDim dims,
                             std::string interp_method = "nearest",
                             float scale = -1.f,
                             int out_h = -1,
                             int out_w = -1,
                             bool align_corners = true,
                             int align_mode = 1,
                             bool use_sizetensor = false,
                             bool use_input_scale = false,
                             bool use_outsize = false)
      : TestCase(place, alias),
        dims_(dims),
        interp_method_(interp_method),
        scale_(scale),
        out_h_(out_h),
        out_w_(out_w),
        align_corners_(align_corners),
        align_mode_(align_mode),
        use_sizetensor_(use_sizetensor),
        use_input_scale_(use_input_scale),
        use_outsize_(use_outsize) {}

  void RunBaseline(Scope* scope) override {
    int out_h = out_h_;
    int out_w = out_w_;
    if (scale_ > 0) {
      out_h = dims_[2] * scale_;
      out_w = dims_[3] * scale_;
    }
    auto input = scope->FindTensor(x_);
    auto output = scope->NewTensor(out_);
    std::vector<int64_t> out_shape{dims_[0], dims_[1], out_h, out_w};
    output->Resize(out_shape);
    if (interp_method_ == "nearest") {
      ResizeNearestAlign<float>(input, output, align_corners_);
    } else if (interp_method_ == "bilinear") {
      BilinearInterpRef<float>(input, output, align_corners_, align_mode_);
    }
  }

  void PrepareOpDesc(cpp::OpDesc* op_desc) {
    if (interp_method_ == "nearest") {
      op_desc->SetType("nearest_interp");
    } else if (interp_method_ == "bilinear") {
      op_desc->SetType("bilinear_interp");
    } else {
      LOG(FATAL) << "unsupport";
    }
    op_desc->SetInput("X", {x_});
    if (use_sizetensor_) {
      op_desc->SetInput("SizeTensor", {sizetensor0_, sizetensor1_});
    }
    if (use_input_scale_) {
      op_desc->SetInput("Scale", {input_scale_});
    }
    if (use_outsize_) {
      op_desc->SetInput("OutSize", {outsize_});
    }
    op_desc->SetOutput("Out", {out_});
    op_desc->SetAttr("scale", scale_);
    op_desc->SetAttr("out_h", out_h_);
    op_desc->SetAttr("out_w", out_w_);
    op_desc->SetAttr("align_corners", align_corners_);
    op_desc->SetAttr("align_mode", align_mode_);
    op_desc->SetAttr("interp_method", interp_method_);
  }

  void PrepareData() override {
    std::vector<float> din(dims_.production());
    fill_data_rand(din.data(), -1.f, 1.f, dims_.production());
    SetCommonTensor(x_, dims_, din.data());
    if (use_sizetensor_) {
      DDim sizetensor_dims(std::vector<int64_t>{1});
      std::vector<int> dsizetensor0{out_h_};
      std::vector<int> dsizetensor1{out_w_};
      SetCommonTensor(
          sizetensor0_, sizetensor_dims, dsizetensor0.data(), {}, true);
      SetCommonTensor(
          sizetensor1_, sizetensor_dims, dsizetensor1.data(), {}, true);
    }
    if (use_input_scale_) {
      DDim input_scale_dims(std::vector<int64_t>{1});
      std::vector<float> dinput_scale{scale_};
      SetCommonTensor(
          input_scale_, input_scale_dims, dinput_scale.data(), {}, true);
    }
    if (use_outsize_) {
      DDim outsize_dims(std::vector<int64_t>{2});
      std::vector<int> doutsize{out_h_, out_w_};
      SetCommonTensor(outsize_, outsize_dims, doutsize.data(), {}, true);
    }
  }
};

void TestInterpOuthw(Place place, float abs_error = 2e-5) {
  for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
    for (auto interp_method : std::vector<std::string>{"nearest", "bilinear"}) {
      for (int out_h : {6, 8, 12}) {
        for (int out_w : {6, 9, 12}) {
          std::unique_ptr<arena::TestCase> tester(
              new NearestInterpComputeTester(place,
                                             "def",
                                             DDim(x_dims),
                                             interp_method,
                                             -1.f,
                                             out_h,
                                             out_w));
          arena::Arena arena(std::move(tester), place, abs_error);
          arena.TestPrecision();
        }
      }
    }
  }
}

void TestInterpScale(Place place, float abs_error = 2e-5) {
  for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
    for (auto interp_method : std::vector<std::string>{"nearest", "bilinear"}) {
      for (float scale : {0.3f, 1.f, 1.7f}) {
        std::unique_ptr<arena::TestCase> tester(new NearestInterpComputeTester(
            place, "def", DDim(x_dims), interp_method, scale));
        arena::Arena arena(std::move(tester), place, abs_error);
        arena.TestPrecision();
      }
    }
  }
}

void TestInterpSizetensor(Place place, float abs_error = 2e-5) {
  for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
    for (auto interp_method : std::vector<std::string>{"nearest", "bilinear"}) {
      std::unique_ptr<arena::TestCase> tester(
          new NearestInterpComputeTester(place,
                                         "def",
                                         DDim(x_dims),
                                         interp_method,
                                         -1.f,
                                         10,
                                         12,
                                         true,
                                         1,
                                         true,
                                         false,
                                         false));
      arena::Arena arena(std::move(tester), place, abs_error);
      arena.TestPrecision();
    }
  }
}

void TestInterpInputScale(Place place, float abs_error = 2e-5) {
  for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
    for (auto interp_method : std::vector<std::string>{"nearest", "bilinear"}) {
      std::unique_ptr<arena::TestCase> tester(
          new NearestInterpComputeTester(place,
                                         "def",
                                         DDim(x_dims),
                                         interp_method,
                                         0.7,
                                         -1,
                                         -1,
                                         true,
                                         1,
                                         false,
                                         true,
                                         false));
      arena::Arena arena(std::move(tester), place, abs_error);
      arena.TestPrecision();
    }
  }
}

void TestInterpOutsize(Place place, float abs_error = 2e-5) {
  for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
    for (auto interp_method : std::vector<std::string>{"nearest", "bilinear"}) {
      std::unique_ptr<arena::TestCase> tester(
          new NearestInterpComputeTester(place,
                                         "def",
                                         DDim(x_dims),
                                         interp_method,
                                         -1,
                                         4,
                                         4,
                                         true,
                                         1,
                                         false,
                                         false,
                                         true));
      arena::Arena arena(std::move(tester), place, abs_error);
      arena.TestPrecision();
    }
  }
}

void TestInterpAlignCorners(Place place, float abs_error = 2e-5) {
  for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
    for (bool align_corners : {true, false}) {
      std::unique_ptr<arena::TestCase> tester(new NearestInterpComputeTester(
          place, "def", DDim(x_dims), "nearest", 0.4, -1, -1, align_corners));
      arena::Arena arena(std::move(tester), place, abs_error);
      arena.TestPrecision();
    }
  }
}

void TestInterpAlignMode(Place place, float abs_error = 2e-5) {
  for (auto x_dims : std::vector<std::vector<int64_t>>{{3, 4, 8, 9}}) {
    for (bool align_corners : {true, false}) {
      for (int align_mode : {0, 1}) {
        // may exist bug in arm kernel
        if (place == TARGET(kARM) && align_mode == 1 && !align_corners) {
          continue;
        }
        std::unique_ptr<arena::TestCase> tester(
            new NearestInterpComputeTester(place,
                                           "def",
                                           DDim(x_dims),
                                           "bilinear",
                                           0.7,
                                           -1,
                                           -1,
                                           align_corners,
                                           align_mode));
        arena::Arena arena(std::move(tester), place, abs_error);
        arena.TestPrecision();
      }
    }
  }
}

TEST(Interp, precision) {
  Place place;
  float abs_error = 2e-5;
#if defined(LITE_WITH_NPU)
  place = TARGET(kNPU);
  abs_error = 1e-2;  // use fp16 in npu
#elif defined(LITE_WITH_ARM)
  place = TARGET(kARM);
#else
  return;
#endif

  TestInterpOuthw(place, abs_error);
  TestInterpScale(place, abs_error);
  TestInterpSizetensor(place, abs_error);
  TestInterpInputScale(place, abs_error);
  TestInterpOutsize(place, abs_error);
  TestInterpAlignCorners(place, abs_error);
  TestInterpAlignMode(place, abs_error);
}

}  // namespace lite
}  // namespace paddle
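TEST(Interp, precision) drives every helper above with a shared place/abs_error pair. For a single hand-picked configuration one might construct the tester directly; a sketch, assuming this file's includes and namespaces (the wrapper name RunOneInterpCase is made up):

#ifdef LITE_WITH_ARM
// One bilinear case, 3x4x8x9 -> 6x9 via out_h/out_w, align_corners = true.
void RunOneInterpCase() {
  Place place = TARGET(kARM);
  std::unique_ptr<arena::TestCase> tester(new NearestInterpComputeTester(
      place, "def", DDim(std::vector<int64_t>{3, 4, 8, 9}), "bilinear",
      -1.f /*scale*/, 6 /*out_h*/, 9 /*out_w*/));
  arena::Arena arena(std::move(tester), place, 2e-5);
  arena.TestPrecision();
}
#endif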
lite/tests/kernels/nearest_interp_compute_test.cc
deleted (100644 → 0)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <string>
#include "lite/api/paddle_use_kernels.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/core/arena/framework.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {

inline std::vector<int> get_new_shape(
    const std::vector<const lite::Tensor*>& list_new_shape_tensor) {
  // get tensor from
  std::vector<int> vec_new_shape;
  for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) {
    auto tensor = list_new_shape_tensor[i];
    vec_new_shape.push_back(static_cast<int32_t>(*tensor->data<int32_t>()));
  }
  return vec_new_shape;
}

template <typename T>
inline std::vector<T> get_new_data_from_tensor(const Tensor* new_data_tensor) {
  std::vector<T> vec_new_data;
  auto* new_data = new_data_tensor->data<T>();
  lite::Tensor cpu_starts_tensor;
  vec_new_data =
      std::vector<T>(new_data, new_data + new_data_tensor->dims().production());
  return vec_new_data;
}

template <typename dtype>
void resize_nearest_align(std::vector<const lite::Tensor*> inputs,
                          lite::Tensor* output,
                          bool with_align) {
  int hin = inputs[0]->dims()[2];
  int win = inputs[0]->dims()[3];
  int channels = inputs[0]->dims()[1];
  int num = inputs[0]->dims()[0];
  int hout = output->dims()[2];
  int wout = output->dims()[3];
  dtype scale_w = (with_align) ? (static_cast<float>(win - 1) / (wout - 1))
                               : (static_cast<float>(win) / (wout));
  dtype scale_h = (with_align) ? (static_cast<float>(hin - 1) / (hout - 1))
                               : (static_cast<float>(hin) / (hout));
  const dtype* src = inputs[0]->data<dtype>();
  dtype* dst = output->mutable_data<dtype>();
  int dst_stride_w = 1;
  int dst_stride_h = wout;
  int dst_stride_c = wout * hout;
  int dst_stride_batch = wout * hout * channels;
  int src_stride_w = 1;
  int src_stride_h = win;
  int src_stride_c = win * hin;
  int src_stride_batch = win * hin * channels;
  for (int n = 0; n < num; ++n) {
    for (int c = 0; c < channels; ++c) {
      int src_index = n * src_stride_batch + c * src_stride_c;
      for (int h = 0; h < hout; ++h) {
        for (int w = 0; w < wout; ++w) {
          int fw = (with_align) ? static_cast<int>(scale_w * w + 0.5)
                                : static_cast<int>(scale_w * w);
          fw = (fw < 0) ? 0 : fw;
          int fh = (with_align) ? static_cast<int>(scale_h * h + 0.5)
                                : static_cast<int>(scale_h * h);
          fh = (fh < 0) ? 0 : fh;
          int w_start = static_cast<int>(fw);
          int h_start = static_cast<int>(fh);
          int dst_index = n * dst_stride_batch + c * dst_stride_c +
                          h * dst_stride_h + w * dst_stride_w;
          dst[dst_index] =
              src[src_index + w_start * src_stride_w + h_start * src_stride_h];
        }
      }
    }
  }
}

class NearestInterpComputeTester : public arena::TestCase {
 protected:
  // common attributes for this op.
  std::string input0_ = "X";
  std::string sizetensor0_ = "SizeTensor0";
  std::string sizetensor1_ = "SizeTensor1";
  std::string input_scale_ = "Scale";
  std::string input1_ = "OutSize";
  std::string output_ = "Out";
  float height_scale_ = 0.0f;
  float width_scale_ = 0.0f;
  int out_height_ = -1;
  int out_width_ = -1;
  bool align_corners_ = true;
  std::string interp_method_ = "Nearest";
  DDim dims_{{2, 3}};
  DDim _dims0_{{2, 3, 3, 2}};
  DDim _dims1_{{2}};
  DDim sizetensor_dims_{{1}};
  DDim scale_dims_{{1}};

 public:
  NearestInterpComputeTester(const Place& place,
                             const std::string& alias,
                             float height_scale,
                             float width_scale,
                             int out_height,
                             int out_width,
                             bool align_corners,
                             std::string interp_method)
      : TestCase(place, alias),
        height_scale_(height_scale),
        width_scale_(width_scale),
        out_height_(out_height),
        out_width_(out_width),
        align_corners_(align_corners),
        interp_method_(interp_method) {}

  void RunBaseline(Scope* scope) override {
    width_scale_ = height_scale_;
    auto* outputs = scope->NewTensor(output_);
    CHECK(outputs);
    outputs->Resize(dims_);
    std::vector<const lite::Tensor*> inputs;
    inputs.emplace_back(scope->FindTensor(input0_));
    inputs.emplace_back(scope->FindTensor(input1_));
    std::vector<const lite::Tensor*> SizeTensor(2);
    SizeTensor[0] = scope->FindTensor(sizetensor0_);
    SizeTensor[1] = scope->FindTensor(sizetensor1_);
    const lite::Tensor* input_scale = scope->FindTensor(input_scale_);
    float scale = height_scale_;
    int in_h = inputs[0]->dims()[2];
    int in_w = inputs[0]->dims()[3];
    if (SizeTensor.size() > 0) {
      auto new_size = get_new_shape(SizeTensor);
      out_height_ = new_size[0];
      out_width_ = new_size[1];
    } else {
      auto scale_tensor = input_scale;
      if (scale_tensor != nullptr) {
        auto scale_data = get_new_data_from_tensor<float>(scale_tensor);
        scale = scale_data[0];
      }
      if (scale > 0) {
        out_height_ = static_cast<int>(in_h * scale);
        out_width_ = static_cast<int>(in_w * scale);
      }
      auto out_size = inputs[1];
      if (out_size != nullptr) {
        auto out_size_data = get_new_data_from_tensor<int>(out_size);
        out_height_ = out_size_data[0];
        out_width_ = out_size_data[1];
      }
    }
    height_scale_ = scale;
    width_scale_ = scale;

    if (out_width_ != -1 && out_height_ != -1) {
      height_scale_ = static_cast<float>(out_height_ / inputs[0]->dims()[2]);
      width_scale_ = static_cast<float>(out_width_ / inputs[0]->dims()[3]);
    }
    int num_cout = inputs[0]->dims()[0];
    int c_cout = inputs[0]->dims()[1];
    outputs->Resize({num_cout, c_cout, out_height_, out_width_});
    resize_nearest_align<float>(inputs, outputs, align_corners_);
  }

  void PrepareOpDesc(cpp::OpDesc* op_desc) {
    op_desc->SetType("nearest_interp");
    op_desc->SetInput("X", {input0_});
    op_desc->SetInput("SizeTensor", {sizetensor0_, sizetensor1_});
    op_desc->SetInput("Scale", {input_scale_});
    op_desc->SetInput("OutSize", {input1_});
    op_desc->SetOutput("Out", {output_});
    op_desc->SetAttr("scale", height_scale_);
    op_desc->SetAttr("out_h", out_height_);
    op_desc->SetAttr("out_w", out_width_);
    op_desc->SetAttr("align_corners", align_corners_);
    op_desc->SetAttr("interp_method", interp_method_);
  }

  void PrepareData() override {
    std::vector<float> data0(_dims0_.production());
    for (int i = 0; i < _dims0_.production(); i++) {
      data0[i] = i * 1.1;
    }
    std::vector<int> data1(_dims1_.production());
    for (int i = 0; i < _dims1_.production(); i++) {
      data1[i] = (i + 1) * 2;
    }
    SetCommonTensor(input0_, _dims0_, data0.data());
    SetCommonTensor(input1_, _dims1_, data1.data());
    std::vector<int> sizetensor_data(1);
    sizetensor_data[0] = out_height_;
    SetCommonTensor(sizetensor0_, sizetensor_dims_, sizetensor_data.data());
    sizetensor_data[0] = out_width_;
    SetCommonTensor(sizetensor1_, sizetensor_dims_, sizetensor_data.data());
    std::vector<float> scale_data(1);
    scale_data[0] = height_scale_;
    SetCommonTensor(input_scale_, scale_dims_, scale_data.data());
  }
};

void test_nearest_interp(Place place) {
  std::string interp_method = "Nearest";
  for (float scale : {0.123, 2., 1.2}) {
    for (int out_height : {2, 1, 6}) {
      for (int out_width : {2, 3, 5}) {
        for (bool align_corners : {true, false}) {
          std::unique_ptr<arena::TestCase> tester(
              new NearestInterpComputeTester(place,
                                             "def",
                                             scale,
                                             scale,
                                             out_height,
                                             out_width,
                                             align_corners,
                                             interp_method));
          arena::Arena arena(std::move(tester), place, 2e-5);
          arena.TestPrecision();
        }
      }
    }
  }
}

TEST(NearestInterp, precision) {
  // #ifdef LITE_WITH_X86
  //   Place place(TARGET(kX86));
  // #endif
#ifdef LITE_WITH_ARM
  Place place(TARGET(kARM));
  test_nearest_interp(place);
#endif
}

}  // namespace lite
}  // namespace paddle