From be2884eb29e52a2277908dd188b4a600d7f22419 Mon Sep 17 00:00:00 2001
From: zhulei <563755780@qq.com>
Date: Thu, 4 Nov 2021 11:52:05 +0800
Subject: [PATCH] [NPU] Add bilinear_interpolate_v2 (#36971)

---
 .../fluid/operators/interpolate_v2_op_npu.cc  | 424 ++++++++++++++++--
 .../fluid/tests/unittests/npu/CMakeLists.txt  |   1 +
 .../npu/test_bilinear_interp_v2_op_npu.py     | 279 ++++++++++++
 3 files changed, 666 insertions(+), 38 deletions(-)
 create mode 100644 python/paddle/fluid/tests/unittests/npu/test_bilinear_interp_v2_op_npu.py

diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc
index b30c7ac810c..24ad6746ced 100644
--- a/paddle/fluid/operators/interpolate_v2_op_npu.cc
+++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc
@@ -20,6 +20,369 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 using DataLayout = framework::DataLayout;
+using DDim = framework::DDim;
+using fp16 = paddle::platform::float16;
+
+template <typename T>
+struct InterpolateFunction {
+ public:
+  explicit InterpolateFunction(const framework::ExecutionContext& ctx)
+      : ctx(ctx) {
+    place = ctx.GetPlace();
+    stream = ctx.template device_context<paddle::platform::NPUDeviceContext>()
+                 .stream();
+    t0.mutable_data<float>({1}, place);
+    t1.mutable_data<float>({1}, place);
+    tn.mutable_data<float>({1}, place);
+    FillNpuTensorWithConstant<float>(&t0, static_cast<float>(0));
+    FillNpuTensorWithConstant<float>(&t1, static_cast<float>(1));
+  }
+  void Arange(int n, Tensor* x) {
+    FillNpuTensorWithConstant<float>(&tn, static_cast<float>(n));
+    const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {});
+    runner.Run(stream);
+  }
+  void ReduceSum(const Tensor* x, Tensor* y, const std::vector<int>& dim,
+                 bool keep_dims = true) {
+    const auto& runner = NpuOpRunner("ReduceSumD", {*x}, {*y},
+                                     {{"axes", dim}, {"keep_dims", keep_dims}});
+    runner.Run(stream);
+  }
+  void Add(const Tensor* x, const Tensor* y, Tensor* z) {
+    const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Adds(const Tensor* x, float scalar, Tensor* y) {
+    const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}});
+    runner.Run(stream);
+  }
+  void Mul(const Tensor* x, const Tensor* y, Tensor* z) {
+    const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Sub(const Tensor* x, const Tensor* y, Tensor* z) {
+    const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Cast(const Tensor* x, Tensor* y) {
+    auto dst_dtype = ConvertToNpuDtype(y->type());
+    const auto& runner = NpuOpRunner(
+        "Cast", {*x}, {*y}, {{"dst_type", static_cast<int>(dst_dtype)}});
+    runner.Run(stream);
+  }
+  void Gather(const Tensor* x, const Tensor* indices, const int axis,
+              Tensor* y) {
+    const auto& runner =
+        NpuOpRunner("GatherV2D", {*x, *indices}, {*y}, {{"axis", axis}});
+    runner.Run(stream);
+  }
+  void GatherGrad(const Tensor* gy, const Tensor* indices, const int axis,
+                  Tensor* gx) {
+    // 1 gy swapaxis: axis & 0
+    int len = (gy->dims()).size();
+    std::vector<int> axis_swap(len);
+    for (int i = 0; i < len; i++) {
+      axis_swap[i] = i;
+    }
+    axis_swap[0] = axis;
+    axis_swap[axis] = 0;
+    auto y_new_shape = gy->dims();
+    auto yt = y_new_shape[axis];
+    y_new_shape[axis] = y_new_shape[0];
+    y_new_shape[0] = yt;
+    Tensor gy_t;
+    gy_t.mutable_data<T>(y_new_shape, place);
+    Transpose(gy, &gy_t, axis_swap);
+    // 2 scatter
+    auto x_new_shape = gx->dims();
+    auto xt = x_new_shape[axis];
+    x_new_shape[axis] = x_new_shape[0];
+    x_new_shape[0] = xt;
+    Tensor gx_zero, gx_t;
+    gx_zero.mutable_data<T>(x_new_shape, place);
+    gx_t.mutable_data<T>(x_new_shape, place);
+    FillNpuTensorWithConstant<T>(&gx_zero, static_cast<T>(0));
+    gx_zero.Resize(x_new_shape);
+    Scatter(&gx_zero, indices, &gy_t, &gx_t);
+    // 3 gx swapaxis: axis, 0
+    Transpose(&gx_t, gx, axis_swap);
+  }
+  void Scatter(const Tensor* x, const Tensor* index, const Tensor* updates,
+               Tensor* y) {
+    const auto& runner =
+        NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*y}, {});
+    runner.Run(stream);
+  }
+  void Transpose(const Tensor* x, Tensor* y, const std::vector<int>& axis) {
+    const auto& runner =
+        NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}});
+    runner.Run(stream);
+  }
+  void Muls(const Tensor* x, float scalar, Tensor* y) {
+    const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}});
+    runner.Run(stream);
+  }
+  void Maximum(const Tensor* x, const Tensor* y, Tensor* z) {
+    const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Minimum(const Tensor* x, const Tensor* y, Tensor* z) {
+    const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Floor(const Tensor* x, Tensor* y) {
+    const auto& runner = NpuOpRunner("Floor", {*x}, {*y}, {});
+    runner.Run(stream);
+  }
+
+ private:
+  platform::Place place;
+  aclrtStream stream;
+  const framework::ExecutionContext& ctx;
+  Tensor t0;
+  Tensor t1;
+  Tensor tn;
+};
+
+template <>
+void InterpolateFunction<fp16>::Arange(int n, Tensor* x) {
+  Tensor x_fp32(framework::proto::VarType::FP32);
+  x_fp32.mutable_data<float>(x->dims(), place);
+  FillNpuTensorWithConstant<float>(&tn, static_cast<float>(n));
+  const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {});
+  runner.Run(stream);
+  Cast(&x_fp32, x);
+}
+
+void InterpolateParamCompute(const float scale_h, const float scale_w,
+                             const bool align_corners, const int align_mode,
+                             const DataLayout& data_layout, const DDim& indim,
+                             const DDim& outdim, int* axis_h, int* axis_w,
+                             int* in_h, int* in_w, int* out_h, int* out_w,
+                             float* ratio_h, float* ratio_w) {
+  if (data_layout == DataLayout::kNCHW) {
+    *axis_h = 2;
+    *axis_w = 3;
+  } else {
+    *axis_h = 1;
+    *axis_w = 2;
+  }
+  *out_h = outdim[*axis_h];
+  *out_w = outdim[*axis_w];
+  *in_h = indim[*axis_h];
+  *in_w = indim[*axis_w];
+  *ratio_h = 0.0f;
+  *ratio_w = 0.0f;
+  if (*out_h > 1) {
+    *ratio_h =
+        align_corners
+            ? static_cast<float>(*in_h - 1) / (*out_h - 1)
+            : (scale_h > 0 ? 1 / scale_h : static_cast<float>(*in_h) / *out_h);
+  }
+  if (*out_w > 1) {
+    *ratio_w =
+        align_corners
+            ? static_cast<float>(*in_w - 1) / (*out_w - 1)
+            : (scale_w > 0 ? 1 / scale_w : static_cast<float>(*in_w) / *out_w);
+  }
+}
+
+template <typename T>
+void BilinearParamTensorCompute(const framework::ExecutionContext& ctx,
+                                const DataLayout& data_layout, int in_h,
+                                int in_w, int out_h, int out_w, bool align_cond,
+                                float ratio_h, float ratio_w, Tensor* h0,
+                                Tensor* h1, Tensor* w0, Tensor* w1,
+                                Tensor* coef_h0, Tensor* coef_h1,
+                                Tensor* coef_w0, Tensor* coef_w1) {
+  InterpolateFunction<T> F(ctx);
+  auto place = ctx.GetPlace();
+  Tensor _h0, _w0;
+  _h0.mutable_data<T>({out_h}, place);
+  _w0.mutable_data<T>({out_w}, place);
+  F.Arange(out_h, &_h0);
+  F.Arange(out_w, &_w0);
+  if (align_cond) {
+    F.Adds(&_h0, static_cast<float>(0.5), &_h0);
+    F.Adds(&_w0, static_cast<float>(0.5), &_w0);
+    F.Muls(&_h0, ratio_h, &_h0);
+    F.Muls(&_w0, ratio_w, &_w0);
+    F.Adds(&_h0, static_cast<float>(-0.5), &_h0);
+    F.Adds(&_w0, static_cast<float>(-0.5), &_w0);
+  } else {
+    F.Muls(&_h0, ratio_h, &_h0);
+    F.Muls(&_w0, ratio_w, &_w0);
+  }
+
+  Tensor zero_t;
+  Tensor one_t;
+  zero_t.mutable_data<T>({1}, place);
+  one_t.mutable_data<T>({1}, place);
+  FillNpuTensorWithConstant<T>(&zero_t, static_cast<T>(0));
+  FillNpuTensorWithConstant<T>(&one_t, static_cast<T>(1));
+  F.Maximum(&_h0, &zero_t, &_h0);
+  F.Maximum(&_w0, &zero_t, &_w0);
+
+  Tensor _h0_floor, _w0_floor;
+  _h0_floor.mutable_data<T>({out_h}, place);
+  _w0_floor.mutable_data<T>({out_w}, place);
+  F.Floor(&_h0, &_h0_floor);
+  F.Floor(&_w0, &_w0_floor);
+  F.Cast(&_h0_floor, h0);
+  F.Cast(&_w0_floor, w0);
+
+  Tensor one_int;
+  one_int.mutable_data<int>({1}, place);
+  FillNpuTensorWithConstant<int>(&one_int, static_cast<int>(1));
+  F.Add(h0, &one_int, h1);
+  F.Add(w0, &one_int, w1);
+  Tensor t_max_h, t_max_w;
+  t_max_h.mutable_data<int>({1}, place);
+  t_max_w.mutable_data<int>({1}, place);
+  FillNpuTensorWithConstant<int>(&t_max_h, static_cast<int>(in_h - 1));
+  FillNpuTensorWithConstant<int>(&t_max_w, static_cast<int>(in_w - 1));
+  F.Minimum(h1, &t_max_h, h1);
+  F.Minimum(w1, &t_max_w, w1);
+
+  F.Sub(&_h0, &_h0_floor, coef_h1);
+  F.Sub(&_w0, &_w0_floor, coef_w1);
+  F.Sub(&one_t, coef_h1, coef_h0);
+  F.Sub(&one_t, coef_w1, coef_w0);
+
+  if (data_layout == DataLayout::kNCHW) {
+    coef_h0->Resize({out_h, 1});
+    coef_h1->Resize({out_h, 1});
+  } else {
+    coef_h0->Resize({out_h, 1, 1});
+    coef_h1->Resize({out_h, 1, 1});
+    coef_w0->Resize({out_w, 1});
+    coef_w1->Resize({out_w, 1});
+  }
+}
+
+template <typename T>
+void BilinearFwdNpu(const framework::ExecutionContext& ctx,
+                    const Tensor* input, Tensor* output, const float scale_h,
+                    const float scale_w, const bool align_corners,
+                    const int align_mode, const DataLayout& data_layout) {
+  InterpolateFunction<T> F(ctx);
+  auto place = ctx.GetPlace();
+  auto outdim = output->dims();
+  auto indim = input->dims();
+
+  int axis_h, axis_w;
+  int out_h, out_w, in_h, in_w;
+  float ratio_h, ratio_w;
+  InterpolateParamCompute(scale_h, scale_w, align_corners, align_mode,
+                          data_layout, indim, outdim, &axis_h, &axis_w, &in_h,
+                          &in_w, &out_h, &out_w, &ratio_h, &ratio_w);
+
+  Tensor h0, h1, w0, w1;
+  h0.mutable_data<int>({out_h}, place);
+  h1.mutable_data<int>({out_h}, place);
+  w0.mutable_data<int>({out_w}, place);
+  w1.mutable_data<int>({out_w}, place);
+  Tensor coef_h0, coef_h1, coef_w0, coef_w1;
+  coef_h0.mutable_data<T>({out_h}, place);
+  coef_h1.mutable_data<T>({out_h}, place);
+  coef_w0.mutable_data<T>({out_w}, place);
+  coef_w1.mutable_data<T>({out_w}, place);
+  bool align_cond = align_mode == 0 && !align_corners;
+  BilinearParamTensorCompute<T>(ctx, data_layout, in_h, in_w, out_h, out_w,
+                                align_cond, ratio_h, ratio_w, &h0, &h1, &w0,
+                                &w1, &coef_h0, &coef_h1, &coef_w0, &coef_w1);
+
+  Tensor input_gather_h0, input_gather_h1;
+  auto dim_gather_h = indim;
+  dim_gather_h[axis_h] = out_h;
+  input_gather_h0.mutable_data<T>(dim_gather_h, place);
+  input_gather_h1.mutable_data<T>(dim_gather_h, place);
+
+  F.Gather(input, &h0, axis_h, &input_gather_h0);
+  F.Gather(input, &h1, axis_h, &input_gather_h1);
+
+  F.Mul(&input_gather_h0, &coef_h0, &input_gather_h0);
+  F.Mul(&input_gather_h1, &coef_h1, &input_gather_h1);
+  Tensor out_x4;
+  out_x4.mutable_data<T>({4, outdim[0], outdim[1], outdim[2], outdim[3]},
+                         place);
+  Tensor input_gather_h0_w0 = out_x4.Slice(0, 1);
+  Tensor input_gather_h0_w1 = out_x4.Slice(1, 2);
+  Tensor input_gather_h1_w0 = out_x4.Slice(2, 3);
+  Tensor input_gather_h1_w1 = out_x4.Slice(3, 4);
+  F.Gather(&input_gather_h0, &w0, axis_w, &input_gather_h0_w0);
+  F.Gather(&input_gather_h0, &w1, axis_w, &input_gather_h0_w1);
+  F.Gather(&input_gather_h1, &w0, axis_w, &input_gather_h1_w0);
+  F.Gather(&input_gather_h1, &w1, axis_w, &input_gather_h1_w1);
+  F.Mul(&input_gather_h0_w0, &coef_w0, &input_gather_h0_w0);
+  F.Mul(&input_gather_h0_w1, &coef_w1, &input_gather_h0_w1);
+  F.Mul(&input_gather_h1_w0, &coef_w0, &input_gather_h1_w0);
+  F.Mul(&input_gather_h1_w1, &coef_w1, &input_gather_h1_w1);
+  F.ReduceSum(&out_x4, output, std::vector<int>{0}, false);
+}
+
+template <typename T>
+void BilinearBwdNpu(const framework::ExecutionContext& ctx,
+                    const Tensor* gout, Tensor* gin, const float scale_h,
+                    const float scale_w, const bool align_corners,
+                    const int align_mode, const DataLayout& data_layout) {
+  InterpolateFunction<T> F(ctx);
+  auto place = ctx.GetPlace();
+  auto outdim = gout->dims();
+  auto indim = gin->dims();
+
+  int axis_h, axis_w;
+  int out_h, out_w, in_h, in_w;
+  float ratio_h, ratio_w;
+  InterpolateParamCompute(scale_h, scale_w, align_corners, align_mode,
+                          data_layout, indim, outdim, &axis_h, &axis_w, &in_h,
+                          &in_w, &out_h, &out_w, &ratio_h, &ratio_w);
+
+  Tensor h0, h1, w0, w1;
+  h0.mutable_data<int>({out_h}, place);
+  h1.mutable_data<int>({out_h}, place);
+  w0.mutable_data<int>({out_w}, place);
+  w1.mutable_data<int>({out_w}, place);
+  Tensor coef_h0, coef_h1, coef_w0, coef_w1;
+  coef_h0.mutable_data<T>({out_h}, place);
+  coef_h1.mutable_data<T>({out_h}, place);
+  coef_w0.mutable_data<T>({out_w}, place);
+  coef_w1.mutable_data<T>({out_w}, place);
+  bool align_cond = align_mode == 0 && !align_corners;
+  BilinearParamTensorCompute<T>(ctx, data_layout, in_h, in_w, out_h, out_w,
+                                align_cond, ratio_h, ratio_w, &h0, &h1, &w0,
+                                &w1, &coef_h0, &coef_h1, &coef_w0, &coef_w1);
+
+  Tensor gy_w0, gy_w1;
+  gy_w0.mutable_data<T>(outdim, place);
+  gy_w1.mutable_data<T>(outdim, place);
+  F.Mul(gout, &coef_w0, &gy_w0);
+  F.Mul(gout, &coef_w1, &gy_w1);
+
+  auto dim_gather_h = indim;
+  dim_gather_h[axis_h] = out_h;
+  Tensor g_gather_w0, g_gather_w1;
+  g_gather_w0.mutable_data<T>(dim_gather_h, place);
+  g_gather_w1.mutable_data<T>(dim_gather_h, place);
+  w0.Resize({out_w, 1});
+  w1.Resize({out_w, 1});
+  F.GatherGrad(&gy_w0, &w0, axis_w, &g_gather_w0);
+  F.GatherGrad(&gy_w1, &w1, axis_w, &g_gather_w1);
+
+  F.Add(&g_gather_w0, &g_gather_w1, &g_gather_w0);
+  F.Mul(&g_gather_w0, &coef_h1, &g_gather_w1);
+  F.Mul(&g_gather_w0, &coef_h0, &g_gather_w0);
+
+  Tensor gx_0, gx_1;
+  gx_0.mutable_data<T>(indim, place);
+  gx_1.mutable_data<T>(indim, place);
+  h0.Resize({out_h, 1});
+  h1.Resize({out_h, 1});
+  F.GatherGrad(&g_gather_w0, &h0, axis_h, &gx_0);
+  F.GatherGrad(&g_gather_w1, &h1, axis_h, &gx_1);
+
+  F.Add(&gx_0, &gx_1, gin);
+}
 
 template <typename T>
 class InterpolateV2NPUKernel : public framework::OpKernel<T> {
@@ -39,19 +402,6 @@ class InterpolateV2NPUKernel : public framework::OpKernel<T> {
     int n, c, in_d, in_h, in_w;
     ExtractNCDWH(input_dims, data_layout, &n, &c, &in_d, &in_h, &in_w);
 
-    PADDLE_ENFORCE_EQ(
-        input->layout(), data_layout,
-        platform::errors::InvalidArgument(
-            "Interpolate OP's input tensor layout should equal to attr "
-            "data_layout, but got tensor layout <%s>, attr layout <%s>",
-            framework::DataLayoutToString(input->layout()), data_layout_str));
-    PADDLE_ENFORCE_EQ(
-        output->layout(), data_layout,
-        platform::errors::InvalidArgument(
-            "Interpolate OP's output tensor layout should equal to attr "
-            "data_layout, but got tensor layout <%s>, attr layout <%s>",
-            framework::DataLayoutToString(output->layout()), data_layout_str));
-
     auto interp_method = ctx.Attr<std::string>("interp_method");
     bool align_corners = ctx.Attr<bool>("align_corners");
 
@@ -156,17 +506,22 @@ class InterpolateV2NPUKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    NpuOpRunner runner;
     // To-do(qili93): need to support bilineare, try ResizeD
+    // Add bilinear by zhulei
     if ("nearest" == interp_method) {
+      NpuOpRunner runner;
       runner.SetType("ResizeNearestNeighborV2")
           .AddInput(*input)
           .AddInput(std::vector<int32_t>{out_h, out_w})
          .AddOutput(*output)
          .AddAttr("align_corners", align_corners)
          .AddAttr("half_pixel_centers", false);
+      runner.Run(stream);
+    } else if ("bilinear" == interp_method) {
+      int align_mode = ctx.Attr<int>("align_mode");
+      BilinearFwdNpu<T>(ctx, input, output, scale_h, scale_w, align_corners,
+                        align_mode, data_layout);
     }
-    runner.Run(stream);
   }
 };
 
 template <typename T>
 class InterpolateV2NPUGradKernel : public framework::OpKernel<T> {
@@ -184,27 +539,6 @@ class InterpolateV2NPUGradKernel : public framework::OpKernel<T> {
     int n, c, in_d, in_h, in_w;
     ExtractNCDWH(input->dims(), data_layout, &n, &c, &in_d, &in_h, &in_w);
 
-    PADDLE_ENFORCE_EQ(
-        input->layout(), data_layout,
-        platform::errors::InvalidArgument(
-            "Interpolate OP's input tensor layout should equal to attr "
-            "data_layout, but got tensor layout <%s>, attr layout <%s>",
-            framework::DataLayoutToString(input->layout()), data_layout_str));
-    PADDLE_ENFORCE_EQ(output_grad->layout(), data_layout,
-                      platform::errors::InvalidArgument(
-                          "Interpolate OP's output_grad tensor layout should "
-                          "equal to attr data_layout, but got tensor layout is "
-                          "<%s>, and attr layout is <%s>",
-                          framework::DataLayoutToString(output_grad->layout()),
-                          data_layout_str));
-    PADDLE_ENFORCE_EQ(input_grad->layout(), data_layout,
-                      platform::errors::InvalidArgument(
-                          "Interpolate OP's input_grad tensor layout should "
-                          "equal to attr data_layout, but got tensor layout is "
-                          "<%s>, and attr layout is <%s>",
-                          framework::DataLayoutToString(input_grad->layout()),
-                          data_layout_str));
-
     auto interp_method = ctx.Attr<std::string>("interp_method");
     bool align_corners = ctx.Attr<bool>("align_corners");
 
@@ -301,17 +635,21 @@ class InterpolateV2NPUGradKernel : public framework::OpKernel<T> {
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    NpuOpRunner runner;
     // To-do(qili93): need to support bilineare, try ResizeGradD
     if ("nearest" == interp_method) {
+      NpuOpRunner runner;
       runner.SetType("ResizeNearestNeighborV2Grad")
          .AddInput(*output_grad)
          .AddInput(std::vector<int32_t>{in_h, in_w})
          .AddOutput(*input_grad)
          .AddAttr("align_corners", align_corners)
          .AddAttr("half_pixel_centers", false);
+      runner.Run(stream);
+    } else if ("bilinear" == interp_method) {
+      int align_mode = ctx.Attr<int>("align_mode");
+      BilinearBwdNpu<T>(ctx, output_grad, input_grad, scale_h, scale_w,
+                        align_corners, align_mode, data_layout);
     }
-    runner.Run(stream);
   }
 };
 
@@ -330,3 +668,13 @@ REGISTER_OP_NPU_KERNEL(
     nearest_interp_v2_grad,
     ops::InterpolateV2NPUGradKernel<float>,
     ops::InterpolateV2NPUGradKernel<plat::float16>);
+
+REGISTER_OP_NPU_KERNEL(
+    bilinear_interp_v2,
+    ops::InterpolateV2NPUKernel<float>,
+    ops::InterpolateV2NPUKernel<plat::float16>);
+
+REGISTER_OP_NPU_KERNEL(
+    bilinear_interp_v2_grad,
+    ops::InterpolateV2NPUGradKernel<float>,
+    ops::InterpolateV2NPUGradKernel<plat::float16>);
diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
index 4e81bb9544c..8e31d58195b 100644
--- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt
@@ -17,6 +17,7 @@ if (WITH_ASCEND_CL)
     # Note: the following test cases has running time more than 120s
     set_tests_properties(test_nearest_interp_op_npu PROPERTIES TIMEOUT 200)
     set_tests_properties(test_nearest_interp_v2_op_npu PROPERTIES TIMEOUT 200)
+    set_tests_properties(test_bilinear_interp_v2_op_npu PROPERTIES TIMEOUT 200)
     set_tests_properties(test_stack_op_npu PROPERTIES TIMEOUT 300)
     set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200)
     set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300)
diff --git a/python/paddle/fluid/tests/unittests/npu/test_bilinear_interp_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_bilinear_interp_v2_op_npu.py
new file mode 100644
index 00000000000..6da49b8d84d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_bilinear_interp_v2_op_npu.py
@@ -0,0 +1,279 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from paddle.nn.functional import interpolate
+import paddle
+
+from test_bilinear_interp_v2_op import bilinear_interp_np
+
+paddle.enable_static()
+
+
+class TestBilinearInterpOp(OpTest):
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.place = paddle.NPUPlace(0)
+
+    def setUp(self):
+        self.set_npu()
+        self.out_size = None
+        self.actual_shape = None
+        self.data_layout = 'NCHW'
+        self.init_test_case()
+        self.op_type = "bilinear_interp_v2"
+        input_np = np.random.random(self.input_shape).astype(self.dtype)
+
+        if self.data_layout == "NCHW":
+            in_h = self.input_shape[2]
+            in_w = self.input_shape[3]
+        else:
+            in_h = self.input_shape[1]
+            in_w = self.input_shape[2]
+        scale_h = 0
+        scale_w = 0
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0.:
+                    scale_h = scale_w = float(self.scale)
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                scale_w = scale_h = self.scale[0]
+            elif isinstance(self.scale, list) and len(self.scale) > 1:
+                scale_w = self.scale[1]
+                scale_h = self.scale[0]
+            out_h = int(in_h * scale_h)
+            out_w = int(in_w * scale_w)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = bilinear_interp_np(input_np, out_h, out_w, scale_w, scale_h,
+                                       self.out_size, self.actual_shape,
+                                       self.align_corners, self.align_mode,
+                                       self.data_layout)
+
+        self.inputs = {'X': input_np}
+        if self.out_size is not None:
+            self.inputs['OutSize'] = self.out_size
+        if self.actual_shape is not None:
+            self.inputs['OutSize'] = self.actual_shape
+
+        self.attrs = {
+            'out_h': self.out_h,
+            'out_w': self.out_w,
+            'interp_method': self.interp_method,
+            'align_corners': self.align_corners,
+            'align_mode': self.align_mode,
+            'data_layout': self.data_layout
+        }
+        if self.scale:
+            if isinstance(self.scale, float) or isinstance(self.scale, int):
+                if self.scale > 0.:
+                    self.scale = [self.scale]
+            if isinstance(self.scale, list) and len(self.scale) == 1:
+                self.scale = [self.scale[0], self.scale[0]]
+            self.attrs['scale'] = self.scale
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=self.atol)
+
+    def test_check_grad(self):
+        self.__class__.exist_check_grad = True
+        if self.dtype == 'float16':
+            return
+        self.max_relative_error = 0.005
+        inputs_to_check = ['X']
+        output_names = ['Out']
+        no_grad_set = set()
+        cpu_place = fluid.CPUPlace()
+        cpu_grads = self._get_gradient(inputs_to_check, cpu_place, output_names,
+                                       no_grad_set)
+        npu_grads = self._get_gradient(inputs_to_check, self.place,
+                                       output_names, no_grad_set)
+        self._assert_is_close(cpu_grads, npu_grads, inputs_to_check,
+                              self.max_relative_error,
+                              "Gradient Check between places")
+
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 1.5
+        self.align_corners = False
+        self.align_mode = 1
+        self.dtype = 'float32'
+        self.atol = 1e-5
+
+
+class TestBilinearInterpCaseFP16(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCaseFP16, self).init_test_case()
+        self.dtype = 'float16'
+        self.atol = 1e-2
+
+
+class TestBilinearInterpCase1(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase1, self).init_test_case()
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+
+
+class TestBilinearInterpCase2(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase2, self).init_test_case()
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+
+
+class TestBilinearInterpCase3(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase3, self).init_test_case()
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+
+
+class TestBilinearInterpCase4(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase4, self).init_test_case()
+        self.input_shape = [4, 1, 7, 8]
+        self.out_h = 1
+        self.out_w = 1
+        self.scale = 0.
+        self.out_size = np.array([2, 2]).astype("int32")
+
+
+class TestBilinearInterpCase5(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase5, self).init_test_case()
+        self.input_shape = [3, 3, 9, 6]
+        self.out_h = 12
+        self.out_w = 12
+        self.scale = 0.
+        self.out_size = np.array([11, 11]).astype("int32")
+
+
+class TestBilinearInterpCase6(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase6, self).init_test_case()
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([65, 33]).astype("int32")
+
+
+class TestBilinearInterpCase7(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpCase7, self).init_test_case()
+        self.input_shape = [1, 1, 32, 64]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = [2.0, 0.5]
+
+
+class TestBilinearInterpSame(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpSame, self).init_test_case()
+        self.input_shape = [2, 3, 32, 64]
+        self.out_h = 32
+        self.out_w = 64
+        self.scale = 0.
+
+
+class TestBilinearInterpActualShape(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpActualShape, self).init_test_case()
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 0.
+        self.out_size = np.array([66, 40]).astype("int32")
+
+
+class TestBilinearInterpDataLayout(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpDataLayout, self).init_test_case()
+        self.input_shape = [2, 5, 5, 3]
+        self.out_h = 2
+        self.out_w = 2
+        self.scale = 0.
+        self.out_size = np.array([3, 3]).astype("int32")
+        self.data_layout = "NHWC"
+
+
+class TestBilinearInterpOtherMethod1(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 1
+
+
+class TestBilinearInterpWithMethod2(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = False
+        self.align_mode = 0
+
+
+class TestBilinearInterpWithMethod3(TestBilinearInterpOp):
+    def set_align_mode(self):
+        self.align_corners = True
+        self.align_mode = 0
+
+
+class TestBilinearInterpScale1(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpScale1, self).init_test_case()
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 2.
+
+
+class TestBilinearInterpScale2(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpScale2, self).init_test_case()
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 1.
+
+
+class TestBilinearInterpZero(TestBilinearInterpOp):
+    def init_test_case(self):
+        super(TestBilinearInterpZero, self).init_test_case()
+        self.input_shape = [2, 3, 5, 7]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 0.2
+        self.align_mode = 0
+
+
+if __name__ == "__main__":
+    unittest.main()
-- 
GitLab
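
Reviewer note, not part of the patch: BilinearFwdNpu avoids per-pixel loops by
precomputing integer neighbour indices (h0/h1, w0/w1) and fractional weights
(coef_*), then blending four GatherV2D results with Mul and ReduceSumD. The
NumPy sketch below mirrors that decomposition for the NCHW case. It is a
minimal reference under simplifying assumptions (it uses in/out for the ratio
and omits the kernel's explicit 1/scale branch), and bilinear_resize_ref is a
hypothetical name, not an API introduced by this patch.

import numpy as np

def bilinear_resize_ref(x, out_h, out_w, align_corners=False, align_mode=1):
    # x: float array of shape [N, C, H, W] (NCHW layout).
    n, c, in_h, in_w = x.shape
    ratio_h = ((in_h - 1) / (out_h - 1) if align_corners else in_h / out_h) \
        if out_h > 1 else 0.0
    ratio_w = ((in_w - 1) / (out_w - 1) if align_corners else in_w / out_w) \
        if out_w > 1 else 0.0

    # Source coordinates. align_mode == 0 with align_corners == False uses
    # half-pixel centers; this corresponds to the align_cond branch of
    # BilinearParamTensorCompute.
    j = np.arange(out_h, dtype=np.float64)
    i = np.arange(out_w, dtype=np.float64)
    if align_mode == 0 and not align_corners:
        h = np.maximum(ratio_h * (j + 0.5) - 0.5, 0)
        w = np.maximum(ratio_w * (i + 0.5) - 0.5, 0)
    else:
        h = ratio_h * j
        w = ratio_w * i

    # Integer neighbours (h0/h1, w0/w1) and weights (coef_*), clamped the
    # same way the kernel clamps with Maximum/Minimum.
    h0 = np.floor(h).astype(np.int64)
    w0 = np.floor(w).astype(np.int64)
    h1 = np.minimum(h0 + 1, in_h - 1)
    w1 = np.minimum(w0 + 1, in_w - 1)
    coef_h1 = (h - h0)[None, None, :, None]  # broadcast over N, C, W
    coef_w1 = (w - w0)[None, None, None, :]  # broadcast over N, C, H
    coef_h0 = 1.0 - coef_h1
    coef_w0 = 1.0 - coef_w1

    # Four gathers along H then W, combined by the bilinear weights; this is
    # the Gather/Mul/ReduceSum pattern of BilinearFwdNpu.
    g_h0, g_h1 = x[:, :, h0, :], x[:, :, h1, :]
    return (g_h0[:, :, :, w0] * coef_h0 * coef_w0 +
            g_h0[:, :, :, w1] * coef_h0 * coef_w1 +
            g_h1[:, :, :, w0] * coef_h1 * coef_w0 +
            g_h1[:, :, :, w1] * coef_h1 * coef_w1)

For float32 inputs this should agree with bilinear_interp_np from the unit
test above up to rounding, e.g. bilinear_resize_ref(x, 12, 12) against the
TestBilinearInterpCase2 configuration.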
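
A second note on the backward pass: there is no gradient op used directly for
GatherV2D here, so InterpolateFunction::GatherGrad builds one from TransposeD
plus TensorScatterAdd, swapping the gathered axis to the front, scatter-adding
the incoming gradient into a zero tensor, and swapping back (the callers
resize the index tensors to {n, 1} because TensorScatterAdd consumes an index
column). A rough NumPy equivalent, with gather_grad_ref as a hypothetical
helper name:

import numpy as np

def gather_grad_ref(gy, indices, axis, x_shape):
    # Gradient of np.take(x, indices, axis=axis) w.r.t. x: scatter-add gy
    # back into a zero tensor of x's shape.
    gx = np.zeros(x_shape, dtype=gy.dtype)
    # View with `axis` moved to the front, like the TransposeD step.
    gx_view = np.moveaxis(gx, axis, 0)
    # Duplicate indices must accumulate, hence an adding scatter (np.add.at,
    # playing the role of TensorScatterAdd), not a plain assignment.
    np.add.at(gx_view, indices, np.moveaxis(gy, axis, 0))
    return gx

Duplicate indices occur whenever two output pixels share a source row or
column (any upscale), which is why an accumulating scatter is required for a
correct gradient.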