Unverified · Commit b12c27eb authored by Yuanle Liu, committed by GitHub

interpolate op (forward and grad): support fp16 on GPU (#45061)

Parent: cbf26bb1
......@@ -28,26 +28,28 @@ namespace funcs {
template <typename T>
HOSTDEVICE inline T CubicConvolution1(T x, T A) {
return ((A + 2) * x - (A + 3)) * x * x + 1;
return ((A + static_cast<T>(2)) * x - (A + static_cast<T>(3))) * x * x +
static_cast<T>(1);
}
template <typename T>
HOSTDEVICE inline T CubicConvolution2(T x, T A) {
return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
return ((A * x - static_cast<T>(5) * A) * x + static_cast<T>(8) * A) * x -
static_cast<T>(4) * A;
}
template <typename T>
HOSTDEVICE inline void get_cubic_upsample_coefficients(T coeffs[4], T t) {
T A = -0.75;
T A = static_cast<T>(-0.75);
T x1 = t;
coeffs[0] = CubicConvolution2<T>(x1 + 1.0, A);
coeffs[0] = CubicConvolution2<T>(x1 + static_cast<T>(1.0), A);
coeffs[1] = CubicConvolution1<T>(x1, A);
// opposite coefficients
T x2 = 1.0 - t;
T x2 = static_cast<T>(1.0) - t;
coeffs[2] = CubicConvolution1<T>(x2, A);
coeffs[3] = CubicConvolution2<T>(x2 + 1.0, A);
coeffs[3] = CubicConvolution2<T>(x2 + static_cast<T>(1.0), A);
}
inline void ExtractNCDWH(const DDim& dims,
......
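The `static_cast<T>` rewrites above keep these helpers compiling when `T` is `phi::dtype::float16`, which does not mix freely with `int` and `double` literals. The math is unchanged: these are the two pieces of the Keys cubic convolution kernel with A = -0.75, and for any t in [0, 1] the four sampled weights sum to 1. A minimal sketch in plain `float` (standalone, with a hypothetical `main`) checking that partition-of-unity property:

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

// Keys cubic convolution pieces with A = -0.75, mirroring the templated
// helpers above in plain float for illustration.
float CubicConvolution1(float x, float A) {  // for |x| <= 1
  return ((A + 2) * x - (A + 3)) * x * x + 1;
}
float CubicConvolution2(float x, float A) {  // for 1 < |x| < 2
  return ((A * x - 5 * A) * x + 8 * A) * x - 4 * A;
}

int main() {
  const float A = -0.75f;
  for (float t = 0.f; t <= 1.f; t += 0.125f) {
    float c0 = CubicConvolution2(t + 1.f, A);
    float c1 = CubicConvolution1(t, A);
    float c2 = CubicConvolution1(1.f - t, A);
    float c3 = CubicConvolution2(2.f - t, A);
    // The four bicubic weights always form a partition of unity.
    assert(std::fabs(c0 + c1 + c2 + c3 - 1.f) < 1e-6f);
  }
  std::printf("cubic weights sum to 1 for all t\n");
  return 0;
}
```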
......@@ -18,6 +18,7 @@
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/interpolate_function.h"
......@@ -34,11 +35,12 @@ __forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex(
T* lambda2,
T src_x,
const int in_img_x) {
src_x = (src_x > 0) ? src_x : 0.f;
src_x = (src_x > static_cast<T>(0)) ? src_x : static_cast<T>(0);
*in_img_idx = static_cast<int>(src_x);
*x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0;
*lambda1 = src_x - *in_img_idx;
*lambda2 = 1.f - *lambda1;
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
*lambda1 = static_cast<T>(static_cast<MT>(src_x) - *in_img_idx);
*lambda2 = static_cast<T>(1.0) - *lambda1;
}
template <typename T>
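The new `MT` alias above is the heart of the fp16 change: `phi::dtype::MPTypeTrait<T>::Type` names a wider math type (`float` when `T` is `phi::dtype::float16`, otherwise `T` itself), so the fractional part is computed at full precision and narrowed to `T` only once. A minimal sketch of the pattern, with a hypothetical `Half` standing in for the real fp16 type:

```cpp
#include <cassert>

// Hypothetical storage type standing in for phi::dtype::float16;
// real half-precision conversions are elided for illustration.
struct Half {
  float v;
  explicit Half(float f) : v(f) {}
  explicit operator float() const { return v; }
};

// Low-precision types map to float for intermediate math; everything
// else computes in itself -- the MPTypeTrait idea in miniature.
template <typename T>
struct MPTypeTrait { using Type = T; };
template <>
struct MPTypeTrait<Half> { using Type = float; };

// Widen, subtract the integer index, narrow once at the end --
// the same shape as the lambda1 update above.
template <typename T>
T Lambda1(T src_x, int in_img_idx) {
  using MT = typename MPTypeTrait<T>::Type;
  return static_cast<T>(static_cast<MT>(src_x) - static_cast<MT>(in_img_idx));
}

int main() {
  assert(Lambda1(2.25f, 2) == 0.25f);                          // float path
  assert(static_cast<float>(Lambda1(Half(2.25f), 2)) == 0.25f);  // fp16 path
  return 0;
}
```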
......@@ -50,7 +52,7 @@ __global__ void KeLinearInterpBw(T* in,
const size_t output_h,
const size_t output_w,
const size_t num_channels,
const T ratio_w,
const float ratio_w,
const bool align_corners,
const int align_mode,
const DataLayout data_layout) {
......@@ -77,12 +79,13 @@ __global__ void KeLinearInterpBw(T* in,
: ratio_w * out_img_idx;
in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; // w
int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; // w_id
T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
src_w = (src_w > 0) ? src_w : 0;
T w1lambda =
align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
T w2lambda = 1.f - w1lambda;
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
T src_w = static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
src_w = (src_w > static_cast<T>(0)) ? src_w : static_cast<T>(0);
T w1lambda = align_flag
? static_cast<T>(static_cast<MT>(src_w) - in_img_idx)
: static_cast<T>(ratio_w * out_img_idx - in_img_idx);
T w2lambda = static_cast<T>(1.0) - w1lambda;
T* in_pos;
if (data_layout == DataLayout::kNCHW) {
......@@ -245,7 +248,7 @@ __global__ void KeBilinearInterpBwShareMemory(T* in,
const int num_channels,
float ratio_h,
float ratio_w,
const T align_type_value,
const float align_type_value,
bool is_nchw) {
__shared__ T s_data[2][1024];
int tid = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -267,8 +270,10 @@ __global__ void KeBilinearInterpBwShareMemory(T* in,
int in_img_idx, in_img_idy, w_id, h_id;
T w1lambda, h1lambda, w2lambda, h2lambda;
T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value;
T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value;
T src_w = static_cast<T>(ratio_w * (out_img_idx + align_type_value) -
align_type_value);
T src_h = static_cast<T>(ratio_h * (out_img_idy + align_type_value) -
align_type_value);
PreCalculatorForLinearInterpInputIndex(
&in_img_idx, &w_id, &w1lambda, &w2lambda, src_w, in_w);
......@@ -283,8 +288,8 @@ __global__ void KeBilinearInterpBwShareMemory(T* in,
int bot_right_index = input_index + h_id * in_w + w_id;
int in_top_min_index, in_bot_min_index;
s_data[0][threadIdx.x] = 0.f;
s_data[1][threadIdx.x] = 0.f;
s_data[0][threadIdx.x] = static_cast<T>(0);
s_data[1][threadIdx.x] = static_cast<T>(0);
int remain = nthreads - (tid & (-blockDim.x));
int in_top_max_index =
phi::funcs::blockReduceMax(top_right_index, FINAL_MASK);
......@@ -353,7 +358,7 @@ __global__ void KeBilinearInterpNCHWBw(T* in,
float ratio_h,
float ratio_w,
const T* __restrict__ out,
const T align_type_value) {
const float align_type_value) {
int index = threadIdx.x + blockDim.x * blockIdx.x;
int stride = blockDim.x * gridDim.x;
int num_out = n * num_channels * out_h * out_w;
......@@ -368,13 +373,15 @@ __global__ void KeBilinearInterpNCHWBw(T* in,
int h1, y_id;
T h1lambda, h0lambda;
T src_y = ratio_h * (h2 + align_type_value) - align_type_value;
T src_y =
static_cast<T>(ratio_h * (h2 + align_type_value) - align_type_value);
PreCalculatorForLinearInterpInputIndex(
&h1, &y_id, &h1lambda, &h0lambda, src_y, in_h);
int w1, x_id;
T w1lambda, w0lambda;
T src_x = ratio_w * (w2 + align_type_value) - align_type_value;
T src_x =
static_cast<T>(ratio_w * (w2 + align_type_value) - align_type_value);
PreCalculatorForLinearInterpInputIndex(
&w1, &x_id, &w1lambda, &w0lambda, src_x, in_w);
......@@ -406,7 +413,7 @@ __global__ void KeBilinearInterpBw(T* in,
const int num_channels,
float ratio_h,
float ratio_w,
const T align_type_value,
const float align_type_value,
funcs::FastDivModForInterpolate divmods) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = blockDim.x * gridDim.x;
......@@ -426,8 +433,10 @@ __global__ void KeBilinearInterpBw(T* in,
int in_img_idx, in_img_idy, w_id, h_id;
T w1lambda, h1lambda, w2lambda, h2lambda;
T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value;
T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value;
T src_w = static_cast<T>(ratio_w * (out_img_idx + align_type_value) -
align_type_value);
T src_h = static_cast<T>(ratio_h * (out_img_idy + align_type_value) -
align_type_value);
PreCalculatorForLinearInterpInputIndex(
&in_img_idx, &w_id, &w1lambda, &w2lambda, src_w, in_w);
......@@ -489,14 +498,13 @@ __global__ void KeBicubicInterpBw(T* in,
? static_cast<T>(ratio_h * out_img_idy)
: static_cast<T>(ratio_h * (out_img_idy + 0.5) - 0.5);
int input_y = floorf(in_img_idy);
const T y_t = in_img_idy - input_y;
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
const T y_t = static_cast<T>(static_cast<MT>(in_img_idy) - input_y);
T in_img_idx = align_corners
? static_cast<T>(ratio_w * out_img_idx)
: static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
int input_x = floorf(in_img_idx);
const T x_t = in_img_idx - input_x;
const T x_t = static_cast<T>(static_cast<MT>(in_img_idx) - input_x);
T x_coeffs[4];
T y_coeffs[4];
......@@ -543,9 +551,9 @@ __global__ void KeTrilinearInterpBw(T* in,
const size_t output_h,
const size_t output_w,
const size_t num_channels,
const T ratio_d,
const T ratio_h,
const T ratio_w,
const float ratio_d,
const float ratio_h,
const float ratio_w,
const bool align_corners,
const int align_mode,
const DataLayout data_layout) {
......@@ -578,33 +586,37 @@ __global__ void KeTrilinearInterpBw(T* in,
: static_cast<int>(ratio_d * out_img_idt);
in_img_idt = (in_img_idt > 0) ? in_img_idt : 0;
int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0;
T src_d = ratio_d * (out_img_idt + 0.5) - 0.5;
src_d = (src_d > 0) ? src_d : 0;
T d1lambda =
align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt;
T d2lambda = 1.f - d1lambda;
T src_d = static_cast<T>(ratio_d * (out_img_idt + 0.5) - 0.5);
src_d = (src_d > static_cast<T>(0)) ? src_d : static_cast<T>(0);
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
T d1lambda = align_flag
? static_cast<T>(static_cast<MT>(src_d) - in_img_idt)
: static_cast<T>(ratio_d * out_img_idt - in_img_idt);
T d2lambda = static_cast<T>(1.0) - d1lambda;
int in_img_idy = align_flag
? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
: static_cast<int>(ratio_h * out_img_idy);
in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
src_h = (src_h > 0) ? src_h : 0;
T h1lambda =
align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
T h2lambda = 1.f - h1lambda;
T src_h = static_cast<T>(ratio_h * (out_img_idy + 0.5) - 0.5);
src_h = (src_h > static_cast<T>(0)) ? src_h : static_cast<T>(0);
T h1lambda = align_flag
? static_cast<T>(static_cast<MT>(src_h) - in_img_idy)
: static_cast<T>(ratio_h * out_img_idy - in_img_idy);
T h2lambda = static_cast<T>(1.0) - h1lambda;
int in_img_idx = align_flag
? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
: static_cast<int>(ratio_w * out_img_idx);
in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
src_w = (src_w > 0) ? src_w : 0;
T w1lambda =
align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
T w2lambda = 1.f - w1lambda;
T src_w = static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
src_w = (src_w > static_cast<T>(0)) ? src_w : static_cast<T>(0);
T w1lambda = align_flag
? static_cast<T>(static_cast<MT>(src_w) - in_img_idx)
: static_cast<T>(ratio_w * out_img_idx - in_img_idx);
T w2lambda = static_cast<T>(1.0) - w1lambda;
if (data_layout == DataLayout::kNCHW) {
int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
......@@ -1031,7 +1043,8 @@ static void Interpolate2DCUDABwd(
interp_divmods);
}
} else if ("bilinear" == interp_method) {
const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0;
const float align_type_value =
(align_mode == 0 && !align_corners) ? 0.5f : 0.f;
bool is_nchw = (data_layout == DataLayout::kNCHW) ? true : false;
bool optimize_flag = false;
#ifndef __HIPCC__
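`align_type_value` also changes from `T` to a plain host `float` here, so the half-pixel offset (0.5 when `align_mode == 0` and `align_corners` is false, otherwise 0) enters the coordinate arithmetic at full precision; only the resulting `src_w`/`src_h` are narrowed to `T`. A small sketch of the mapping, assuming the usual `ratio = in_size / out_size` convention:

```cpp
#include <cstdio>

// Output-to-input coordinate mapping used by the kernels above.
// align_type_value == 0.5f selects half-pixel centers; 0.f selects
// plain index scaling. The clamp matches the kernels' (src > 0) guard.
float SrcIndex(int dst, float ratio, float align_type_value) {
  float src = ratio * (dst + align_type_value) - align_type_value;
  return src > 0.f ? src : 0.f;
}

int main() {
  // Upsampling 4 -> 8 along one axis: ratio = 4.f / 8.f = 0.5f.
  for (int dst = 0; dst < 8; ++dst) {
    std::printf("dst=%d  half_pixel=%5.2f  aligned=%5.2f\n", dst,
                SrcIndex(dst, 0.5f, 0.5f), SrcIndex(dst, 0.5f, 0.f));
  }
  return 0;
}
```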
......@@ -1148,7 +1161,7 @@ static void Interpolate3DCUDABwd(
if (scale_tensor) {
auto scale_data =
funcs::get_new_data_from_tensor<float>(scale_tensor.get_ptr());
if (scale_data.size() > 1) {
if (scale_data.size() > 2) {
scale_d = scale_data[0];
scale_h = scale_data[1];
scale_w = scale_data[2];
......@@ -1179,7 +1192,7 @@ static void Interpolate3DCUDABwd(
"should be greater than 0, but received value is %d.",
scale_d));
} else {
if (scale.size() > 1) {
if (scale.size() > 2) {
scale_d = scale[0];
scale_h = scale[1];
scale_w = scale[2];
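The `scale.size() > 1` → `> 2` edits in the 3-D paths fix an off-by-one guard: a two-element scale vector used to fall into the branch that reads `scale[2]`, one past the end. A hedged sketch of the corrected dispatch (the broadcast `else` branch is assumed from the surrounding code):

```cpp
#include <vector>

// Corrected 3-D scale handling: read three entries only when three are
// present; otherwise broadcast the first entry to all dimensions.
void GetScaleDHW(const std::vector<float>& scale,
                 float* scale_d, float* scale_h, float* scale_w) {
  if (scale.size() > 2) {  // was `> 1`, which let scale[2] read out of bounds
    *scale_d = scale[0];
    *scale_h = scale[1];
    *scale_w = scale[2];
  } else if (!scale.empty()) {
    *scale_d = *scale_h = *scale_w = scale[0];
  }
}
```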
......@@ -1574,7 +1587,8 @@ PD_REGISTER_KERNEL(bilinear_interp_grad,
ALL_LAYOUT,
phi::BilinearInterpGradKernel,
float,
double) {
double,
phi::dtype::float16) {
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
}
......@@ -1583,7 +1597,8 @@ PD_REGISTER_KERNEL(nearest_interp_grad,
ALL_LAYOUT,
phi::NearestInterpGradKernel,
float,
double) {
double,
phi::dtype::float16) {
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
}
......@@ -1592,7 +1607,8 @@ PD_REGISTER_KERNEL(trilinear_interp_grad,
ALL_LAYOUT,
phi::TrilinearInterpGradKernel,
float,
double) {
double,
phi::dtype::float16) {
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
}
......@@ -1601,7 +1617,8 @@ PD_REGISTER_KERNEL(linear_interp_grad,
ALL_LAYOUT,
phi::LinearInterpGradKernel,
float,
double) {
double,
phi::dtype::float16) {
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
}
......@@ -1610,7 +1627,8 @@ PD_REGISTER_KERNEL(bicubic_interp_grad,
ALL_LAYOUT,
phi::BicubicInterpGradKernel,
float,
double) {
double,
phi::dtype::float16) {
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
}
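Each `PD_REGISTER_KERNEL` hunk above makes the same one-line change: `phi::dtype::float16` is appended to the kernel's dtype list, which instantiates the grad kernel template for half precision and makes it reachable from runtime dispatch. A toy illustration of the idea (the real macro also records backend and layout; all names below are hypothetical):

```cpp
#include <functional>
#include <iostream>
#include <map>
#include <string>

// Toy dtype-keyed registry; the real PD_REGISTER_KERNEL is far richer
// (backend, layout, argument defs), but the dispatch idea is the same.
std::map<std::string, std::function<void()>>& Registry() {
  static std::map<std::string, std::function<void()>> r;
  return r;
}

struct float16 { unsigned short bits; };  // stand-in for phi::dtype::float16

template <typename T>
void BilinearInterpGradKernel() {  // one instantiation per registered dtype
  std::cout << "ran one kernel instantiation\n";
}

int main() {
  Registry()["float32"] = BilinearInterpGradKernel<float>;
  Registry()["float64"] = BilinearInterpGradKernel<double>;
  Registry()["float16"] = BilinearInterpGradKernel<float16>;  // new in this commit
  Registry().at("float16")();  // runtime dispatch on the tensor's dtype
  return 0;
}
```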
......@@ -19,6 +19,8 @@
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/common/layout.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/interpolate_function.h"
......@@ -34,11 +36,12 @@ __forceinline__ __device__ void PreCalculatorForLinearInterpInputIndex(
T* lambda2,
T src_x,
const int in_img_x) {
src_x = (src_x > 0) ? src_x : 0.f;
src_x = (src_x > static_cast<T>(0)) ? src_x : static_cast<T>(0);
*in_img_idx = static_cast<int>(src_x);
*x_id = (*in_img_idx < in_img_x - 1) ? 1 : 0;
*lambda1 = src_x - *in_img_idx;
*lambda2 = 1.f - *lambda1;
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
*lambda1 = static_cast<T>(static_cast<MT>(src_x) - *in_img_idx);
*lambda2 = static_cast<T>(1.0) - *lambda1;
}
template <typename T>
......@@ -78,12 +81,13 @@ __global__ void KeLinearInterpFw(const T* in,
: static_cast<int>(ratio_w * out_img_idx);
in_img_idx = (in_img_idx > 0) ? in_img_idx : 0; // w
int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0; // w_id
T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
src_w = (src_w > 0) ? src_w : 0;
T w1lambda =
align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
T w2lambda = 1.f - w1lambda;
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
T src_w = static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
src_w = (src_w > static_cast<T>(0)) ? src_w : static_cast<T>(0);
T w1lambda = align_flag
? static_cast<T>(static_cast<MT>(src_w) - in_img_idx)
: static_cast<T>(ratio_w * out_img_idx - in_img_idx);
T w2lambda = static_cast<T>(1.0) - w1lambda;
if (data_layout == DataLayout::kNCHW) {
const T* in_pos =
......@@ -203,7 +207,7 @@ __global__ void KeBilinearInterpFw(const T* in,
const size_t num_channels,
const float ratio_h,
const float ratio_w,
const T align_type_value,
const float align_type_value,
funcs::FastDivModForInterpolate divmods) {
int nthreads = output_h * output_w;
int tid = blockIdx.x * blockDim.x + threadIdx.x;
......@@ -222,8 +226,10 @@ __global__ void KeBilinearInterpFw(const T* in,
int in_img_idx, in_img_idy, h_id, w_id;
T h1lambda, w1lambda, h2lambda, w2lambda;
T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value;
T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value;
T src_w = static_cast<T>(ratio_w * (out_img_idx + align_type_value) -
align_type_value);
T src_h = static_cast<T>(ratio_h * (out_img_idy + align_type_value) -
align_type_value);
PreCalculatorForLinearInterpInputIndex(
&in_img_idx, &w_id, &w1lambda, &w2lambda, src_w, in_img_w);
......@@ -254,7 +260,7 @@ __global__ void KeBilinearInterpNCHWFw(const T* in,
const size_t nc,
const float ratio_h,
const float ratio_w,
const T align_type_value) {
const float align_type_value) {
int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x;
int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y;
int nc_id = threadIdx.z + blockIdx.z * blockDim.z;
......@@ -262,8 +268,10 @@ __global__ void KeBilinearInterpNCHWFw(const T* in,
int in_img_idx, in_img_idy, h_id, w_id;
T h1lambda, w1lambda, h2lambda, w2lambda;
T src_w = ratio_w * (out_img_idx + align_type_value) - align_type_value;
T src_h = ratio_h * (out_img_idy + align_type_value) - align_type_value;
T src_w = static_cast<T>(ratio_w * (out_img_idx + align_type_value) -
align_type_value);
T src_h = static_cast<T>(ratio_h * (out_img_idy + align_type_value) -
align_type_value);
PreCalculatorForLinearInterpInputIndex(
&in_img_idx, &w_id, &w1lambda, &w2lambda, src_w, in_img_w);
......@@ -296,13 +304,13 @@ template <typename T>
__device__ __forceinline__ static T Kecubic_interp(
const T x0, const T x1, const T x2, const T x3, T t) {
T coeffs[4];
T a = -0.75;
T a = static_cast<T>(-0.75);
T x_1 = t;
T x_2 = 1.0 - t;
coeffs[0] = funcs::CubicConvolution2<T>(x_1 + 1.0, a);
T x_2 = static_cast<T>(1.0) - t;
coeffs[0] = funcs::CubicConvolution2<T>(x_1 + static_cast<T>(1.0), a);
coeffs[1] = funcs::CubicConvolution1<T>(x_1, a);
coeffs[2] = funcs::CubicConvolution1<T>(x_2, a);
coeffs[3] = funcs::CubicConvolution2<T>(x_2 + 1.0, a);
coeffs[3] = funcs::CubicConvolution2<T>(x_2 + static_cast<T>(1.0), a);
return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
}
......@@ -348,13 +356,14 @@ __global__ void KeBicubicInterpFw(const T* in,
? static_cast<T>(ratio_h * out_img_idy)
: static_cast<T>(ratio_h * (out_img_idy + 0.5) - 0.5);
int input_y = floorf(in_img_idy);
const T y_t = in_img_idy - input_y;
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
const T y_t = static_cast<T>(static_cast<MT>(in_img_idy) - input_y);
T in_img_idx = align_corners
? static_cast<T>(ratio_w * out_img_idx)
: static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
int input_x = floorf(in_img_idx);
const T x_t = in_img_idx - input_x;
const T x_t = static_cast<T>(static_cast<MT>(in_img_idx) - input_x);
T coefficients[4];
const T* in_pos_0;
......@@ -419,16 +428,15 @@ __global__ void KeBicubicInterpFw(const T* in,
&in[out_id_h * input_w + access_y * in_img_w * num_channels +
access_x_3 * num_channels + channel_id];
coefficients[k] = Kecubic_interp(
coefficients[k] = Kecubic_interp<T>(
in_pos_0[0], in_pos_1[0], in_pos_2[0], in_pos_3[0], x_t);
}
out[out_id_h * output_w + out_id_w] =
static_cast<T>(Kecubic_interp(coefficients[0],
coefficients[1],
coefficients[2],
coefficients[3],
y_t));
out[out_id_h * output_w + out_id_w] = Kecubic_interp<T>(coefficients[0],
coefficients[1],
coefficients[2],
coefficients[3],
y_t);
}
}
}
......@@ -482,33 +490,37 @@ __global__ void KeTrilinearInterpFw(const T* in,
: static_cast<int>(ratio_d * out_img_idt);
in_img_idt = (in_img_idt > 0) ? in_img_idt : 0;
int d_id = (in_img_idt < in_img_d - 1) ? 1 : 0;
T src_d = ratio_d * (out_img_idt + 0.5) - 0.5;
src_d = (src_d > 0) ? src_d : 0;
T d1lambda =
align_flag ? src_d - in_img_idt : ratio_d * out_img_idt - in_img_idt;
T d2lambda = 1.f - d1lambda;
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
T src_d = static_cast<T>(ratio_d * (out_img_idt + 0.5) - 0.5);
src_d = (src_d > static_cast<T>(0)) ? src_d : static_cast<T>(0);
T d1lambda = align_flag
? static_cast<T>(static_cast<MT>(src_d) - in_img_idt)
: static_cast<T>(ratio_d * out_img_idt - in_img_idt);
T d2lambda = static_cast<T>(1.0) - d1lambda;
int in_img_idy = align_flag
? static_cast<int>(ratio_h * (out_img_idy + 0.5) - 0.5)
: static_cast<int>(ratio_h * out_img_idy);
in_img_idy = (in_img_idy > 0) ? in_img_idy : 0;
int h_id = (in_img_idy < in_img_h - 1) ? 1 : 0;
T src_h = ratio_h * (out_img_idy + 0.5) - 0.5;
src_h = (src_h > 0) ? src_h : 0;
T h1lambda =
align_flag ? src_h - in_img_idy : ratio_h * out_img_idy - in_img_idy;
T h2lambda = 1.f - h1lambda;
T src_h = static_cast<T>(ratio_h * (out_img_idy + 0.5) - 0.5);
src_h = (src_h > static_cast<T>(0)) ? src_h : static_cast<T>(0);
T h1lambda = align_flag
? static_cast<T>(static_cast<MT>(src_h) - in_img_idy)
: static_cast<T>(ratio_h * out_img_idy - in_img_idy);
T h2lambda = static_cast<T>(1.0) - h1lambda;
int in_img_idx = align_flag
? static_cast<int>(ratio_w * (out_img_idx + 0.5) - 0.5)
: static_cast<int>(ratio_w * out_img_idx);
in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;
int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;
T src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
src_w = (src_w > 0) ? src_w : 0;
T w1lambda =
align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
T w2lambda = 1.f - w1lambda;
T src_w = static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
src_w = (src_w > static_cast<T>(0)) ? src_w : static_cast<T>(0);
T w1lambda = align_flag
? static_cast<T>(static_cast<MT>(src_w) - in_img_idx)
: static_cast<T>(ratio_w * out_img_idx - in_img_idx);
T w2lambda = static_cast<T>(1.0) - w1lambda;
if (data_layout == DataLayout::kNCHW) {
int in_pos1_idx = out_id_h * input_w + channel_id * in_img_size +
......@@ -926,7 +938,8 @@ static void Interpolate2DCUDAFwd(
thread_num = 512;
}
#endif
const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0;
const float align_type_value =
(align_mode == 0 && !align_corners) ? 0.5f : 0.f;
if (data_layout == DataLayout::kNCHW) {
// get launch 3D config
int nc = n * c;
......@@ -1028,7 +1041,7 @@ static void Interpolate3DCUDAFwd(
if (scale_tensor) {
auto scale_data =
funcs::get_new_data_from_tensor<float>(scale_tensor.get_ptr());
if (scale_data.size() > 1) {
if (scale_data.size() > 2) {
scale_d = scale_data[0];
scale_h = scale_data[1];
scale_w = scale_data[2];
......@@ -1060,7 +1073,7 @@ static void Interpolate3DCUDAFwd(
"should be greater than 0, but received value is %d.",
scale_d));
} else {
if (scale.size() > 1) {
if (scale.size() > 2) {
scale_d = scale[0];
scale_h = scale[1];
scale_w = scale[2];
......@@ -1446,6 +1459,7 @@ PD_REGISTER_KERNEL(bilinear_interp,
phi::BilinearInterpKernel,
float,
double,
phi::dtype::float16,
int) {
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
......@@ -1456,6 +1470,7 @@ PD_REGISTER_KERNEL(nearest_interp,
phi::NearestInterpKernel,
float,
double,
phi::dtype::float16,
int,
int64_t) {
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
......@@ -1467,6 +1482,7 @@ PD_REGISTER_KERNEL(trilinear_interp,
phi::TrilinearInterpKernel,
float,
double,
phi::dtype::float16,
int) {
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
......@@ -1477,6 +1493,7 @@ PD_REGISTER_KERNEL(linear_interp,
phi::LinearInterpKernel,
float,
double,
phi::dtype::float16,
int) {
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
......@@ -1487,6 +1504,7 @@ PD_REGISTER_KERNEL(bicubic_interp,
phi::BicubicInterpKernel,
float,
double,
phi::dtype::float16,
int) {
kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
......
......@@ -622,6 +622,44 @@ class TestBicubicOpError(unittest.TestCase):
self.test_imperative_errors()
@unittest.skipIf(not fluid.core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestBicubicInterpOpForFloat16(unittest.TestCase):
def init_test_case(self):
self.interp_method = 'bicubic'
self.input_shape = [2, 3, 5, 5]
self.out_size = np.array([3, 3]).astype("int32")
self.align_corners = True
self.data_layout = 'NCHW'
def check_main(self, x_np, dtype):
paddle.disable_static()
x_np = x_np.astype(dtype)
x = paddle.to_tensor(x_np)
x.stop_gradient = False
y = interpolate(x,
size=self.out_size.tolist(),
mode=self.interp_method,
align_corners=self.align_corners,
data_format=self.data_layout)
x_g = paddle.grad(y, x)
y_np = y[0].numpy().astype('float32')
x_g_np = x_g[0].numpy().astype('float32')
paddle.enable_static()
return y_np, x_g_np
def test_main(self):
self.init_test_case()
x_np = np.random.random(self.input_shape).astype("float16")
y_np_1, x_g_np_1 = self.check_main(x_np, 'float16')
y_np_2, x_g_np_2 = self.check_main(x_np, 'float32')
np.testing.assert_allclose(y_np_1, y_np_2)
np.testing.assert_allclose(x_g_np_1, x_g_np_2)
if __name__ == "__main__":
paddle.enable_static()
unittest.main()
......@@ -766,5 +766,45 @@ class TestBilinearInterpOpAPI_dy4(unittest.TestCase):
np.testing.assert_allclose(out.numpy(), expect_res, rtol=1e-05)
@unittest.skipIf(not fluid.core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestBilinearInterpOpForFloat16(unittest.TestCase):
def init_test_case(self):
self.interp_method = 'bilinear'
self.input_shape = [2, 3, 5, 5]
self.out_size = np.array([3, 3]).astype("int32")
self.align_corners = True
self.align_mode = 1
self.data_layout = 'NCHW'
def check_main(self, x_np, dtype):
paddle.disable_static()
x_np = x_np.astype(dtype)
x = paddle.to_tensor(x_np)
x.stop_gradient = False
y = interpolate(x,
size=self.out_size.tolist(),
mode=self.interp_method,
align_mode=self.align_mode,
align_corners=self.align_corners,
data_format=self.data_layout)
x_g = paddle.grad(y, x)
y_np = y[0].numpy().astype('float32')
x_g_np = x_g[0].numpy().astype('float32')
paddle.enable_static()
return y_np, x_g_np
def test_main(self):
self.init_test_case()
x_np = np.random.random(self.input_shape).astype("float16")
y_np_1, x_g_np_1 = self.check_main(x_np, 'float16')
y_np_2, x_g_np_2 = self.check_main(x_np, 'float32')
np.testing.assert_allclose(y_np_1, y_np_2)
np.testing.assert_allclose(x_g_np_1, x_g_np_2)
if __name__ == "__main__":
unittest.main()
......@@ -376,9 +376,7 @@ class TestLinearInterpOpAPI2_0(unittest.TestCase):
# dygraph
x_data = np.random.random((1, 3, 128)).astype("float32")
us_1 = paddle.nn.Upsample(size=[
64,
],
us_1 = paddle.nn.Upsample(size=[64],
mode='linear',
align_mode=1,
align_corners=False,
......@@ -493,28 +491,21 @@ class TestLinearInterpOpError(unittest.TestCase):
def input_shape_error():
x1 = fluid.data(name="x1", shape=[1], dtype="float32")
out1 = paddle.nn.Upsample(size=[
256,
],
out1 = paddle.nn.Upsample(size=[256],
data_format='NCW',
mode='linear')
out1_res = out1(x1)
def data_format_error():
x2 = fluid.data(name="x2", shape=[1, 3, 128], dtype="float32")
out2 = paddle.nn.Upsample(size=[
256,
],
out2 = paddle.nn.Upsample(size=[256],
data_format='NHWCD',
mode='linear')
out2_res = out2(x2)
def out_shape_error():
x3 = fluid.data(name="x3", shape=[1, 3, 128], dtype="float32")
out3 = paddle.nn.Upsample(size=[
256,
256,
],
out3 = paddle.nn.Upsample(size=[256, 256],
data_format='NHWC',
mode='linear')
out3_res = out3(x3)
......@@ -524,5 +515,46 @@ class TestLinearInterpOpError(unittest.TestCase):
self.assertRaises(ValueError, out_shape_error)
@unittest.skipIf(not fluid.core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestLinearInterpOpForFloat16(unittest.TestCase):
def init_test_case(self):
self.interp_method = 'linear'
self.input_shape = [1, 3, 64]
self.scale = 2
self.align_corners = False
self.align_mode = 1
self.data_layout = 'NCW'
def check_main(self, x_np, dtype):
paddle.disable_static()
x_np = x_np.astype(dtype)
x = paddle.to_tensor(x_np)
x.stop_gradient = False
y = interpolate(x,
scale_factor=self.scale,
mode=self.interp_method,
align_mode=self.align_mode,
align_corners=self.align_corners,
data_format=self.data_layout)
x_g = paddle.grad(y, x)
y_np = y[0].numpy().astype('float32')
x_g_np = x_g[0].numpy().astype('float32')
paddle.enable_static()
return y_np, x_g_np
def test_main(self):
self.init_test_case()
x_np = np.random.random(self.input_shape).astype("float16")
y_np_1, x_g_np_1 = self.check_main(x_np, 'float16')
y_np_2, x_g_np_2 = self.check_main(x_np, 'float32')
# forward
np.testing.assert_allclose(y_np_1, y_np_2, rtol=1e-03)
# backward
np.testing.assert_allclose(x_g_np_1, x_g_np_2)
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -802,5 +802,81 @@ class TestNearestInterpException(unittest.TestCase):
self.assertRaises(ValueError, mode_error)
@unittest.skipIf(not fluid.core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestNearestInterp3DOpForFloat16(unittest.TestCase):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [2, 2, 6, 6, 6]
self.scale = [2, 2, 2]
self.align_corners = False
self.data_layout = 'NCDHW'
def check_main(self, x_np, dtype):
paddle.disable_static()
x_np = x_np.astype(dtype)
x = paddle.to_tensor(x_np)
x.stop_gradient = False
y = interpolate(x,
scale_factor=self.scale,
mode=self.interp_method,
align_corners=self.align_corners,
data_format=self.data_layout)
x_g = paddle.grad(y, x)
y_np = y[0].numpy().astype('float32')
x_g_np = x_g[0].numpy().astype('float32')
paddle.enable_static()
return y_np, x_g_np
def test_main(self):
self.init_test_case()
x_np = np.random.random(self.input_shape).astype("float16")
y_np_1, x_g_np_1 = self.check_main(x_np, 'float16')
y_np_2, x_g_np_2 = self.check_main(x_np, 'float32')
# forward
np.testing.assert_allclose(y_np_1, y_np_2, rtol=1e-03)
# backward
np.testing.assert_allclose(x_g_np_1, x_g_np_2)
@unittest.skipIf(not fluid.core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestNearestInterpOpForFloat16(unittest.TestCase):
def init_test_case(self):
self.interp_method = 'nearest'
self.input_shape = [2, 2, 6, 6]
self.scale = [2, 2]
self.align_corners = False
def check_main(self, x_np, dtype):
paddle.disable_static()
x_np = x_np.astype(dtype)
x = paddle.to_tensor(x_np)
x.stop_gradient = False
y = interpolate(x,
scale_factor=self.scale,
mode=self.interp_method,
align_corners=self.align_corners)
x_g = paddle.grad(y, x)
y_np = y[0].numpy().astype('float32')
x_g_np = x_g[0].numpy().astype('float32')
paddle.enable_static()
return y_np, x_g_np
def test_main(self):
self.init_test_case()
x_np = np.random.random(self.input_shape).astype("float16")
y_np_1, x_g_np_1 = self.check_main(x_np, 'float16')
y_np_2, x_g_np_2 = self.check_main(x_np, 'float32')
# forward
np.testing.assert_allclose(y_np_1, y_np_2)
# backward
np.testing.assert_allclose(x_g_np_1, x_g_np_2)
if __name__ == "__main__":
unittest.main()
......@@ -154,15 +154,15 @@ def trilinear_interp_np(input,
out[:, :, i, j, k] = \
d2lambda * \
(h2lambda * (w2lambda * input[:, :, d, h, w] + \
w1lambda * input[:, :, d, h, w+wid]) + \
h1lambda * (w2lambda * input[:, :, d, h+hid, w] + \
w1lambda * input[:, :, d, h+hid, w+wid])) + \
(h2lambda * (w2lambda * input[:, :, d, h, w] +
w1lambda * input[:, :, d, h, w+wid]) +
h1lambda * (w2lambda * input[:, :, d, h+hid, w] +
w1lambda * input[:, :, d, h+hid, w+wid])) + \
d1lambda * \
(h2lambda * (w2lambda * input[:, :, d+did, h, w] + \
w1lambda * input[:, :, d+did, h, w+wid]) + \
h1lambda * (w2lambda * input[:, :, d+did, h+hid, w] + \
w1lambda * input[:, :, d+did, h+hid, w+wid]))
(h2lambda * (w2lambda * input[:, :, d+did, h, w] +
w1lambda * input[:, :, d+did, h, w+wid]) +
h1lambda * (w2lambda * input[:, :, d+did, h+hid, w] +
w1lambda * input[:, :, d+did, h+hid, w+wid]))
if data_layout == "NDHWC":
out = np.transpose(out, (0, 2, 3, 4, 1)) # NCDHW => NDHWC
......@@ -809,5 +809,59 @@ class TestTrilinearInterpOpException(unittest.TestCase):
self.assertRaises(ValueError, attr_data_format)
@unittest.skipIf(not fluid.core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestTrilinearInterpOpForFloat16(unittest.TestCase):
def init_test_case(self):
self.interp_method = 'trilinear'
self.input_shape = [2, 3, 4, 4, 4]
self.out_size = np.array([3, 3, 3]).astype("int32")
self.align_corners = True
self.align_mode = 1
self.data_layout = 'NCDHW'
def check_main(self, x_np, dtype):
paddle.disable_static()
x_np = x_np.astype(dtype)
x = paddle.to_tensor(x_np)
x.stop_gradient = False
y = interpolate(x,
size=self.out_size.tolist(),
mode=self.interp_method,
align_corners=self.align_corners,
align_mode=self.align_mode,
data_format=self.data_layout)
x_g = paddle.grad(y, x)
y_np = y[0].numpy().astype('float32')
x_g_np = x_g[0].numpy().astype('float32')
paddle.enable_static()
return y_np, x_g_np
def test_main(self):
self.init_test_case()
x_np = np.random.random(self.input_shape).astype("float16")
y_np_1, x_g_np_1 = self.check_main(x_np, 'float16')
y_np_2, x_g_np_2 = self.check_main(x_np, 'float32')
# forward
np.testing.assert_allclose(y_np_1, y_np_2, rtol=1e-03)
# backward
np.testing.assert_allclose(x_g_np_1, x_g_np_2, rtol=1e-05)
@unittest.skipIf(not fluid.core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestTrilinearInterpDatalayoutForFloat16(TestTrilinearInterpOpForFloat16):
def init_test_case(self):
self.interp_method = 'trilinear'
self.input_shape = [2, 4, 4, 4, 3]
self.out_size = np.array([3, 3, 3]).astype("int32")
self.align_corners = True
self.align_mode = 1
self.data_layout = "NDHWC"
if __name__ == "__main__":
unittest.main()