[AMP OP&Test] add bf16 fp16 type support for interpolate (#51153)

* add bf16 fp16 type support for interpolate * add bf16 fp16 support for interpolate in phi on cpu

[AMP OP&Test] add bf16 fp16 type support for interpolate (#51153)
* add bf16 fp16 type support for interpolate * add bf16 fp16 support for interpolate in phi on cpu
2f2bf4e8 · 傅剑寒 · GitHub · f57f9b6b · 2f2bf4e8 · 2f2bf4e8
5 changed file
--- a/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/interpolate_grad_kernel.cc
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/interpolate_grad_kernel.h"

 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/interpolate_function.h"
@@ -36,6 +37,7 @@ static void LinearInterpolationGrad(const DenseTensor& output_grad,
  auto input_grad_t = EigenTensor<T, 3>::From(*input_grad);
  auto output_grad_t = EigenTensor<T, 3>::From(output_grad);
  bool align_flag = (align_mode == 0 && !align_corners);
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
  for (int l = 0; l < out_w; l++) {
    int x_w = align_flag ? static_cast<int>(ratio_w * (l + 0.5) - 0.5)
                         : static_cast<int>(ratio_w * l);
@@ -51,11 +53,11 @@ static void LinearInterpolationGrad(const DenseTensor& output_grad,
      for (int j = 0; j < c; j++) {  // loop for channels
        // linear interpolation grad
        if (data_layout == DataLayout::kNCHW) {
-          const T grad = output_grad_t(i, j, l);
+          const MT grad = static_cast<MT>(output_grad_t(i, j, l));
          input_grad_t(i, j, x_w) += static_cast<T>(grad * d_e);
          input_grad_t(i, j, x_e) += static_cast<T>(grad * d_w);
        } else {
-          const T grad = output_grad_t(i, l, j);
+          const MT grad = static_cast<MT>(output_grad_t(i, l, j));
          input_grad_t(i, x_w, j) += static_cast<T>(grad * d_e);
          input_grad_t(i, x_e, j) += static_cast<T>(grad * d_w);
        }
@@ -81,6 +83,9 @@ static void BilinearInterpolationGrad(const DenseTensor& output_grad,
  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
  bool align_flag = (align_mode == 0 && !align_corners);
+
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+
  for (int k = 0; k < out_h; k++) {  // loop for images
    int y_n = align_flag ? static_cast<int>(ratio_h * (k + 0.5) - 0.5)
                         : static_cast<int>(ratio_h * k);
@@ -105,13 +110,14 @@ static void BilinearInterpolationGrad(const DenseTensor& output_grad,
        for (int j = 0; j < c; j++) {  // loop for channels
          // bilinear interpolation grad
          if (data_layout == DataLayout::kNCHW) {
-            const T grad = output_grad_t(i, j, k, l);
+            // const T grad = output_grad_t(i, j, k, l);
+            const MT grad = static_cast<MT>(output_grad_t(i, j, k, l));
            input_grad_t(i, j, y_n, x_w) += static_cast<T>(grad * d_s * d_e);
            input_grad_t(i, j, y_s, x_w) += static_cast<T>(grad * d_n * d_e);
            input_grad_t(i, j, y_n, x_e) += static_cast<T>(grad * d_s * d_w);
            input_grad_t(i, j, y_s, x_e) += static_cast<T>(grad * d_n * d_w);
          } else {
-            const T grad = output_grad_t(i, k, l, j);
+            const MT grad = static_cast<MT>(output_grad_t(i, k, l, j));
            input_grad_t(i, y_n, x_w, j) += static_cast<T>(grad * d_s * d_e);
            input_grad_t(i, y_s, x_w, j) += static_cast<T>(grad * d_n * d_e);
            input_grad_t(i, y_n, x_e, j) += static_cast<T>(grad * d_s * d_w);
@@ -173,24 +179,23 @@ static void BicubicInterpolationGrad(const DenseTensor& output_grad,
                                     const DataLayout data_layout) {
  auto input_grad_t = EigenTensor<T, 4>::From(*input_grad);
  auto output_grad_t = EigenTensor<T, 4>::From(output_grad);
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;

  for (int k = 0; k < out_h; k++) {  // loop for images
-    T y_n = align_corners ? static_cast<T>(ratio_h * k)
-                          : static_cast<T>(ratio_h * (k + 0.5) - 0.5);
+    MT y_n = align_corners ? ratio_h * k : ratio_h * (k + 0.5) - 0.5;
    int input_y = floorf(y_n);
-    T y_t = y_n - input_y;
+    MT y_t = y_n - input_y;

    for (int l = 0; l < out_w; l++) {
-      T x_n = align_corners ? static_cast<T>(ratio_w * l)
-                            : static_cast<T>(ratio_w * (l + 0.5) - 0.5);
+      MT x_n = align_corners ? ratio_w * l : ratio_w * (l + 0.5) - 0.5;
      int input_x = floorf(x_n);
-      T x_t = x_n - input_x;
+      MT x_t = x_n - input_x;

-      T x_coeffs[4];
-      T y_coeffs[4];
+      MT x_coeffs[4];
+      MT y_coeffs[4];

-      funcs::get_cubic_upsample_coefficients<T>(x_coeffs, x_t);
-      funcs::get_cubic_upsample_coefficients<T>(y_coeffs, y_t);
+      funcs::get_cubic_upsample_coefficients<MT>(x_coeffs, x_t);
+      funcs::get_cubic_upsample_coefficients<MT>(y_coeffs, y_t);

      for (int i = 0; i < n; i++) {    // loop for batches
        for (int j = 0; j < c; j++) {  // loop for channels
@@ -202,13 +207,13 @@ static void BicubicInterpolationGrad(const DenseTensor& output_grad,
              int access_y = std::max(std::min(input_y - 1 + jj, in_h - 1),
                                      static_cast<int>(0));
              if (data_layout == DataLayout::kNCHW) {
-                T grad = output_grad_t(i, j, k, l);
+                MT grad = static_cast<MT>(output_grad_t(i, j, k, l));
                input_grad_t(i, j, access_y, access_x) +=
-                    grad * y_coeffs[jj] * x_coeffs[ii];
+                    static_cast<T>(grad * y_coeffs[jj] * x_coeffs[ii]);
              } else {
-                T grad = output_grad_t(i, k, l, j);
+                MT grad = static_cast<MT>(output_grad_t(i, k, l, j));
                input_grad_t(i, access_y, access_x, j) +=
-                    grad * y_coeffs[jj] * x_coeffs[ii];
+                    static_cast<T>(grad * y_coeffs[jj] * x_coeffs[ii]);
              }
            }
          }
@@ -238,6 +243,7 @@ static void TrilinearInterpolationGrad(const DenseTensor& output_grad,
  auto input_grad_t = EigenTensor<T, 5>::From(*input_grad);
  auto output_grad_t = EigenTensor<T, 5>::From(output_grad);
  bool align_flag = (align_mode == 0 && !align_corners);
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
  for (int j = 0; j < out_d; j++) {  // loop for D
    int t_f = align_flag ? static_cast<int>(ratio_d * (j + 0.5) - 0.5)
                         : static_cast<int>(ratio_d * j);
@@ -272,7 +278,7 @@ static void TrilinearInterpolationGrad(const DenseTensor& output_grad,
          for (int i = 0; i < c; i++) {  // loop for channels
            // trilinear interpolation grad
            if (data_layout == DataLayout::kNCHW) {
-              const T grad = output_grad_t(b, i, j, k, l);
+              const MT grad = static_cast<MT>(output_grad_t(b, i, j, k, l));
              input_grad_t(b, i, t_f, y_n, x_w) +=
                  static_cast<T>(grad * d_b * d_s * d_e);
              input_grad_t(b, i, t_f, y_n, x_e) +=
@@ -290,7 +296,7 @@ static void TrilinearInterpolationGrad(const DenseTensor& output_grad,
              input_grad_t(b, i, t_b, y_s, x_e) +=
                  static_cast<T>(grad * d_f * d_n * d_w);
            } else {
-              const T grad = output_grad_t(b, j, k, l, i);
+              const MT grad = static_cast<MT>(output_grad_t(b, j, k, l, i));
              input_grad_t(b, t_f, y_n, x_w, i) +=
                  static_cast<T>(grad * d_b * d_s * d_e);
              input_grad_t(b, t_f, y_n, x_e, i) +=
@@ -1038,7 +1044,9 @@ PD_REGISTER_KERNEL(bilinear_interp_grad,
                   ALL_LAYOUT,
                   phi::BilinearInterpGradKernel,
                   float,
-                   double) {
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
@@ -1047,7 +1055,9 @@ PD_REGISTER_KERNEL(nearest_interp_grad,
                   ALL_LAYOUT,
                   phi::NearestInterpGradKernel,
                   float,
-                   double) {
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
@@ -1056,7 +1066,9 @@ PD_REGISTER_KERNEL(trilinear_interp_grad,
                   ALL_LAYOUT,
                   phi::TrilinearInterpGradKernel,
                   float,
-                   double) {
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
@@ -1065,7 +1077,9 @@ PD_REGISTER_KERNEL(linear_interp_grad,
                   ALL_LAYOUT,
                   phi::LinearInterpGradKernel,
                   float,
-                   double) {
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
@@ -1074,7 +1088,9 @@ PD_REGISTER_KERNEL(bicubic_interp_grad,
                   ALL_LAYOUT,
                   phi::BicubicInterpGradKernel,
                   float,
-                   double) {
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
--- a/paddle/phi/kernels/cpu/interpolate_kernel.cc
+++ b/paddle/phi/kernels/cpu/interpolate_kernel.cc
@@ -15,6 +15,7 @@
 #include "paddle/phi/kernels/interpolate_kernel.h"

 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/layout.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/interpolate_function.h"
@@ -43,6 +44,7 @@ static void LinearInterpolation(const DenseTensor& input,
  auto input_t = EigenTensor<T, 3>::From(input);
  auto output_t = EigenTensor<T, 3>::From(*output);
  bool align_flag = (align_mode == 0 && !align_corners);
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;

  std::vector<int> vx_w, vx_e;
  std::vector<float> vd_w, vd_e;
@@ -80,12 +82,14 @@ static void LinearInterpolation(const DenseTensor& input,
        // linear interpolation
        T out_t;
        if (data_layout == DataLayout::kNCHW) {
-          out_t = input_t(i, j, vx_w[l]) * vd_e[l] +
-                  input_t(i, j, vx_e[l]) * vd_w[l];
+          out_t =
+              static_cast<T>(static_cast<MT>(input_t(i, j, vx_w[l])) * vd_e[l] +
+                             static_cast<MT>(input_t(i, j, vx_e[l])) * vd_w[l]);
          output_t(i, j, l) = out_t;
        } else {
-          out_t = input_t(i, vx_w[l], j) * vd_e[l] +
-                  input_t(i, vx_e[l], j) * vd_w[l];
+          out_t =
+              static_cast<T>(static_cast<MT>(input_t(i, vx_w[l], j)) * vd_e[l] +
+                             static_cast<MT>(input_t(i, vx_e[l], j)) * vd_w[l]);
          output_t(i, l, j) = out_t;
        }
      }
@@ -110,6 +114,7 @@ static void BilinearInterpolation(const DenseTensor& input,
  auto input_t = EigenTensor<T, 4>::From(input);
  auto output_t = EigenTensor<T, 4>::From(*output);
  bool align_flag = (align_mode == 0 && !align_corners);
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;

  std::vector<int> vy_n, vy_s;
  std::vector<float> vd_n, vd_s;
@@ -174,17 +179,27 @@ static void BilinearInterpolation(const DenseTensor& input,
          // bilinear interpolation
          T out_t;
          if (data_layout == DataLayout::kNCHW) {
-            out_t = input_t(i, j, vy_n[k], vx_w[l]) * vd_s[k] * vd_e[l] +
-                    input_t(i, j, vy_s[k], vx_w[l]) * vd_n[k] * vd_e[l] +
-                    input_t(i, j, vy_n[k], vx_e[l]) * vd_s[k] * vd_w[l] +
-                    input_t(i, j, vy_s[k], vx_e[l]) * vd_n[k] * vd_w[l];
+            out_t = static_cast<T>(
+                static_cast<MT>(input_t(i, j, vy_n[k], vx_w[l])) * vd_s[k] *
+                    vd_e[l] +
+                static_cast<MT>(input_t(i, j, vy_s[k], vx_w[l])) * vd_n[k] *
+                    vd_e[l] +
+                static_cast<MT>(input_t(i, j, vy_n[k], vx_e[l])) * vd_s[k] *
+                    vd_w[l] +
+                static_cast<MT>(input_t(i, j, vy_s[k], vx_e[l])) * vd_n[k] *
+                    vd_w[l]);
            output_t(i, j, k, l) = out_t;

          } else {
-            out_t = input_t(i, vy_n[k], vx_w[l], j) * vd_s[k] * vd_e[l] +
-                    input_t(i, vy_s[k], vx_w[l], j) * vd_n[k] * vd_e[l] +
-                    input_t(i, vy_n[k], vx_e[l], j) * vd_s[k] * vd_w[l] +
-                    input_t(i, vy_s[k], vx_e[l], j) * vd_n[k] * vd_w[l];
+            out_t = static_cast<T>(
+                static_cast<MT>(input_t(i, vy_n[k], vx_w[l], j)) * vd_s[k] *
+                    vd_e[l] +
+                static_cast<MT>(input_t(i, vy_s[k], vx_w[l], j)) * vd_n[k] *
+                    vd_e[l] +
+                static_cast<MT>(input_t(i, vy_n[k], vx_e[l], j)) * vd_s[k] *
+                    vd_w[l] +
+                static_cast<MT>(input_t(i, vy_s[k], vx_e[l], j)) * vd_n[k] *
+                    vd_w[l]);
            output_t(i, k, l, j) = out_t;
          }
        }
@@ -206,6 +221,7 @@ static void NearestNeighborInterpolate(const DenseTensor& input,
                                       const DataLayout& data_layout) {
  auto input_t = EigenTensor<T, 4>::From(input);
  auto output_t = EigenTensor<T, 4>::From(*output);
+
  for (int k = 0; k < out_h; k++) {  // loop for images
    int in_k = (align_corners) ? static_cast<int>(ratio_h * k + 0.5)
                               : static_cast<int>(ratio_h * k);
@@ -242,22 +258,23 @@ static void BicubicInterpolation(const DenseTensor& input,
                                 const DataLayout data_layout) {
  auto input_t = EigenTensor<T, 4>::From(input);
  auto output_t = EigenTensor<T, 4>::From(*output);
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;

  for (int k = 0; k < out_h; k++) {  // loop for images
-    T y_n = align_corners ? static_cast<T>(ratio_h * k)
-                          : static_cast<T>(ratio_h * (k + 0.5) - 0.5);
+    MT y_n = align_corners ? static_cast<MT>(ratio_h * k)
+                           : static_cast<MT>(ratio_h * (k + 0.5) - 0.5);
    int input_y = floorf(y_n);
-    const T y_t = y_n - input_y;
+    const MT y_t = y_n - input_y;

    for (int l = 0; l < out_w; l++) {
-      T x_n = align_corners ? static_cast<T>(ratio_w * l)
-                            : static_cast<T>(ratio_w * (l + 0.5) - 0.5);
+      MT x_n = align_corners ? static_cast<MT>(ratio_w * l)
+                             : static_cast<MT>(ratio_w * (l + 0.5) - 0.5);
      int input_x = floorf(x_n);
-      const T x_t = x_n - input_x;
+      const MT x_t = x_n - input_x;

      for (int i = 0; i < n; i++) {    // loop for batches
        for (int j = 0; j < c; j++) {  // loop for channels
-          T coefficients[4];
+          MT coefficients[4];
          // interp 4 times in x direction
          for (int ii = 0; ii < 4; ii++) {
            int access_y = std::max(std::min(input_y - 1 + ii, in_h - 1),
@@ -271,35 +288,37 @@ static void BicubicInterpolation(const DenseTensor& input,
            int access_x_3 =
                std::max(std::min(input_x + 2, in_w - 1), static_cast<int>(0));
            if (data_layout == DataLayout::kNCHW) {
-              coefficients[ii] =
-                  cubic_interp<T>(input_t(i, j, access_y, access_x_0),
-                                  input_t(i, j, access_y, access_x_1),
-                                  input_t(i, j, access_y, access_x_2),
-                                  input_t(i, j, access_y, access_x_3),
+              coefficients[ii] = cubic_interp<MT>(
+                  static_cast<MT>(input_t(i, j, access_y, access_x_0)),
+                  static_cast<MT>(input_t(i, j, access_y, access_x_1)),
+                  static_cast<MT>(input_t(i, j, access_y, access_x_2)),
+                  static_cast<MT>(input_t(i, j, access_y, access_x_3)),
                  x_t);
            } else {
-              coefficients[ii] =
-                  cubic_interp<T>(input_t(i, access_y, access_x_0, j),
-                                  input_t(i, access_y, access_x_1, j),
-                                  input_t(i, access_y, access_x_2, j),
-                                  input_t(i, access_y, access_x_3, j),
+              coefficients[ii] = cubic_interp<MT>(
+                  static_cast<MT>(input_t(i, access_y, access_x_0, j)),
+                  static_cast<MT>(input_t(i, access_y, access_x_1, j)),
+                  static_cast<MT>(input_t(i, access_y, access_x_2, j)),
+                  static_cast<MT>(input_t(i, access_y, access_x_3, j)),
                  x_t);
            }
          }

          // interp y direction
          if (data_layout == DataLayout::kNCHW) {
-            output_t(i, j, k, l) = cubic_interp<T>(coefficients[0],
+            output_t(i, j, k, l) =
+                static_cast<T>(cubic_interp<MT>(coefficients[0],
                                                coefficients[1],
                                                coefficients[2],
                                                coefficients[3],
-                                                   y_t);
+                                                y_t));
          } else {
-            output_t(i, k, l, j) = cubic_interp<T>(coefficients[0],
+            output_t(i, k, l, j) =
+                static_cast<T>(cubic_interp<MT>(coefficients[0],
                                                coefficients[1],
                                                coefficients[2],
                                                coefficients[3],
-                                                   y_t);
+                                                y_t));
          }
        }
      }
@@ -327,6 +346,7 @@ static void TrilinearInterpolation(const DenseTensor& input,
  auto input_t = EigenTensor<T, 5>::From(input);
  auto output_t = EigenTensor<T, 5>::From(*output);
  bool align_flag = (align_mode == 0 && !align_corners);
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;

  std::vector<int> vt_f, vt_b;
  std::vector<float> vd_f, vd_b;
@@ -417,40 +437,42 @@ static void TrilinearInterpolation(const DenseTensor& input,
          for (int l = 0; l < out_w; l++) {
            // trilinear interpolation
            if (data_layout == DataLayout::kNCHW) {
-              T out_t = input_t(b, i, vt_f[j], vy_n[k], vx_w[l]) * vd_b[j] *
-                            vd_s[k] * vd_e[l] +
-                        input_t(b, i, vt_f[j], vy_n[k], vx_e[l]) * vd_b[j] *
-                            vd_s[k] * vd_w[l] +
-                        input_t(b, i, vt_f[j], vy_s[k], vx_w[l]) * vd_b[j] *
-                            vd_n[k] * vd_e[l] +
-                        input_t(b, i, vt_f[j], vy_s[k], vx_e[l]) * vd_b[j] *
-                            vd_n[k] * vd_w[l] +
-                        input_t(b, i, vt_b[j], vy_n[k], vx_w[l]) * vd_f[j] *
-                            vd_s[k] * vd_e[l] +
-                        input_t(b, i, vt_b[j], vy_n[k], vx_e[l]) * vd_f[j] *
-                            vd_s[k] * vd_w[l] +
-                        input_t(b, i, vt_b[j], vy_s[k], vx_w[l]) * vd_f[j] *
-                            vd_n[k] * vd_e[l] +
-                        input_t(b, i, vt_b[j], vy_s[k], vx_e[l]) * vd_f[j] *
-                            vd_n[k] * vd_w[l];
+              T out_t = static_cast<T>(
+                  static_cast<MT>(input_t(b, i, vt_f[j], vy_n[k], vx_w[l])) *
+                      vd_b[j] * vd_s[k] * vd_e[l] +
+                  static_cast<MT>(input_t(b, i, vt_f[j], vy_n[k], vx_e[l])) *
+                      vd_b[j] * vd_s[k] * vd_w[l] +
+                  static_cast<MT>(input_t(b, i, vt_f[j], vy_s[k], vx_w[l])) *
+                      vd_b[j] * vd_n[k] * vd_e[l] +
+                  static_cast<MT>(input_t(b, i, vt_f[j], vy_s[k], vx_e[l])) *
+                      vd_b[j] * vd_n[k] * vd_w[l] +
+                  static_cast<MT>(input_t(b, i, vt_b[j], vy_n[k], vx_w[l])) *
+                      vd_f[j] * vd_s[k] * vd_e[l] +
+                  static_cast<MT>(input_t(b, i, vt_b[j], vy_n[k], vx_e[l])) *
+                      vd_f[j] * vd_s[k] * vd_w[l] +
+                  static_cast<MT>(input_t(b, i, vt_b[j], vy_s[k], vx_w[l])) *
+                      vd_f[j] * vd_n[k] * vd_e[l] +
+                  static_cast<MT>(input_t(b, i, vt_b[j], vy_s[k], vx_e[l])) *
+                      vd_f[j] * vd_n[k] * vd_w[l]);
              output_t(b, i, j, k, l) = out_t;
            } else {
-              T out_t = input_t(b, vt_f[j], vy_n[k], vx_w[l], i) * vd_b[j] *
-                            vd_s[k] * vd_e[l] +
-                        input_t(b, vt_f[j], vy_n[k], vx_e[l], i) * vd_b[j] *
-                            vd_s[k] * vd_w[l] +
-                        input_t(b, vt_f[j], vy_s[k], vx_w[l], i) * vd_b[j] *
-                            vd_n[k] * vd_e[l] +
-                        input_t(b, vt_f[j], vy_s[k], vx_e[l], i) * vd_b[j] *
-                            vd_n[k] * vd_w[l] +
-                        input_t(b, vt_b[j], vy_n[k], vx_w[l], i) * vd_f[j] *
-                            vd_s[k] * vd_e[l] +
-                        input_t(b, vt_b[j], vy_n[k], vx_e[l], i) * vd_f[j] *
-                            vd_s[k] * vd_w[l] +
-                        input_t(b, vt_b[j], vy_s[k], vx_w[l], i) * vd_f[j] *
-                            vd_n[k] * vd_e[l] +
-                        input_t(b, vt_b[j], vy_s[k], vx_e[l], i) * vd_f[j] *
-                            vd_n[k] * vd_w[l];
+              T out_t = static_cast<T>(
+                  static_cast<MT>(input_t(b, vt_f[j], vy_n[k], vx_w[l], i)) *
+                      vd_b[j] * vd_s[k] * vd_e[l] +
+                  static_cast<MT>(input_t(b, vt_f[j], vy_n[k], vx_e[l], i)) *
+                      vd_b[j] * vd_s[k] * vd_w[l] +
+                  static_cast<MT>(input_t(b, vt_f[j], vy_s[k], vx_w[l], i)) *
+                      vd_b[j] * vd_n[k] * vd_e[l] +
+                  static_cast<MT>(input_t(b, vt_f[j], vy_s[k], vx_e[l], i)) *
+                      vd_b[j] * vd_n[k] * vd_w[l] +
+                  static_cast<MT>(input_t(b, vt_b[j], vy_n[k], vx_w[l], i)) *
+                      vd_f[j] * vd_s[k] * vd_e[l] +
+                  static_cast<MT>(input_t(b, vt_b[j], vy_n[k], vx_e[l], i)) *
+                      vd_f[j] * vd_s[k] * vd_w[l] +
+                  static_cast<MT>(input_t(b, vt_b[j], vy_s[k], vx_w[l], i)) *
+                      vd_f[j] * vd_n[k] * vd_e[l] +
+                  static_cast<MT>(input_t(b, vt_b[j], vy_s[k], vx_e[l], i)) *
+                      vd_f[j] * vd_n[k] * vd_w[l]);
              output_t(b, j, k, l, i) = out_t;
            }
          }
@@ -1190,7 +1212,9 @@ PD_REGISTER_KERNEL(bilinear_interp,
                   phi::BilinearInterpKernel,
                   float,
                   double,
-                   uint8_t) {
+                   uint8_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
@@ -1202,7 +1226,9 @@ PD_REGISTER_KERNEL(nearest_interp,
                   double,
                   int,
                   int64_t,
-                   uint8_t) {
+                   uint8_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
@@ -1212,7 +1238,9 @@ PD_REGISTER_KERNEL(trilinear_interp,
                   phi::TrilinearInterpKernel,
                   float,
                   double,
-                   uint8_t) {
+                   uint8_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
@@ -1222,12 +1250,20 @@ PD_REGISTER_KERNEL(linear_interp,
                   phi::LinearInterpKernel,
                   float,
                   double,
-                   uint8_t) {
+                   uint8_t,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
-PD_REGISTER_KERNEL(
-    bicubic_interp, CPU, ALL_LAYOUT, phi::BicubicInterpKernel, float, double) {
+PD_REGISTER_KERNEL(bicubic_interp,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::BicubicInterpKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
--- a/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/interpolate_grad_kernel.cu
@@ -60,6 +60,7 @@ __global__ void KeLinearInterpBw(T* in,
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  bool align_flag = (align_mode == 0 && !align_corners);
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
  for (; tid < nthreads; tid += stride) {
    int out_id_h = tid / output_w;
    int out_id_w = tid % output_w;
@@ -79,13 +80,12 @@ __global__ void KeLinearInterpBw(T* in,
                                : ratio_w * out_img_idx;
    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;  // w
    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;  // w_id
-    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
-    T src_w = static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
-    src_w = (src_w > static_cast<T>(0)) ? src_w : static_cast<T>(0);
-    T w1lambda = align_flag
-                     ? static_cast<T>(static_cast<MT>(src_w) - in_img_idx)
-                     : static_cast<T>(ratio_w * out_img_idx - in_img_idx);
-    T w2lambda = static_cast<T>(1.0) - w1lambda;
+
+    MT src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
+    src_w = (src_w > 0) ? src_w : 0;
+    MT w1lambda =
+        align_flag ? src_w - in_img_idx : ratio_w * out_img_idx - in_img_idx;
+    MT w2lambda = 1.0 - w1lambda;

    T* in_pos;
    if (data_layout == DataLayout::kNCHW) {
@@ -96,11 +96,17 @@ __global__ void KeLinearInterpBw(T* in,
    const T* out_pos = &out[out_id_w];

    if (data_layout == DataLayout::kNCHW) {
-      phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
-      phi::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]);
+      phi::CudaAtomicAdd(
+          &in_pos[0], static_cast<T>(w2lambda * static_cast<MT>(out_pos[0])));
+      phi::CudaAtomicAdd(
+          &in_pos[w_id],
+          static_cast<T>(w1lambda * static_cast<MT>(out_pos[0])));
    } else {
-      phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]);
-      phi::CudaAtomicAdd(&in_pos[w_id * num_channels], w1lambda * out_pos[0]);
+      phi::CudaAtomicAdd(
+          &in_pos[0], static_cast<T>(w2lambda * static_cast<MT>(out_pos[0])));
+      phi::CudaAtomicAdd(
+          &in_pos[w_id * num_channels],
+          static_cast<T>(w1lambda * static_cast<MT>(out_pos[0])));
    }
  }
 }
@@ -469,6 +475,7 @@ __global__ void KeBicubicInterpBw(T* in,
  int nthreads = output_h * output_w;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;

  for (; tid < nthreads; tid += stride) {
    int out_id_h = tid / output_w;
@@ -487,23 +494,21 @@ __global__ void KeBicubicInterpBw(T* in,
      channel_id = tid % num_channels;
    }

-    T in_img_idy = align_corners
-                       ? static_cast<T>(ratio_h * out_img_idy)
-                       : static_cast<T>(ratio_h * (out_img_idy + 0.5) - 0.5);
+    MT in_img_idy = align_corners ? ratio_h * out_img_idy
+                                  : ratio_h * (out_img_idy + 0.5) - 0.5;
    int input_y = floorf(static_cast<float>(in_img_idy));
-    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
-    const T y_t = static_cast<T>(static_cast<MT>(in_img_idy) - input_y);
-    T in_img_idx = align_corners
-                       ? static_cast<T>(ratio_w * out_img_idx)
-                       : static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
+
+    const MT y_t = in_img_idy - input_y;
+    MT in_img_idx = align_corners ? ratio_w * out_img_idx
+                                  : ratio_w * (out_img_idx + 0.5) - 0.5;
    int input_x = floorf(static_cast<float>(in_img_idx));
-    const T x_t = static_cast<T>(static_cast<MT>(in_img_idx) - input_x);
+    const MT x_t = in_img_idx - input_x;

-    T x_coeffs[4];
-    T y_coeffs[4];
+    MT x_coeffs[4];
+    MT y_coeffs[4];

-    funcs::get_cubic_upsample_coefficients(x_coeffs, x_t);
-    funcs::get_cubic_upsample_coefficients(y_coeffs, y_t);
+    funcs::get_cubic_upsample_coefficients<MT>(x_coeffs, x_t);
+    funcs::get_cubic_upsample_coefficients<MT>(y_coeffs, y_t);

    const T* out_pos = &out[out_id_h * output_w + out_id_w];
    T* in_pos;
@@ -524,7 +529,8 @@ __global__ void KeBicubicInterpBw(T* in,
                       access_x * num_channels + channel_id];
        }
        phi::CudaAtomicAdd(&in_pos[0],
-                           (out_pos[0] * y_coeffs[j] * x_coeffs[i]));
+                           static_cast<T>(static_cast<MT>(out_pos[0]) *
+                                          y_coeffs[j] * x_coeffs[i]));
      }
    }
  }
@@ -1568,7 +1574,8 @@ PD_REGISTER_KERNEL(bilinear_interp_grad,
                   phi::BilinearInterpGradKernel,
                   float,
                   double,
-                   phi::dtype::float16) {
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
@@ -1589,7 +1596,8 @@ PD_REGISTER_KERNEL(trilinear_interp_grad,
                   phi::TrilinearInterpGradKernel,
                   float,
                   double,
-                   phi::dtype::float16) {
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
@@ -1599,7 +1607,8 @@ PD_REGISTER_KERNEL(linear_interp_grad,
                   phi::LinearInterpGradKernel,
                   float,
                   double,
-                   phi::dtype::float16) {
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
@@ -1609,7 +1618,8 @@ PD_REGISTER_KERNEL(bicubic_interp_grad,
                   phi::BicubicInterpGradKernel,
                   float,
                   double,
-                   phi::dtype::float16) {
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16) {
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(3).SetBackend(phi::Backend::ALL_BACKEND);
 }
--- a/paddle/phi/kernels/gpu/interpolate_kernel.cu
+++ b/paddle/phi/kernels/gpu/interpolate_kernel.cu
@@ -82,26 +82,27 @@ __global__ void KeLinearInterpFw(const T* in,
    in_img_idx = (in_img_idx > 0) ? in_img_idx : 0;  // w
    int w_id = (in_img_idx < in_img_w - 1) ? 1 : 0;  // w_id
    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
-    T src_w = static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
-    src_w = (src_w > static_cast<T>(0)) ? src_w : static_cast<T>(0);
-    T w1lambda = align_flag
-                     ? static_cast<T>(static_cast<MT>(src_w) - in_img_idx)
-                     : static_cast<T>(ratio_w * out_img_idx - in_img_idx);
-    T w2lambda = static_cast<T>(1.0) - w1lambda;
+    MT src_w = ratio_w * (out_img_idx + 0.5) - 0.5;
+    src_w = (src_w > 0) ? src_w : 0;
+    MT w1lambda = align_flag ? (src_w - in_img_idx)
+                             : (ratio_w * out_img_idx - in_img_idx);
+    MT w2lambda = 1.0 - w1lambda;

    if (data_layout == DataLayout::kNCHW) {
      const T* in_pos =
          &in[out_id_h * out_id_w + channel_id * in_img_size + in_img_idx];
      // linear interpolation
      out[out_id_h * output_w + out_id_w] =
-          w2lambda * in_pos[0] + w1lambda * in_pos[w_id];
+          static_cast<T>(w2lambda * static_cast<MT>(in_pos[0]) +
+                         w1lambda * static_cast<MT>(in_pos[w_id]));

    } else {
      const T* in_pos =
          &in[out_id_h * input_w + in_img_idx * num_channels + channel_id];
      // linear interpolation
-      out[out_id_h * output_w + out_id_w] =
-          w2lambda * in_pos[0] + w1lambda * in_pos[w_id * num_channels];
+      out[out_id_h * output_w + out_id_w] = static_cast<T>(
+          w2lambda * static_cast<MT>(in_pos[0]) +
+          w1lambda * static_cast<MT>(in_pos[w_id * num_channels]));
    }
  }
 }
@@ -308,15 +309,18 @@ __global__ void KeBilinearInterpNCHWFw(const T* in,
 template <typename T>
 __device__ __forceinline__ static T Kecubic_interp(
    const T x0, const T x1, const T x2, const T x3, T t) {
-  T coeffs[4];
-  T a = static_cast<T>(-0.75);
-  T x_1 = t;
-  T x_2 = static_cast<T>(1.0) - t;
-  coeffs[0] = funcs::CubicConvolution2<T>(x_1 + static_cast<T>(1.0), a);
-  coeffs[1] = funcs::CubicConvolution1<T>(x_1, a);
-  coeffs[2] = funcs::CubicConvolution1<T>(x_2, a);
-  coeffs[3] = funcs::CubicConvolution2<T>(x_2 + static_cast<T>(1.0), a);
-  return x0 * coeffs[0] + x1 * coeffs[1] + x2 * coeffs[2] + x3 * coeffs[3];
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  MT coeffs[4];
+  MT a = static_cast<MT>(-0.75);
+  MT x_1 = static_cast<MT>(t);
+  MT x_2 = static_cast<MT>(1.0) - static_cast<MT>(t);
+  coeffs[0] = funcs::CubicConvolution2<MT>(x_1 + static_cast<MT>(1.0), a);
+  coeffs[1] = funcs::CubicConvolution1<MT>(x_1, a);
+  coeffs[2] = funcs::CubicConvolution1<MT>(x_2, a);
+  coeffs[3] = funcs::CubicConvolution2<MT>(x_2 + static_cast<MT>(1.0), a);
+  return static_cast<T>(
+      static_cast<MT>(x0) * coeffs[0] + static_cast<MT>(x1) * coeffs[1] +
+      static_cast<MT>(x2) * coeffs[2] + static_cast<MT>(x3) * coeffs[3]);
 }

 template <typename T>
@@ -338,6 +342,7 @@ __global__ void KeBicubicInterpFw(const T* in,
  int nthreads = output_h * output_w;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;

  for (; tid < nthreads; tid += stride) {
    int out_id_h = tid / output_w;
@@ -357,18 +362,16 @@ __global__ void KeBicubicInterpFw(const T* in,
      channel_id = tid % num_channels;
    }

-    T in_img_idy = align_corners
-                       ? static_cast<T>(ratio_h * out_img_idy)
-                       : static_cast<T>(ratio_h * (out_img_idy + 0.5) - 0.5);
+    MT in_img_idy = align_corners ? ratio_h * out_img_idy
+                                  : ratio_h * (out_img_idy + 0.5) - 0.5;
    int input_y = floorf(static_cast<float>(in_img_idy));
-    using MT = typename phi::dtype::MPTypeTrait<T>::Type;
-    const T y_t = static_cast<T>(static_cast<MT>(in_img_idy) - input_y);

-    T in_img_idx = align_corners
-                       ? static_cast<T>(ratio_w * out_img_idx)
-                       : static_cast<T>(ratio_w * (out_img_idx + 0.5) - 0.5);
+    const T y_t = static_cast<T>(in_img_idy - input_y);
+
+    MT in_img_idx = align_corners ? ratio_w * out_img_idx
+                                  : ratio_w * (out_img_idx + 0.5) - 0.5;
    int input_x = floorf(static_cast<float>(in_img_idx));
-    const T x_t = static_cast<T>(static_cast<MT>(in_img_idx) - input_x);
+    const T x_t = static_cast<T>(in_img_idx - input_x);

    T coefficients[4];
    const T* in_pos_0;
@@ -1460,6 +1463,7 @@ PD_REGISTER_KERNEL(bilinear_interp,
                   float,
                   double,
                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
                   int) {
  kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
@@ -1486,6 +1490,7 @@ PD_REGISTER_KERNEL(trilinear_interp,
                   float,
                   double,
                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
                   int) {
  kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
@@ -1498,6 +1503,7 @@ PD_REGISTER_KERNEL(linear_interp,
                   float,
                   double,
                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
                   int) {
  kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);
@@ -1510,6 +1516,7 @@ PD_REGISTER_KERNEL(bicubic_interp,
                   float,
                   double,
                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
                   int) {
  kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND);
  kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND);

--- a/paddle/phi/kernels/onednn/interpolate_kernel.cc
+++ b/paddle/phi/kernels/onednn/interpolate_kernel.cc
@@ -232,7 +232,8 @@ PD_REGISTER_KERNEL(bilinear_interp,
                   ONEDNN,
                   phi::BilinearInterpKernel,
                   float,
-                   phi::dtype::bfloat16) {}
+                   phi::dtype::bfloat16,
+                   phi::dtype::float16) {}

 PD_REGISTER_KERNEL(nearest_interp,
                   OneDNN,
@@ -240,5 +241,6 @@ PD_REGISTER_KERNEL(nearest_interp,
                   phi::NearestInterpKernel,
                   float,
                   phi::dtype::bfloat16,
+                   phi::dtype::float16,
                   int8_t,
                   uint8_t) {}