diff --git a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
index 050c61596fee5f91363566fd5fb58684fcc30522..a4d43ef8fbe8927198ff6e83a39454a3a94a9886 100644
--- a/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/deformable_conv_grad_kernel.cc
@@ -97,14 +97,14 @@ inline void ModulatedDeformableCol2imCPUKernel(
                                            width);
           *(grad_im + cur_bottom_grad_pos) =
-              *(grad_im + cur_bottom_grad_pos) + (weight * cur_top_grad);
+              *(grad_im + cur_bottom_grad_pos) + weight * cur_top_grad;
         }
       }
     }
   }
 }
 
-template <typename T, typename MT, typename Context>
+template <typename T, typename Context>
 void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const T* data_col,
                                const T* data_offset,
@@ -116,7 +116,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const std::vector<int>& stride,
                                const std::vector<int>& dilation,
                                const int deformable_group,
-                               MT* grad_im) {
+                               T* grad_im) {
   int channel_per_deformable_group = im_shape[0] / deformable_group;
   int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
 
@@ -222,22 +222,22 @@ void ModulatedDeformableCol2imCoordCPUKernel(
         if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
           inv_h = inv_w = -2;
         } else {
-          mval += data_col_ptr[col_pos] * funcs::DmcnIm2colBilinear(
-                                              data_im_ptr + cnt * height * width,
-                                              width,
-                                              height,
-                                              width,
-                                              inv_h,
-                                              inv_w);
+          mval += data_col_ptr[col_pos] *
+                  funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
+                                            width,
+                                            height,
+                                            width,
+                                            inv_h,
+                                            inv_w);
         }
         const T weight =
-            DmcnGetCoordinateWeight(inv_h,
-                                    inv_w,
-                                    height,
-                                    width,
-                                    data_im_ptr + cnt * height * width,
-                                    width,
-                                    bp_dir);
+            DmcnGetCoordinateWeight(inv_h,
+                                    inv_w,
+                                    height,
+                                    width,
+                                    data_im_ptr + cnt * height * width,
+                                    width,
+                                    bp_dir);
         if (data_mask_ptr) {
           const int data_mask_hw_ptr =
               (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cc b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
index 253a66adfc6a2f520c42821f90e18957c9c36668..48858fa59390efbdb1a01b69fc8453729d9800e9 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.cc
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cc
@@ -13,8 +13,8 @@
 // limitations under the License.
 
#include "paddle/phi/kernels/funcs/deformable_conv_functor.h" + #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" namespace phi { namespace funcs { @@ -82,8 +82,8 @@ inline void ModulatedDeformableIm2colCPUKernel( const T h_im = h_in + i * dilation_h + offset_h; const T w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { - val = DmcnIm2colBilinear( - data_im_ptr, width, height, width, h_im, w_im); + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); } *data_col_ptr = val; if (data_mask_ptr) { diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.cu b/paddle/phi/kernels/funcs/deformable_conv_functor.cu index 0d5076a4937c3d398cdec119df20750df24673a8..48105d1f517e9b6a083ad5093de9912d7f867918 100644 --- a/paddle/phi/kernels/funcs/deformable_conv_functor.cu +++ b/paddle/phi/kernels/funcs/deformable_conv_functor.cu @@ -14,9 +14,6 @@ #include "paddle/phi/kernels/funcs/deformable_conv_functor.h" #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/common/amp_type_traits.h" -#include "paddle/phi/common/float16.h" -#include "paddle/phi/core/device_context.h" namespace phi { namespace funcs { @@ -54,8 +51,6 @@ __global__ void ModulatedDeformableIm2colGpuKernel( T* data_col) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; - - using MT = typename phi::dtype::MPTypeTrait::Type; for (size_t i = index; i < nthreads; i += offset) { const int w_col = i % width_col; const int h_col = (i / width_col) % height_col; @@ -90,22 +85,22 @@ __global__ void ModulatedDeformableIm2colGpuKernel( ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col + w_col; - const MT offset_h = static_cast(data_offset_ptr[data_offset_h_ptr]); - const MT offset_w = static_cast(data_offset_ptr[data_offset_w_ptr]); - MT val = static_cast(0); - const MT h_im = h_in + i * dilation_h + offset_h; - const MT w_im = w_in + j * dilation_w + offset_w; + const T offset_h = data_offset_ptr[data_offset_h_ptr]; + const T offset_w = data_offset_ptr[data_offset_w_ptr]; + T val = static_cast(0); + const T h_im = h_in + i * dilation_h + offset_h; + const T w_im = w_in + j * dilation_w + offset_w; if (h_im > -1 && w_im > -1 && h_im < height && w_im < width) { - val = DmcnIm2colBilinear( - data_im_ptr, width, height, width, h_im, w_im); + val = + DmcnIm2colBilinear(data_im_ptr, width, height, width, h_im, w_im); } + *data_col_ptr = val; if (data_mask_ptr) { const int data_mask_hw_ptr = ((i * kernel_w + j) * height_col + h_col) * width_col + w_col; - const MT mask = static_cast(data_mask_ptr[data_mask_hw_ptr]); - val *= mask; + const T mask = data_mask_ptr[data_mask_hw_ptr]; + *data_col_ptr *= mask; } - *data_col_ptr = static_cast(val); data_col_ptr += batch_size * height_col * width_col; } } @@ -169,20 +164,6 @@ template void ModulatedDeformableIm2col( const int deformable_groups, float* data_col); -template void ModulatedDeformableIm2col( - const phi::GPUContext& dev_ctx, - const phi::dtype::float16* data_im, - const phi::dtype::float16* data_offset, - const phi::dtype::float16* data_mask, - const std::vector& im_shape, - const std::vector& col_shape, - const std::vector& filter_shape, - const std::vector& paddings, - const std::vector& strides, - const std::vector& dilations, - const int deformable_groups, - phi::dtype::float16* data_col); - template void ModulatedDeformableIm2col( const phi::GPUContext& dev_ctx, const double* data_im, diff --git 
diff --git a/paddle/phi/kernels/funcs/deformable_conv_functor.h b/paddle/phi/kernels/funcs/deformable_conv_functor.h
index 62e42cd58334fea15df60a8a3524e38adca2271c..eecda72927510d79735daa8da282b72967a9eebd 100644
--- a/paddle/phi/kernels/funcs/deformable_conv_functor.h
+++ b/paddle/phi/kernels/funcs/deformable_conv_functor.h
@@ -14,47 +14,44 @@
 
 #pragma once
 
-#include "paddle/phi/common/amp_type_traits.h"
-#include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/dense_tensor.h"
 
 namespace phi {
 namespace funcs {
 
-template <typename T, typename MT>
-HOSTDEVICE MT DmcnIm2colBilinear(const T* bottom_data,
-                                 const int data_width,
-                                 const int height,
-                                 const int width,
-                                 MT h,
-                                 MT w) {
+template <typename T>
+HOSTDEVICE T DmcnIm2colBilinear(const T* bottom_data,
+                                const int data_width,
+                                const int height,
+                                const int width,
+                                T h,
+                                T w) {
   int h_low = floor(h);
   int w_low = floor(w);
   int h_high = h_low + 1;
   int w_high = w_low + 1;
 
-  MT lh = h - h_low;
-  MT lw = w - w_low;
-  MT hh = 1 - lh;
-  MT hw = 1 - lw;
+  T lh = h - h_low;
+  T lw = w - w_low;
+  T hh = 1 - lh;
+  T hw = 1 - lw;
 
-  MT v1 = (h_low >= 0 && w_low >= 0)
-              ? static_cast<MT>(bottom_data[h_low * data_width + w_low])
-              : 0;
-  MT v2 = (h_low >= 0 && w_high <= width - 1)
-              ? static_cast<MT>(bottom_data[h_low * data_width + w_high])
-              : 0;
-  MT v3 = (h_high <= height - 1 && w_low >= 0)
-              ? static_cast<MT>(bottom_data[h_high * data_width + w_low])
-              : 0;
-  MT v4 = (h_high <= height - 1 && w_high <= width - 1)
-              ? static_cast<MT>(bottom_data[h_high * data_width + w_high])
-              : 0;
+  T v1 =
+      (h_low >= 0 && w_low >= 0) ? bottom_data[h_low * data_width + w_low] : 0;
+  T v2 = (h_low >= 0 && w_high <= width - 1)
+             ? bottom_data[h_low * data_width + w_high]
+             : 0;
+  T v3 = (h_high <= height - 1 && w_low >= 0)
+             ? bottom_data[h_high * data_width + w_low]
+             : 0;
+  T v4 = (h_high <= height - 1 && w_high <= width - 1)
+             ? bottom_data[h_high * data_width + w_high]
+             : 0;
 
-  MT w1 = hh * hw;
-  MT w2 = hh * lw;
-  MT w3 = lh * hw;
-  MT w4 = lh * lw;
+  T w1 = hh * hw;
+  T w2 = hh * lw;
+  T w3 = lh * hw;
+  T w4 = lh * lw;
 
   return w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4;
 }
diff --git a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
index 5d2f4727c53e25f8ee50f9870f1808773e51c8e8..b46f1f4a3314dc4f6666b9e02fd3c7ea88e8bd50 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_grad_kernel.cu
@@ -29,7 +29,7 @@ static inline int NumBlocks(const int N) {
                   kNumMaximumNumBlocks);
 }
 
-template <typename T, typename MT>
+template <typename T>
 __global__ void ModulatedDeformableCol2imGpuKernel(
     const int nthreads,
     const T* data_col,
@@ -51,10 +51,9 @@ __global__ void ModulatedDeformableCol2imGpuKernel(
     const int deformable_group,
     const int height_col,
    const int width_col,
-    MT* grad_im) {
+    T* grad_im) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int offset = blockDim.x * gridDim.x;
-  // using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (size_t thread = index; thread < nthreads; thread += offset) {
     const int j = (thread / width_col / height_col / batch_size) % kernel_w;
     const int i =
@@ -79,17 +78,17 @@
         ((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col + w_out;
     const int data_mask_hw_ptr =
         ((i * kernel_w + j) * height_col + h_out) * width_col + w_out;
-    const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
-    const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
-    const MT cur_inv_h_data = h_in + i * dilation_h + offset_h;
-    const MT cur_inv_w_data = w_in + j * dilation_w + offset_w;
+    const T offset_h = data_offset_ptr[data_offset_h_ptr];
+    const T offset_w = data_offset_ptr[data_offset_w_ptr];
+    const T cur_inv_h_data = h_in + i * dilation_h + offset_h;
+    const T cur_inv_w_data = w_in + j * dilation_w + offset_w;
 
-    MT cur_top_grad = static_cast<MT>(data_col[thread]);
+    T cur_top_grad = data_col[thread];
     if (data_mask) {
       const T* data_mask_ptr =
           data_mask + (b * deformable_group + deformable_group_index) *
                           kernel_h * kernel_w * height_col * width_col;
-      const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
+      const T mask = data_mask_ptr[data_mask_hw_ptr];
       cur_top_grad *= mask;
     }
     const int cur_h = static_cast<int>(cur_inv_h_data);
@@ -101,12 +100,13 @@
             abs(cur_inv_w_data - (cur_w + dx)) < 1) {
           int cur_bottom_grad_pos =
               ((b * channels + c) * height + cur_h + dy) * width + cur_w + dx;
-          MT weight = DmcnGetGradientWeight(cur_inv_h_data,
-                                            cur_inv_w_data,
-                                            cur_h + dy,
-                                            cur_w + dx,
-                                            height,
-                                            width);
+          T weight = DmcnGetGradientWeight(cur_inv_h_data,
+                                           cur_inv_w_data,
+                                           cur_h + dy,
+                                           cur_w + dx,
+                                           height,
+                                           width);
+
           paddle::platform::CudaAtomicAdd(grad_im + cur_bottom_grad_pos,
                                           weight * cur_top_grad);
         }
@@ -115,7 +115,7 @@
     }
   }
 }
-template <typename T, typename MT, typename Context>
+template <typename T, typename Context>
 void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const T* data_col,
                                const T* data_offset,
@@ -127,13 +127,13 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const std::vector<int>& stride,
                                const std::vector<int>& dilation,
                                const int deformable_group,
-                               MT* grad_im) {
+                               T* grad_im) {
   int channel_per_deformable_group = im_shape[0] / deformable_group;
   int num_kernels = col_shape[0] * col_shape[1] * col_shape[2] * col_shape[3];
 
   int blocks = NumBlocks(num_kernels);
   int threads = kNumCUDAThreads;
 
-  ModulatedDeformableCol2imGpuKernel<T, MT>
+  ModulatedDeformableCol2imGpuKernel<T>
       <<<blocks, threads, 0, dev_ctx.stream()>>>(num_kernels,
                                                  data_col,
                                                  data_offset,
@@ -185,9 +185,8 @@ __global__ void ModulatedDeformableCol2imCoordGpuKernel(
     T* grad_mask) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int offset = blockDim.x * gridDim.x;
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
   for (size_t i = index; i < nthreads; i += offset) {
-    MT val = 0, mval = 0;
+    T val = 0, mval = 0;
     const int w = i % width_col;
     const int h = (i / width_col) % height_col;
     const int c = (i / width_col / height_col) % offset_channels;
@@ -232,42 +231,40 @@
        const int data_offset_w_ptr =
            (((2 * (i * kernel_w + j) + 1) * height_col + h_out) * width_col +
             w_out);
-
-        const MT offset_h = static_cast<MT>(data_offset_ptr[data_offset_h_ptr]);
-        const MT offset_w = static_cast<MT>(data_offset_ptr[data_offset_w_ptr]);
-        MT inv_h = h_in + i * dilation_h + offset_h;
-        MT inv_w = w_in + j * dilation_w + offset_w;
+        const T offset_h = data_offset_ptr[data_offset_h_ptr];
+        const T offset_w = data_offset_ptr[data_offset_w_ptr];
+        T inv_h = h_in + i * dilation_h + offset_h;
+        T inv_w = w_in + j * dilation_w + offset_w;
         if (inv_h <= -1 || inv_w <= -1 || inv_h >= height || inv_w >= width) {
           inv_h = inv_w = -2;
         } else {
-          mval +=
-              static_cast<MT>(data_col_ptr[col_pos]) *
-              funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
-                                        width,
-                                        height,
-                                        width,
-                                        inv_h,
-                                        inv_w);
+          mval += data_col_ptr[col_pos] *
+                  funcs::DmcnIm2colBilinear(data_im_ptr + cnt * height * width,
+                                            width,
+                                            height,
+                                            width,
+                                            inv_h,
+                                            inv_w);
         }
-        const MT weight =
-            DmcnGetCoordinateWeight(inv_h,
-                                    inv_w,
-                                    height,
-                                    width,
-                                    data_im_ptr + cnt * height * width,
-                                    width,
-                                    bp_dir);
+        const T weight =
+            DmcnGetCoordinateWeight(inv_h,
+                                    inv_w,
+                                    height,
+                                    width,
+                                    data_im_ptr + cnt * height * width,
+                                    width,
+                                    bp_dir);
         if (data_mask_ptr) {
           const int data_mask_hw_ptr =
               (((i * kernel_w + j) * height_col + h_out) * width_col + w_out);
-          const MT mask = static_cast<MT>(data_mask_ptr[data_mask_hw_ptr]);
-          val += weight * static_cast<MT>(data_col_ptr[col_pos]) * mask;
+          const T mask = data_mask_ptr[data_mask_hw_ptr];
+          val += weight * data_col_ptr[col_pos] * mask;
         } else {
-          val += weight * static_cast<MT>(data_col_ptr[col_pos]);
+          val += weight * data_col_ptr[col_pos];
         }
         cnt += 1;
       }
-    grad_offset[i] = static_cast<T>(val);
+    grad_offset[i] = val;
     if (grad_mask && offset_c % 2 == 0)
       grad_mask[(((b * deformable_group + deformable_group_index) * kernel_h *
                       kernel_w +
@@ -362,5 +359,4 @@ PD_REGISTER_KERNEL(deformable_conv_grad,
                    ALL_LAYOUT,
                    phi::DeformableConvGradKernel,
                    float,
-                   double,
-                   paddle::platform::float16) {}
+                   double) {}
diff --git a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
index 021791ca9306130794229c9aa9e7a1fbe83bbcfd..2476dcbafb984856bee71f9a840ac6d45ba1b369 100644
--- a/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
+++ b/paddle/phi/kernels/gpu/deformable_conv_kernel.cu
@@ -23,5 +23,4 @@ PD_REGISTER_KERNEL(deformable_conv,
                    ALL_LAYOUT,
                    phi::DeformableConvKernel,
                    float,
-                   double,
-                   phi::dtype::float16) {}
+                   double) {}
diff --git a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
index 7402a2273365b0f8c3230fd4c0fccdb8957e66a6..744c48b2bfbd61b436359079c1b1c86405728c26 100644
--- a/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_grad_kernel_impl.h
@@ -14,10 +14,8 @@
 
 #pragma once
 
-#include "paddle/phi/common/amp_type_traits.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/hostdevice.h" -#include "paddle/phi/kernels/cast_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/full_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" @@ -60,14 +58,14 @@ HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, return weight; } -template -HOSTDEVICE MT DmcnGetCoordinateWeight(MT argmax_h, - MT argmax_w, - const int height, - const int width, - const T* im_data, - const int data_width, - const int bp_dir) { +template +HOSTDEVICE T DmcnGetCoordinateWeight(T argmax_h, + T argmax_w, + const int height, + const int width, + const T* im_data, + const int data_width, + const int bp_dir) { if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 || argmax_w >= width) { return 0; @@ -78,51 +76,43 @@ HOSTDEVICE MT DmcnGetCoordinateWeight(MT argmax_h, int argmax_h_high = argmax_h_low + 1; int argmax_w_high = argmax_w_low + 1; - MT weight = 0; + T weight = 0; if (bp_dir == 0) { weight += (argmax_h_low >= 0 && argmax_w_low >= 0) ? -1 * (argmax_w_low + 1 - argmax_w) * - static_cast( - im_data[argmax_h_low * data_width + argmax_w_low]) + im_data[argmax_h_low * data_width + argmax_w_low] : 0; weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) ? -1 * (argmax_w - argmax_w_low) * - static_cast( - im_data[argmax_h_low * data_width + argmax_w_high]) + im_data[argmax_h_low * data_width + argmax_w_high] : 0; weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) ? (argmax_w_low + 1 - argmax_w) * - static_cast( - im_data[argmax_h_high * data_width + argmax_w_low]) + im_data[argmax_h_high * data_width + argmax_w_low] : 0; weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) ? (argmax_w - argmax_w_low) * - static_cast( - im_data[argmax_h_high * data_width + argmax_w_high]) + im_data[argmax_h_high * data_width + argmax_w_high] : 0; } else if (bp_dir == 1) { weight += (argmax_h_low >= 0 && argmax_w_low >= 0) ? -1 * (argmax_h_low + 1 - argmax_h) * - static_cast( - im_data[argmax_h_low * data_width + argmax_w_low]) + im_data[argmax_h_low * data_width + argmax_w_low] : 0; weight += (argmax_h_low >= 0 && argmax_w_high <= width - 1) ? (argmax_h_low + 1 - argmax_h) * - static_cast( - im_data[argmax_h_low * data_width + argmax_w_high]) + im_data[argmax_h_low * data_width + argmax_w_high] : 0; weight += (argmax_h_high <= height - 1 && argmax_w_low >= 0) ? -1 * (argmax_h - argmax_h_low) * - static_cast( - im_data[argmax_h_high * data_width + argmax_w_low]) + im_data[argmax_h_high * data_width + argmax_w_low] : 0; weight += (argmax_h_high <= height - 1 && argmax_w_high <= width - 1) ? 
                   ? (argmax_h - argmax_h_low) *
-                        static_cast<MT>(
-                            im_data[argmax_h_high * data_width + argmax_w_high])
+                        im_data[argmax_h_high * data_width + argmax_w_high]
                   : 0;
   }
@@ -145,7 +135,7 @@ void ModulatedDeformableCol2imCoord(const Context& dev_ctx,
                                     T* grad_offset,
                                     T* grad_mask);
 
-template <typename T, typename MT, typename Context>
+template <typename T, typename Context>
 void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const T* data_col,
                                const T* data_offset,
@@ -157,7 +147,7 @@ void ModulatedDeformableCol2im(const Context& dev_ctx,
                                const std::vector<int>& stride,
                                const std::vector<int>& dilation,
                                const int deformable_group,
-                               MT* grad_im);
+                               T* grad_im);
 
 template <typename T, typename Context>
 void FilterGradAddup(const Context& dev_ctx,
@@ -186,7 +176,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
                               DenseTensor* filter_grad,
                               DenseTensor* mask_grad) {
   const int batch_size = static_cast<int>(x.dims()[0]);
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+
   DDim input_shape = phi::slice_ddim(x.dims(), 1, x.dims().size());
   std::vector<int64_t> input_shape_vec = phi::vectorize(input_shape);
   std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
@@ -302,8 +292,8 @@ void DeformableConvGradKernel(const Context& dev_ctx,
                                       mask_grad_data_ptr);
     }
     if (dx) {
-      MT* mt_dx_ptr = dev_ctx.template Alloc<MT>(dx);
-
+      T* dx_ptr = dx->data<T>();
+      // get grad of input
       ModulatedDeformableCol2im(dev_ctx,
                                 col_buffer_ptr,
                                 offset_ptr + i * im2col_step * input_offset_dim,
@@ -315,7 +305,7 @@ void DeformableConvGradKernel(const Context& dev_ctx,
                                 strides,
                                 dilations,
                                 deformable_groups,
-                                mt_dx_ptr + i * im2col_step * input_dim);
+                                dx_ptr + i * im2col_step * input_dim);
       dx->Resize(x.dims());
     }
diff --git a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
index d66f4e58e5b61ceca83624bf88326dfde365a616..f864c2e5f0ed0ca5caef3ea51f2c8ee1c563e2c3 100644
--- a/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
+++ b/paddle/phi/kernels/impl/deformable_conv_kernel_impl.h
@@ -14,13 +14,11 @@
 
 #pragma once
 
-#include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/empty_kernel.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/deformable_conv_functor.h"
-#include "paddle/phi/kernels/transpose_kernel.h"
 #include "paddle/utils/optional.h"
 
 namespace phi {
@@ -40,12 +38,6 @@ void DeformableConvKernel(const Context& dev_ctx,
                           DenseTensor* out) {
   const int batch_size = static_cast<int>(x.dims()[0]);
 
-  int temp_step = std::min(64, batch_size);
-  if (batch_size % temp_step == 0) {
-    im2col_step = temp_step;
-  }
-
-  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
 
   std::vector<int64_t> filter_shape_vec(phi::vectorize(filter.dims()));
   std::vector<int64_t> output_shape_vec(phi::vectorize(out->dims()));
@@ -109,11 +101,8 @@ void DeformableConvKernel(const Context& dev_ctx,
                                      dilations,
                                      deformable_groups,
                                      col_buffer_ptr);
-    DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize(phi::slice_ddim(
-        output_4d.dims(),
-        1,
-        output_4d.dims().size()));  // group * C/group * (im2step * H * W)
-
+    DenseTensor output_3d = output_4d.Slice(i, i + 1).Resize(
+        phi::slice_ddim(output_4d.dims(), 1, output_4d.dims().size()));
     // get the product of pixel and weight
     for (int g = 0; g < groups; ++g) {
      DenseTensor weight_3d_slice = weight_3d.Slice(g, g + 1).Resize(
@@ -121,11 +110,8 @@ void DeformableConvKernel(const Context& dev_ctx,
       DenseTensor col_buffer_3d_slice =
           col_buffer_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
              col_buffer_3d.dims(), 1, col_buffer_3d.dims().size()));
-      DenseTensor output_3d_slice =
-          output_3d.Slice(g, g + 1).Resize(phi::slice_ddim(
-              output_3d.dims(),
-              1,
-              output_3d.dims().size()));  // C * ((im2col_step)*H*W))
+      DenseTensor output_3d_slice = output_3d.Slice(g, g + 1).Resize(
+          phi::slice_ddim(output_3d.dims(), 1, output_3d.dims().size()));
       blas.MatMul(weight_3d_slice,
                   false,
                   col_buffer_3d_slice,
@@ -135,29 +121,7 @@ void DeformableConvKernel(const Context& dev_ctx,
                   T(0.0));
     }
   }
-
-  // swap axis to get the right result when im2col_step is greater than 1
-  if (im2col_step > 1) {
-    std::vector<int> axis(4);
-    axis[0] = 0;
-    axis[1] = 2;
-    axis[2] = 1;
-    axis[3] = 3;
-
-    DenseTensor real_output_buffer = phi::Transpose<T, Context>(
-        dev_ctx,
-        output_4d.Resize(
-            phi::make_ddim({batch_size / im2col_step,
-                            output_shape_vec[1],
-                            im2col_step,
-                            output_shape_vec[2] * output_shape_vec[3]})),
-        axis);
-
-    out->ShareDataWith(real_output_buffer)
-        .Resize(phi::make_ddim(output_shape_vec));
-  } else {
-    out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec));
-  }
+  out->ShareDataWith(output_buffer).Resize(phi::make_ddim(output_shape_vec));
 }
 
 }  // namespace phi
diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
index 0e149b171048ec325dc09f20ac97eac181ed3f73..eacf6dba27312bdcd86c03560d04d192b742144a 100644
--- a/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
+++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_v1_op.py
@@ -19,8 +19,6 @@ import paddle.fluid as fluid
 from op_test import OpTest
 from paddle.fluid.framework import _test_eager_guard
 
-paddle.enable_static()
-
 
 def dmc_bilinear(data_im, height, width, h, w):
     h_low = int(np.floor(h))
@@ -60,8 +58,8 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param):
     assert f_c * group == in_c
     assert np.mod(out_c, group) == 0
 
-    stride, pad, dilation = conv_param['stride'], conv_param['pad'], \
-                            conv_param['dilation']
+    stride, pad, dilation = conv_param['stride'], conv_param['pad'],\
+                            conv_param['dilation']
     out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) // stride[0]
     out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) // stride[1]
     assert out_h == in_h
@@ -75,18 +73,18 @@ def dconv_im2col_gemm(input, offset, filter, group, conv_param):
                 for kh in range(f_h):
                     for kw in range(f_w):
                         offset_h_table = \
-                            offset[n, ::2, h, w].reshape(f_h, f_w)
+                            offset[n, ::2, h, w].reshape(f_h, f_w)
                         offset_w_table = \
-                            offset[n, 1::2, h, w].reshape(f_h, f_w)
+                            offset[n, 1::2, h, w].reshape(f_h, f_w)
                         offset_h = offset_h_table[kh, kw]
                         offset_w = offset_w_table[kh, kw]
                         val = 0
                         im_h = h * stride[0] + kh * dilation[0] \
-                            + offset_h - pad[0]
+                            + offset_h - pad[0]
                         im_w = w * stride[0] + kw * dilation[0] \
-                            + offset_w - pad[1]
+                            + offset_w - pad[1]
                         if im_h > -1 and im_w > -1 and \
-                                im_h < in_h and im_w < in_h:
+                                im_h < in_h and im_w < in_h:
                             val = dmc_bilinear(input[n, c], in_h, in_w,
                                                im_h, im_w)
                         val_out = val
@@ -285,69 +283,6 @@ class TestWithDouble(TestModulatedDeformableConvOp):
         self.dtype = np.float64
 
 
-class TestFP16(unittest.TestCase):
-
-    def check_main(self, input_np, offset_np, filter_np, dtype):
-        paddle.disable_static()
-        input_np = input_np.astype(dtype)
-        offset_np = offset_np.astype(dtype)
-        filter_np = filter_np.astype(dtype)
-
-        input = paddle.to_tensor(input_np)
-        offset = paddle.to_tensor(offset_np)
-        filter = paddle.to_tensor(filter_np)
-
-        input.stop_gradient = False
-        offset.stop_gradient = False
-        filter.stop_gradient = False
-
-        y = paddle.vision.ops.deform_conv2d(input, offset, filter)
-        input_grad, offset_grad, filter_grad = paddle.grad(
-            y, [input, offset, filter])
-        y_np = y.numpy().astype('float32')
-        input_grad_np = input_grad.numpy().astype('float32')
-        offset_grad_np = offset_grad.numpy().astype('float32')
-        filter_grad_np = filter_grad.numpy().astype('float32')
-        paddle.enable_static()
-        return y_np, input_grad_np, offset_grad_np, filter_grad_np
-
-    def test_main(self):
-        if not paddle.is_compiled_with_cuda():
-            return
-        self.pad = [1, 1]
-        self.stride = [1, 1]
-        self.dilations = [1, 1]
-        self.groups = 1
-        self.input_size = [2, 3, 5, 5]  # NCHW
-        assert np.mod(self.input_size[1], self.groups) == 0
-        f_c = self.input_size[1] // self.groups
-        self.filter_size = [40, f_c, 1, 1]
-        self.im2col_step = 1
-        self.deformable_groups = 1
-        offset_c = 2 * self.deformable_groups * self.filter_size[
-            2] * self.filter_size[3]
-        self.offset_size = [
-            self.input_size[0], offset_c, self.input_size[2], self.input_size[3]
-        ]
-
-        input = np.random.random(self.input_size)
-        offset = 10 * np.random.random(self.offset_size)
-        filter = np.random.random(self.filter_size)
-
-        y_np_1, input_g_np_1, offset_g_np_1, filter_g_np_1 = self.check_main(
-            input, offset, filter, 'float16')
-        y_np_2, input_g_np_2, offset_g_np_2, filter_g_np_2 = self.check_main(
-            input, offset, filter, 'float32')
-
-        def assert_equal(x, y):
-            np.testing.assert_allclose(x, y, atol=3e-2)
-
-        assert_equal(y_np_1, y_np_2)
-        assert_equal(input_g_np_1, input_g_np_2)
-        assert_equal(offset_g_np_1, offset_g_np_2)
-        assert_equal(filter_g_np_1, filter_g_np_2)
-
-
 class TestModulatedDeformableConvV1InvalidInput(unittest.TestCase):
 
     def test_error(self):