diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt
index e10ae8254a79e7beb2737328885f38053a9bb961..b195ed1aefadc1c8bceee9ff450be37c2af9e9ec 100644
--- a/paddle/phi/api/lib/CMakeLists.txt
+++ b/paddle/phi/api/lib/CMakeLists.txt
@@ -162,7 +162,7 @@ cc_library(context_pool SRCS context_pool.cc DEPS phi_context phi_enforce place)
 
 cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory context_pool)
 cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor)
-cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform)
+cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel copy_kernel tensor)
 cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform)
 cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform tensor_copy)
 
diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc
index 58827a98503ceb7505703f9b3c1baa3628ed97f9..b00311061c9d0711caa68a3e73545f15b04143f4 100644
--- a/paddle/phi/api/lib/data_transform.cc
+++ b/paddle/phi/api/lib/data_transform.cc
@@ -15,12 +15,14 @@ limitations under the License. */
 #include "paddle/phi/api/lib/data_transform.h"
 
 #include "paddle/phi/api/lib/kernel_dispatch.h"
-#include "paddle/phi/api/lib/utils/storage.h"
+#include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/cast_kernel.h"
+#include "paddle/phi/kernels/copy_kernel.h"
 #include "paddle/phi/kernels/transfer_layout_kernel.h"
 
-#include "paddle/fluid/framework/data_device_transform.h"
+#include "paddle/fluid/framework/tensor_util.h"
 
 namespace paddle {
 namespace experimental {
@@ -139,9 +141,8 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor,
   VLOG(3) << "DataTypeTransform src_dtype: " << tensor.dtype()
           << " dst_dtype: " << dtype;
 
-  phi::DenseTensor out(
-      phi::make_intrusive<paddle::experimental::SharedStorage>(tensor.place()),
-      {dtype, tensor.dims(), tensor.layout()});
+  DefaultAllocator alloc(tensor.place());
+  phi::DenseTensor out(&alloc, {dtype, tensor.dims(), tensor.layout()});
 
   if (platform::is_cpu_place(tensor.place())) {
     auto* dev_ctx = static_cast<phi::CPUContext*>(pool.Get(tensor.place()));
@@ -158,6 +159,51 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor,
   return out;
 }
 
+inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor,
+                                       Place dst_place) {
+  VLOG(3) << "DeviceTransform in, src_place " << tensor.place()
+          << " dst_place: " << dst_place;
+
+  DefaultAllocator alloc(dst_place);
+  phi::DenseTensor out(&alloc,
+                       {tensor.dtype(), tensor.dims(), tensor.layout()});
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  // NOTE(yy): TransDataPlace should wait for computation of input.
+  if (!platform::is_cuda_pinned_place(tensor.place())) {
+    pool.Get(tensor.place())->Wait();
+    pool.Get(dst_place)->Wait();
+  } else if (platform::is_gpu_place(dst_place)) {
+    auto* dev_ctx = static_cast<phi::GPUContext*>(pool.Get(dst_place));
+    phi::Copy(*dev_ctx, tensor, dst_place, false, &out);
+
+    // Note: This is an empty callback, the only way is to "reference"
+    // tensor, so it will not be destructed until the kernels launched at
+    // current
+    // stream of given place is finished.
+    auto callback = [tensor, dst_place]() {
+      VLOG(4) << "Run callback of tensor:" << &tensor << " at place "
+              << dst_place;
+    };
+    dev_ctx->AddStreamCallback(callback);
+    return out;
+  }
+#endif
+
+  // FIXME(zcd): TransDataPlace is used to transform data from GPU to CPU and
+  // the enforced checkings have been done in GetDeviceContext, so the
+  // `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program
+  // slow, especially when the number of elements is little, for example,
+  // the elements of learning rate are one and it's CPU side.
+  // One solution is to use a CUDA kernel to complete the copy operation when
+  // the transforming is from CPU to GPU and the number of elements is little.
+  // But the embarrassment is that this solution this solution makes training
+  // slower.
+  paddle::framework::TensorCopySync(tensor, dst_place, &out);
+  return out;
+}
+
 phi::DenseTensor TransformData(const phi::DenseTensor& tensor,
                                const phi::TensorArgDef& target_args_def,
                                const TransformFlag& transform_flag) {
@@ -174,10 +220,7 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor,
 
   if (NeedTransformPlace(
           out.place(), target_args_def.backend, transform_flag)) {
-    phi::DenseTensor result;
-    framework::TransDataDevice(
-        out, phi::TransToPhiPlace(target_args_def.backend), &result);
-    out = result;
+    out = TransDataPlace(out, phi::TransToPhiPlace(target_args_def.backend));
   }
   return out;
 }