diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index e10ae8254a79e7beb2737328885f38053a9bb961..b195ed1aefadc1c8bceee9ff450be37c2af9e9ec 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -162,7 +162,7 @@ cc_library(context_pool SRCS context_pool.cc DEPS phi_context phi_enforce place) cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory context_pool) cc_library(api_gen_utils SRCS api_gen_utils.cc DEPS phi_tensor_raw selected_rows sparse_csr_tensor sparse_coo_tensor) -cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) +cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel copy_kernel tensor) cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform) cc_library(sparse_api_custom_impl SRCS sparse_api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform tensor_copy) diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 58827a98503ceb7505703f9b3c1baa3628ed97f9..b00311061c9d0711caa68a3e73545f15b04143f4 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -15,12 +15,14 @@ limitations under the License. 
*/ #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/all_context.h" +#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/transfer_layout_kernel.h" -#include "paddle/fluid/framework/data_device_transform.h" +#include "paddle/fluid/framework/tensor_util.h" namespace paddle { namespace experimental { @@ -139,9 +141,8 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor, VLOG(3) << "DataTypeTransform src_dtype: " << tensor.dtype() << " dst_dtype: " << dtype; - phi::DenseTensor out( - phi::make_intrusive(tensor.place()), - {dtype, tensor.dims(), tensor.layout()}); + DefaultAllocator alloc(tensor.place()); + phi::DenseTensor out(&alloc, {dtype, tensor.dims(), tensor.layout()}); if (platform::is_cpu_place(tensor.place())) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); @@ -158,6 +159,51 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor, return out; } +inline phi::DenseTensor TransDataPlace(const phi::DenseTensor& tensor, + Place dst_place) { + VLOG(3) << "DeviceTransform in, src_place " << tensor.place() + << " dst_place: " << dst_place; + + DefaultAllocator alloc(dst_place); + phi::DenseTensor out(&alloc, + {tensor.dtype(), tensor.dims(), tensor.layout()}); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + auto& pool = paddle::platform::DeviceContextPool::Instance(); + // NOTE(yy): TransDataPlace should wait for computation of input. 
+ if (!platform::is_cuda_pinned_place(tensor.place())) { + pool.Get(tensor.place())->Wait(); + pool.Get(dst_place)->Wait(); + } else if (platform::is_gpu_place(dst_place)) { + auto* dev_ctx = static_cast(pool.Get(dst_place)); + phi::Copy(*dev_ctx, tensor, dst_place, false, &out); + + // Note: This is an empty callback, the only way is to "reference" + // tensor, so it will not be destructed until the kernels launched at + // current + // stream of given place is finished. + auto callback = [tensor, dst_place]() { + VLOG(4) << "Run callback of tensor:" << &tensor << " at place " + << dst_place; + }; + dev_ctx->AddStreamCallback(callback); + return out; + } +#endif + + // FIXME(zcd): TransDataPlace is used to transform data from GPU to CPU and + // the enforced checks have been done in GetDeviceContext, so the + // `dev_ctx->Wait()` is necessary. But `dev_ctx->Wait()` will make the program + // slow, especially when the number of elements is small, for example, + // the elements of learning rate are one and it's CPU side. + // One solution is to use a CUDA kernel to complete the copy operation when + // the transforming is from CPU to GPU and the number of elements is small. + // But the embarrassment is that this solution makes training + // slower. + paddle::framework::TensorCopySync(tensor, dst_place, &out); + return out; +} + phi::DenseTensor TransformData(const phi::DenseTensor& tensor, const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag) { @@ -174,10 +220,7 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor, if (NeedTransformPlace( out.place(), target_args_def.backend, transform_flag)) { - phi::DenseTensor result; - framework::TransDataDevice( - out, phi::TransToPhiPlace(target_args_def.backend), &result); - out = result; + out = TransDataPlace(out, phi::TransToPhiPlace(target_args_def.backend)); } return out; }