From 934171ae01581b1ec2928e25c6465859b651b244 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 1 Sep 2022 10:05:26 +0800 Subject: [PATCH] remove circular dependency of device_context and allocator (#45455) * refine cmake of framework * add deps for dense tensor * fix deps * remove alloc(ctx) * add depends on mkldnn --- paddle/fluid/framework/CMakeLists.txt | 13 +++- .../framework/details/nan_inf_utils_detail.cu | 6 +- paddle/fluid/framework/operator.h | 24 ++----- paddle/fluid/framework/var_type_traits.cc | 1 - paddle/fluid/memory/allocation/CMakeLists.txt | 2 +- paddle/fluid/memory/malloc.h | 2 - paddle/fluid/memory/malloc_test.cu | 10 ++- .../operators/deformable_psroi_pooling_op.cu | 10 ++- .../fluid/operators/detection/bbox_util.cu.h | 5 +- paddle/fluid/operators/dgc_op.h | 14 ++++- paddle/fluid/operators/fake_quantize_op.h | 11 ++-- paddle/fluid/operators/layer_norm_kernel.cu.h | 12 ++-- paddle/fluid/operators/mean_iou_op.cu | 5 +- paddle/fluid/operators/partial_concat_op.cu | 10 ++- paddle/fluid/operators/partial_sum_op.cu | 11 +++- paddle/fluid/operators/prroi_pool_op.cu | 10 ++- paddle/fluid/operators/quantize_linear_op.h | 5 +- paddle/fluid/operators/sum_op.cu | 11 +++- paddle/fluid/platform/device_context.cc | 62 ------------------- paddle/phi/backends/CMakeLists.txt | 2 +- paddle/phi/core/CMakeLists.txt | 5 +- .../kernels/funcs/concat_and_split_functor.cu | 21 +++++-- .../phi/kernels/funcs/elementwise_grad_base.h | 35 ++++++++--- paddle/phi/kernels/funcs/matrix_inverse.cu.cc | 15 ++++- paddle/phi/kernels/funcs/matrix_solve.cu | 10 ++- .../funcs/sparse/sparse_blas_impl.cu.h | 18 ++++-- .../kernels/funcs/values_vectors_functor.h | 10 ++- paddle/phi/kernels/gpu/add_n_kernel.cu | 6 +- paddle/phi/kernels/gpu/amp_kernel.cu | 22 +++++-- paddle/phi/kernels/gpu/box_coder.cu | 5 +- paddle/phi/kernels/gpu/cholesky_kernel.cu | 10 ++- paddle/phi/kernels/gpu/flip_kernel.cu | 15 ++++- .../gpu/generate_proposals_v2_kernel.cu | 6 +- paddle/phi/kernels/gpu/lu_kernel.cu | 5 +- .../phi/kernels/gpu/matrix_rank_tol_kernel.cu | 25 ++++++-- paddle/phi/kernels/gpu/nms_kernel.cu | 4 +- paddle/phi/kernels/gpu/randperm_kernel.cu | 5 +- .../phi/kernels/gpu/roi_align_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/roi_align_kernel.cu | 5 +- .../phi/kernels/gpu/roi_pool_grad_kernel.cu | 5 +- paddle/phi/kernels/gpu/roi_pool_kernel.cu | 5 +- paddle/phi/kernels/gpu/stack_grad_kernel.cu | 6 +- paddle/phi/kernels/gpu/stack_kernel.cu | 5 +- paddle/phi/kernels/gpu/svd_kernel.cu | 10 ++- .../phi/kernels/gpu/sync_batch_norm_kernel.cu | 5 +- .../phi/kernels/gpu/sync_batch_norm_utils.h | 5 +- .../kernels/gpu/triangular_solve_kernel.cu | 5 +- paddle/phi/kernels/gpu/yolo_box_kernel.cu | 6 +- 48 files changed, 321 insertions(+), 185 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 7a9c631941e..369dc395fae 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -73,7 +73,8 @@ cc_library( cc_library( data_type SRCS data_type.cc - DEPS framework_proto ddim device_context) + DEPS framework_proto) + cc_test( data_type_test SRCS data_type_test.cc @@ -183,7 +184,7 @@ cc_test( cc_library( var_type_traits SRCS var_type_traits.cc - DEPS selected_rows_utils framework_proto scope) + DEPS framework_proto scope) if(WITH_GPU) target_link_libraries(var_type_traits dynload_cuda) endif() @@ -364,7 +365,13 @@ cc_library( cc_library( shape_inference SRCS shape_inference.cc - DEPS ddim attribute device_context) + DEPS ddim attribute) + +# 
every source file that includes "dnnl.h" must depends on mkldnn +# or, the first one should depends on mkldnn +if(WITH_MKLDNN) + add_dependencies(shape_inference mkldnn) +endif() cc_test( no_need_buffer_vars_inference_test diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cu b/paddle/fluid/framework/details/nan_inf_utils_detail.cu index d91225a8141..4aa24f8cb6a 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cu +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cu @@ -161,8 +161,10 @@ void TensorCheckerVisitor::apply( std::lock_guard guard(op_var2gpu_str_mutex); if (op_var2gpu_str.find(op_var) == op_var2gpu_str.end()) { // insert - auto gpu_str_tensor = - paddle::memory::Alloc(*dev_ctx, op_var.length() + 1); + auto gpu_str_tensor = paddle::memory::Alloc( + dev_ctx->GetPlace(), + op_var.length() + 1, + phi::Stream(reinterpret_cast(dev_ctx->stream()))); gpu_str_ptr = reinterpret_cast(gpu_str_tensor->ptr()); op_var2gpu_str.emplace(op_var, std::move(gpu_str_tensor)); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 27ce31c25c0..43a5b7a0bb9 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -445,26 +445,10 @@ class ExecutionContext { template Tensor AllocateTmpTensor(const framework::DDim& dim, const DevContext& dev_ctx) const { - auto tmp_allocation_ptr = memory::Alloc(dev_ctx, product(dim) * sizeof(T)); - auto& deleter = tmp_allocation_ptr.get_deleter(); - auto* allocation_ptr = tmp_allocation_ptr.release(); - auto shared_allocation = - std::shared_ptr(allocation_ptr, deleter); - - PADDLE_ENFORCE_GE( - allocation_ptr->size(), - phi::product(dim) * sizeof(T), - platform::errors::PreconditionNotMet( - "The data memory size(%d) is less than the tensor needed memory " - "size(%d).", - allocation_ptr->size(), - phi::product(dim) * sizeof(T))); - - paddle::framework::Tensor temp_tensor(framework::TransToPhiDataType( - framework::ToDataType(std::type_index(typeid(T))))); - temp_tensor.Resize(dim); - temp_tensor.ResetHolder(std::move(shared_allocation)); - return temp_tensor; + phi::DenseTensor tmp; + tmp.Resize(dim); + dev_ctx.template Alloc(&tmp); + return tmp; } const RuntimeContext Context() const { return ctx_; } diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index a3231b1b8b9..6331ee8861a 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -17,7 +17,6 @@ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" #include "paddle/fluid/platform/macros.h" #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 13a405c7d3d..515524f68bd 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -1,6 +1,6 @@ include(ExternalProject) -set(ALLOCATOR_DEPS place stats profiler phi_backends) +set(ALLOCATOR_DEPS place stats profiler phi_backends device_context) set(ALLOCATOR_SRCS allocator.cc cpu_allocator.cc diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 49ced76c337..b8f5f0289c4 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -33,8 +33,6 @@ extern std::shared_ptr AllocShared(const 
platform::Place& place, extern AllocationPtr Alloc(const platform::Place& place, size_t size); -extern AllocationPtr Alloc(const phi::DeviceContext& dev_ctx, size_t size); - extern uint64_t Release(const platform::Place& place); extern std::shared_ptr AllocShared(const platform::Place& place, diff --git a/paddle/fluid/memory/malloc_test.cu b/paddle/fluid/memory/malloc_test.cu index b3308ffdd30..0bf5e99b773 100644 --- a/paddle/fluid/memory/malloc_test.cu +++ b/paddle/fluid/memory/malloc_test.cu @@ -67,7 +67,10 @@ void MultiStreamCompute(float **data, float **second_data, const phi::GPUContext &ctx) { // multi-streams - AllocationPtr allocation_ptr = Alloc(ctx, N * sizeof(float)); + AllocationPtr allocation_ptr = + Alloc(ctx.GetPlace(), + N * sizeof(float), + phi::Stream(reinterpret_cast(ctx.stream()))); EXPECT_GE(allocation_ptr->size(), N * sizeof(float)); *data = reinterpret_cast(allocation_ptr->ptr()); #ifdef PADDLE_WITH_HIP @@ -77,7 +80,10 @@ void MultiStreamCompute(float **data, #endif // allocate and compute on same stream again - allocation_ptr = Alloc(ctx, N * sizeof(float)); + allocation_ptr = + Alloc(ctx.GetPlace(), + N * sizeof(float), + phi::Stream(reinterpret_cast(ctx.stream()))); EXPECT_GE(allocation_ptr->size(), N * sizeof(float)); *second_data = reinterpret_cast(allocation_ptr->ptr()); #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index d974a60197d..2fcdebd5e82 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -266,7 +266,10 @@ class DeformablePSROIPoolCUDAKernel : public framework::OpKernel { auto& dev_ctx = ctx.cuda_device_context(); int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); + auto roi_ptr = memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = ctx.GetPlace(); memory::Copy(gplace, @@ -577,7 +580,10 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { } int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); + auto roi_ptr = memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = ctx.GetPlace(); memory::Copy(gplace, diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index 90be767e2f2..e4accef0fa9 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -301,7 +301,10 @@ static void NMS(const phi::GPUContext &ctx, const T *boxes = proposals.data(); auto place = ctx.GetPlace(); - auto mask_ptr = memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t)); + auto mask_ptr = + memory::Alloc(ctx.GetPlace(), + boxes_num * col_blocks * sizeof(uint64_t), + phi::Stream(reinterpret_cast(ctx.stream()))); uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); NMSKernel<<>>( diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index 80e8ea17c11..82e002cbb33 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -187,7 +187,19 @@ class DGCOpKernel : public framework::OpKernel { ctx.GetPlace()); int buf_size = paddle::communication::dgc::get_buffer_size(k); - auto tmp_ious_data = 
memory::Alloc(dev_ctx, buf_size); + paddle::memory::allocation::AllocationPtr tmp_ious_data; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (platform::is_gpu_place(dev_ctx.GetPlace())) { + tmp_ious_data = memory::Alloc( + dev_ctx.GetPlace(), + buf_size, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); + } +#endif + if (platform::is_cpu_place(dev_ctx.GetPlace())) { + tmp_ious_data = memory::Alloc(dev_ctx.GetPlace(), buf_size); + } + void* buf = reinterpret_cast(tmp_ious_data->ptr()); if (!paddle::communication::dgc::k_select( diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 9d0527d7104..92aaa1fb248 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -328,8 +328,10 @@ class FakeMovingAverageAbsMaxKernelBase : public framework::OpKernel { // training auto *in_accum = context.Input("InAccum"); auto *in_state = context.Input("InState"); - auto cur_scale = memory::Alloc(dev_ctx, sizeof(T)); - T *cur_scale_data = static_cast(cur_scale->ptr()); + + phi::DenseTensor tmp_scale; + tmp_scale.Resize(phi::make_dim(1)); + T *cur_scale_data = dev_ctx.template Alloc(&tmp_scale); FindAbsMaxFunctor()( dev_ctx, in->data(), in->numel(), cur_scale_data); @@ -417,8 +419,9 @@ class MovingAverageAbsMaxScaleKernel : public framework::OpKernel { // training auto *in_accum = context.Input("InAccum"); auto *in_state = context.Input("InState"); - auto cur_scale = memory::Alloc(dev_ctx, sizeof(T)); - T *cur_scale_data = static_cast(cur_scale->ptr()); + phi::DenseTensor tmp_scale; + tmp_scale.Resize(phi::make_dim(1)); + T *cur_scale_data = dev_ctx.template Alloc(&tmp_scale); FindAbsMaxFunctor()( dev_ctx, in->data(), in->numel(), cur_scale_data); diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 8ed706a5443..0c41429c61e 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -1815,10 +1815,14 @@ static void LayerNormBackward( constexpr int part_size = BDIMY2 * VPT; const dim3 blocks2((feature_size + BDIMX2 - 1) / BDIMX2, part_size, 1); - auto part_grad_gamma_ptr = - memory::Alloc(dev_ctx, part_size * feature_size * sizeof(U)); - auto part_grad_beta_ptr = - memory::Alloc(dev_ctx, part_size * feature_size * sizeof(U)); + auto part_grad_gamma_ptr = memory::Alloc( + dev_ctx.GetPlace(), + part_size * feature_size * sizeof(U), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto part_grad_beta_ptr = memory::Alloc( + dev_ctx.GetPlace(), + part_size * feature_size * sizeof(U), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); U *part_grad_gamma = reinterpret_cast(part_grad_gamma_ptr->ptr()); U *part_grad_beta = reinterpret_cast(part_grad_beta_ptr->ptr()); diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 08ab074718b..ee31607c63a 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -116,7 +116,10 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel { auto out_correct_t = EigenTensor::From(*out_correct); // Temporary memory - auto tmp_ious_data = memory::Alloc(dev_ctx, num_classes * sizeof(float)); + auto tmp_ious_data = memory::Alloc( + dev_ctx.GetPlace(), + num_classes * sizeof(float), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); float* ious_data = static_cast(tmp_ious_data->ptr()); // Init out_wrong, out_correct and out_mean_iou diff --git 
a/paddle/fluid/operators/partial_concat_op.cu b/paddle/fluid/operators/partial_concat_op.cu index f4d8f7083b0..a6b2700a1a4 100644 --- a/paddle/fluid/operators/partial_concat_op.cu +++ b/paddle/fluid/operators/partial_concat_op.cu @@ -126,7 +126,10 @@ class PartialConcatOpCUDAKernel : public framework::OpKernel { for (int i = 0; i < in_num; ++i) in_data.emplace_back(in_vars[i]->data()); - auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); + auto tmp_in_array = memory::Alloc( + dev_ctx.GetPlace(), + in_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), platform::CPUPlace(), @@ -202,7 +205,10 @@ class PartialConcatGradOpCUDAKernel : public framework::OpKernel { for (size_t i = 0; i < in_num; ++i) { out_data.emplace_back(outs[i]->data()); } - auto tmp_out_array = memory::Alloc(dev_ctx, out_data.size() * sizeof(T *)); + auto tmp_out_array = memory::Alloc( + dev_ctx.GetPlace(), + out_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); memory::Copy(dev_ctx.GetPlace(), tmp_out_array->ptr(), diff --git a/paddle/fluid/operators/partial_sum_op.cu b/paddle/fluid/operators/partial_sum_op.cu index 69517233bf3..e0703532c12 100644 --- a/paddle/fluid/operators/partial_sum_op.cu +++ b/paddle/fluid/operators/partial_sum_op.cu @@ -122,7 +122,10 @@ class PartialSumOpCUDAKernel : public framework::OpKernel { } if (!in_data.empty()) { - auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); + auto tmp_in_array = memory::Alloc( + dev_ctx.GetPlace(), + in_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), @@ -204,8 +207,10 @@ class PartialSumGradOpCUDAKernel : public framework::OpKernel { } if (!out_data.empty()) { - auto tmp_out_array = - memory::Alloc(dev_ctx, out_data.size() * sizeof(T *)); + auto tmp_out_array = memory::Alloc( + dev_ctx.GetPlace(), + out_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); memory::Copy(dev_ctx.GetPlace(), tmp_out_array->ptr(), diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index ac4666bb174..e95201c472a 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -287,7 +287,10 @@ class GPUPRROIPoolOpKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); auto& dev_ctx = ctx.cuda_device_context(); int bytes = rois_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); + auto roi_ptr = memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = ctx.GetPlace(); memory::Copy(gplace, @@ -377,7 +380,10 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel { auto cplace = platform::CPUPlace(); auto& dev_ctx = ctx.cuda_device_context(); int bytes = rois_batch_id_list.numel() * sizeof(int); - auto roi_ptr = memory::Alloc(dev_ctx, bytes); + auto roi_ptr = memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); const auto gplace = ctx.GetPlace(); memory::Copy(gplace, diff --git a/paddle/fluid/operators/quantize_linear_op.h b/paddle/fluid/operators/quantize_linear_op.h index 315ca108273..fd0579023b3 100644 --- a/paddle/fluid/operators/quantize_linear_op.h +++ 
b/paddle/fluid/operators/quantize_linear_op.h @@ -60,8 +60,9 @@ class QuantizeLinearKernel : public framework::OpKernel { // training auto* in_accum = context.Input("InAccum"); auto* in_state = context.Input("InState"); - auto cur_scale = memory::Alloc(dev_ctx, sizeof(T)); - T* cur_scale_data = static_cast(cur_scale->ptr()); + phi::DenseTensor tmp_scale; + tmp_scale.Resize(phi::make_dim(1)); + T* cur_scale_data = dev_ctx.template Alloc(&tmp_scale); FindAbsMaxFunctor()( dev_ctx, in->data(), in->numel(), cur_scale_data); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 2cc17de1820..35a1680d84d 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -200,8 +200,10 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { } } if (!sr_in_out_data.empty()) { - auto tmp_sr_in_out_array = - memory::Alloc(dev_ctx, sr_in_out_data.size() * sizeof(T *)); + auto tmp_sr_in_out_array = memory::Alloc( + dev_ctx.GetPlace(), + sr_in_out_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); memory::Copy(dev_ctx.GetPlace(), tmp_sr_in_out_array->ptr(), @@ -221,7 +223,10 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { } // if indata not null, merge into one kernel call. if (!in_data.empty()) { - auto tmp_in_array = memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); + auto tmp_in_array = memory::Alloc( + dev_ctx.GetPlace(), + in_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 98c2e92f2c3..e2fec11c190 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -39,68 +39,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/mlu/device_context_allocator.h" #endif -namespace paddle { -namespace memory { - -AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) { - auto place = dev_ctx.GetPlace(); - if (size == 0) { - return Alloc(place, size); - } - - if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto* default_dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto& desired_dev_ctx = static_cast(dev_ctx); - if (default_dev_ctx->stream() == desired_dev_ctx.stream()) { - return paddle::memory::Alloc(desired_dev_ctx.GetPlace(), - size, - phi::Stream(reinterpret_cast( - desired_dev_ctx.stream()))); - } else { - return allocation::GPUContextAllocatorPool::Instance().Alloc( - desired_dev_ctx, size); - } -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use CUDA device since it's not compiled with CUDA," - "Please recompile or reinstall Paddle with GPU support.")); -#endif - } else if (platform::is_xpu_place(place)) { -#ifdef PADDLE_WITH_XPU - // TODO(liuyuhui): Consider xpu stream later - return Alloc(place, size); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else if (platform::is_mlu_place(place)) { -#ifdef PADDLE_WITH_MLU - auto* default_dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto& desired_dev_ctx = - static_cast(dev_ctx); - if (default_dev_ctx->stream() == desired_dev_ctx.stream()) { - return Alloc(place, size); - } else { - return allocation::MLUDeviceContextAllocatorPool::Instance().Alloc( - desired_dev_ctx, size); - } -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use MLU device since it's not compiled with MLU," - "Please recompile or reinstall Paddle with MLU support.")); -#endif - } else { - return Alloc(place, size); - } -} - -} // namespace memory -} // namespace paddle - namespace paddle { namespace platform { diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 6a55c34266f..9a26aed5f34 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -1,7 +1,7 @@ add_subdirectory(dynload) set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc) -set(BACKENDS_DEPS enforce place flags eigen3) +set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context) if(WITH_GPU OR WITH_ROCM) list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 099cc87834c..e48f73694af 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -40,7 +40,10 @@ cc_library( cc_library( dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc - DEPS convert_utils fluid_convert_utils tensor_meta tensor_base) + DEPS convert_utils fluid_convert_utils tensor_meta tensor_base ddim) + +target_link_libraries(dense_tensor malloc) + cc_library( sparse_coo_tensor SRCS sparse_coo_tensor.cc diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 01701ee2873..57cf64d8df1 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -315,7 +315,10 @@ struct ConcatFunctor { paddle::memory::allocation::AllocationPtr tmp_dev_ins_data; const T** dev_ins_data = nullptr; 
if (!has_same_shape || in_num < 2 || in_num > 4) { - tmp_dev_ins_data = paddle::memory::Alloc(context, in_num * sizeof(T*)); + tmp_dev_ins_data = paddle::memory::Alloc( + context.GetPlace(), + in_num * sizeof(T*), + phi::Stream(reinterpret_cast(context.stream()))); auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( inputs_data, in_num); paddle::memory::Copy(context.GetPlace(), @@ -360,8 +363,10 @@ struct ConcatFunctor { dev_ins_data, in_num, in_col, out_row, out_col, output->data()); } } else { - auto tmp_dev_ins_col_data = - paddle::memory::Alloc(context, inputs_col_num * sizeof(int64_t)); + auto tmp_dev_ins_col_data = paddle::memory::Alloc( + context.GetPlace(), + inputs_col_num * sizeof(int64_t), + phi::Stream(reinterpret_cast(context.stream()))); auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( inputs_col, inputs_col_num); @@ -475,7 +480,10 @@ class SplitFunctor { T** dev_out_gpu_data = nullptr; if (!has_same_shape || o_num < 2 || o_num > 4) { // TODO(chentianyu03): try to find a method to remove the Alloc function - tmp_dev_outs_data = paddle::memory::Alloc(context, o_num * sizeof(T*)); + tmp_dev_outs_data = paddle::memory::Alloc( + context.GetPlace(), + o_num * sizeof(T*), + phi::Stream(reinterpret_cast(context.stream()))); auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( outputs_data, o_num); paddle::memory::Copy(context.GetPlace(), @@ -523,7 +531,10 @@ class SplitFunctor { auto tmp_dev_ins_col_data = // TODO(chentianyu03): try to find a method to remove the Alloc // function - paddle::memory::Alloc(context, outputs_cols_num * sizeof(int64_t)); + paddle::memory::Alloc( + context.GetPlace(), + outputs_cols_num * sizeof(int64_t), + phi::Stream(reinterpret_cast(context.stream()))); auto* restored = paddle::platform::RestoreHostMemIfCapturingCUDAGraph( outputs_cols, outputs_cols_num); paddle::memory::Copy(context.GetPlace(), diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index 9ca21b967a4..62889b530af 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -1524,7 +1524,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, ComputeBroadcastKernelSize( y_dims_array, out_dims_array, &y_blocks, &y_threads, max_dim); - auto x_strides_array_tmp = paddle::memory::Alloc(ctx, bytes); + auto x_strides_array_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *x_strides_array_gpu = reinterpret_cast(x_strides_array_tmp->ptr()); paddle::memory::Copy(gplace, @@ -1534,7 +1537,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, bytes, ctx.stream()); - auto y_strides_array_tmp = paddle::memory::Alloc(ctx, bytes); + auto y_strides_array_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *y_strides_array_gpu = reinterpret_cast(y_strides_array_tmp->ptr()); paddle::memory::Copy(gplace, @@ -1544,7 +1550,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, bytes, ctx.stream()); - auto out_dims_array_tmp = paddle::memory::Alloc(ctx, bytes); + auto out_dims_array_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *out_dims_array_gpu = reinterpret_cast(out_dims_array_tmp->ptr()); paddle::memory::Copy( gplace, out_dims_array_gpu, cplace, out_dims_array, bytes, ctx.stream()); @@ -1554,7 +1563,10 @@ void CommonGradBroadcastCUDA(const DenseTensor 
&x, int x_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, x_threads); int y_block_size = std::min(ELEMWISE_MAX_BLOCK_DIM, y_threads); if (dx) { - auto x_strides_order_tmp = paddle::memory::Alloc(ctx, bytes); + auto x_strides_order_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *x_strides_order_gpu = reinterpret_cast(x_strides_order_tmp->ptr()); paddle::memory::Copy(gplace, @@ -1564,7 +1576,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, bytes, ctx.stream()); - auto x_dims_order_tmp = paddle::memory::Alloc(ctx, bytes); + auto x_dims_order_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *x_dims_order_gpu = reinterpret_cast(x_dims_order_tmp->ptr()); paddle::memory::Copy(gplace, x_dims_order_gpu, @@ -1589,7 +1604,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, dx_op); } if (dy) { - auto y_strides_order_tmp = paddle::memory::Alloc(ctx, bytes); + auto y_strides_order_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *y_strides_order_gpu = reinterpret_cast(y_strides_order_tmp->ptr()); paddle::memory::Copy(gplace, @@ -1599,7 +1617,10 @@ void CommonGradBroadcastCUDA(const DenseTensor &x, bytes, ctx.stream()); - auto y_dims_order_tmp = paddle::memory::Alloc(ctx, bytes); + auto y_dims_order_tmp = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); int *y_dims_order_gpu = reinterpret_cast(y_dims_order_tmp->ptr()); paddle::memory::Copy(gplace, y_dims_order_gpu, diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc index eb9434396cc..c43c3c04755 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cu.cc +++ b/paddle/phi/kernels/funcs/matrix_inverse.cu.cc @@ -36,7 +36,10 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, if (n >= 32) { // Copy all elements of input matrix A to a temporary memory space to // avoid being overriden by getrf. - tmp_gpu_mat_data = paddle::memory::Alloc(dev_ctx, a.numel() * sizeof(T)); + tmp_gpu_mat_data = paddle::memory::Alloc( + dev_ctx.GetPlace(), + a.numel() * sizeof(T), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_gpu_mat_data->ptr(), dev_ctx.GetPlace(), @@ -54,7 +57,10 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, // Copy the addresses of A and A_inv from host to device. paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - paddle::memory::Alloc(dev_ctx, cpu_ptrs.size() * sizeof(T*)); + paddle::memory::Alloc( + dev_ctx.GetPlace(), + cpu_ptrs.size() * sizeof(T*), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_gpu_ptrs_data->ptr(), phi::CPUPlace(), @@ -67,7 +73,10 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, // Allocate device memory for info and pivots. int num_ints = n < 32 ? 
batch_size : batch_size * (n + 1); paddle::memory::allocation::AllocationPtr tmp_gpu_info_data = - paddle::memory::Alloc(dev_ctx, num_ints * sizeof(int)); + paddle::memory::Alloc( + dev_ctx.GetPlace(), + num_ints * sizeof(int), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); auto blas = phi::funcs::GetBlas(dev_ctx); diff --git a/paddle/phi/kernels/funcs/matrix_solve.cu b/paddle/phi/kernels/funcs/matrix_solve.cu index 004375bc240..e2f344c9890 100644 --- a/paddle/phi/kernels/funcs/matrix_solve.cu +++ b/paddle/phi/kernels/funcs/matrix_solve.cu @@ -80,7 +80,10 @@ void MatrixSolveFunctor::operator()(const Context& context, // Copy the addresses of A and tmp_b from host to device. paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - paddle::memory::Alloc(context, cpu_ptrs.size() * sizeof(T*)); + paddle::memory::Alloc( + context.GetPlace(), + cpu_ptrs.size() * sizeof(T*), + phi::Stream(reinterpret_cast(context.stream()))); paddle::memory::Copy(context.GetPlace(), tmp_gpu_ptrs_data->ptr(), phi::CPUPlace(), @@ -94,7 +97,10 @@ void MatrixSolveFunctor::operator()(const Context& context, // Allocate device memory for BatchedGETRF's info and pivots. int num_ints = n < 32 ? batch_size : batch_size * (n + 1); paddle::memory::allocation::AllocationPtr tmp_gpu_info_data = - paddle::memory::Alloc(context, num_ints * sizeof(int)); + paddle::memory::Alloc( + context.GetPlace(), + num_ints * sizeof(int), + phi::Stream(reinterpret_cast(context.stream()))); int* gpu_info_ptr = reinterpret_cast(tmp_gpu_info_data->ptr()); auto blas = phi::funcs::GetBlas(context); diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h index 9ced02fcb69..738f9280267 100644 --- a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h +++ b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h @@ -337,8 +337,10 @@ void SparseBlas::SPMM(bool transa, &buffer_size); }); - paddle::memory::allocation::AllocationPtr tmp_buffer = - paddle::memory::Alloc(dev_ctx_, buffer_size); + paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc( + dev_ctx_.GetPlace(), + buffer_size, + phi::Stream(reinterpret_cast(dev_ctx_.stream()))); void* tmp_buffer_ptr = tmp_buffer->ptr(); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpMM(handle, @@ -383,8 +385,10 @@ void SparseBlas::SPMV(bool transa, &buffer_size); }); - paddle::memory::allocation::AllocationPtr tmp_buffer = - paddle::memory::Alloc(dev_ctx_, buffer_size); + paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc( + dev_ctx_.GetPlace(), + buffer_size, + phi::Stream(reinterpret_cast(dev_ctx_.stream()))); void* tmp_buffer_ptr = tmp_buffer->ptr(); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { phi::dynload::cusparseSpMV(handle, @@ -431,8 +435,10 @@ void SparseBlas::SDDMM(bool transa, &buffer_size); }); - paddle::memory::allocation::AllocationPtr tmp_buffer = - paddle::memory::Alloc(dev_ctx_, buffer_size); + paddle::memory::allocation::AllocationPtr tmp_buffer = paddle::memory::Alloc( + dev_ctx_.GetPlace(), + buffer_size, + phi::Stream(reinterpret_cast(dev_ctx_.stream()))); void* tmp_buffer_ptr = tmp_buffer->ptr(); dev_ctx_.CusparseCall([&](cusparseHandle_t handle) { diff --git a/paddle/phi/kernels/funcs/values_vectors_functor.h b/paddle/phi/kernels/funcs/values_vectors_functor.h index a6a6d409703..88bef61fa92 100644 --- a/paddle/phi/kernels/funcs/values_vectors_functor.h +++ 
b/paddle/phi/kernels/funcs/values_vectors_functor.h @@ -223,7 +223,10 @@ struct MatrixEighFunctor { has_vectors ? CUSOLVER_EIG_MODE_VECTOR : CUSOLVER_EIG_MODE_NOVECTOR; ValueType *out_value = dev_ctx.template Alloc(eigen_values); - auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_size); + auto info = paddle::memory::Alloc( + dev_ctx.GetPlace(), + sizeof(int) * batch_size, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); auto *info_ptr = reinterpret_cast(info->ptr()); DenseTensor input_trans = phi::TransposeLast2Dim(dev_ctx, input); @@ -260,7 +263,10 @@ struct MatrixEighFunctor { out_value, &workspace_size); } - auto work = paddle::memory::Alloc(dev_ctx, sizeof(T) * workspace_size); + auto work = paddle::memory::Alloc( + dev_ctx.GetPlace(), + sizeof(T) * workspace_size, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); auto *work_ptr = reinterpret_cast(work->ptr()); for (auto i = 0; i < batch_size; ++i) { diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu index d165edaaed7..981ec24dac6 100644 --- a/paddle/phi/kernels/gpu/add_n_kernel.cu +++ b/paddle/phi/kernels/gpu/add_n_kernel.cu @@ -122,8 +122,10 @@ void AddNKernel(const Context &dev_ctx, // if indata not null, merge into one kernel call. if (!in_data.empty()) { - auto tmp_in_array = - paddle::memory::Alloc(dev_ctx, in_data.size() * sizeof(T *)); + auto tmp_in_array = paddle::memory::Alloc( + dev_ctx.GetPlace(), + in_data.size() * sizeof(T *), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), diff --git a/paddle/phi/kernels/gpu/amp_kernel.cu b/paddle/phi/kernels/gpu/amp_kernel.cu index b0b0f9c5e79..51e11cc44b8 100644 --- a/paddle/phi/kernels/gpu/amp_kernel.cu +++ b/paddle/phi/kernels/gpu/amp_kernel.cu @@ -163,8 +163,10 @@ class LazyZeros { paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); - auto d_in_starts_mem = - paddle::memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + auto d_in_starts_mem = paddle::memory::Alloc( + dev_ctx.GetPlace(), + (xs_size + 1) * sizeof(int64_t), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); // the start index value of each tensor is @@ -186,7 +188,10 @@ class LazyZeros { paddle::memory::Alloc(cpu_place, xs_size * sizeof(T*)); T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); - auto d_out_addrs_mem = paddle::memory::Alloc(dev_ctx, xs_size * sizeof(T*)); + auto d_out_addrs_mem = paddle::memory::Alloc( + dev_ctx.GetPlace(), + xs_size * sizeof(T*), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); for (size_t i = 0; i < xs_size; ++i) { @@ -287,8 +292,10 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, paddle::memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); - auto d_starts_tensor = - paddle::memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + auto d_starts_tensor = paddle::memory::Alloc( + dev_ctx.GetPlace(), + (xs_size + 1) * sizeof(int64_t), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); // the start index value of each tensor is @@ -311,7 +318,10 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx, const T** h_xs = reinterpret_cast(h_mem->ptr()); T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; - 
auto d_mem = paddle::memory::Alloc(dev_ctx, 2 * xs_size * sizeof(T*)); + auto d_mem = paddle::memory::Alloc( + dev_ctx.GetPlace(), + 2 * xs_size * sizeof(T*), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); const T** d_xs = reinterpret_cast(d_mem->ptr()); T** d_outs = reinterpret_cast(d_mem->ptr()) + xs_size; diff --git a/paddle/phi/kernels/gpu/box_coder.cu b/paddle/phi/kernels/gpu/box_coder.cu index e72c5f9cee1..bca18c25c3e 100644 --- a/paddle/phi/kernels/gpu/box_coder.cu +++ b/paddle/phi/kernels/gpu/box_coder.cu @@ -199,7 +199,10 @@ void BoxCoderKernel(const Context &dev_ctx, int grid = (row * col + block - 1) / block; int bytes = var_size * sizeof(float); - auto dev_var = paddle::memory::Alloc(dev_ctx, bytes); + auto dev_var = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); float *dev_var_data = reinterpret_cast(dev_var->ptr()); auto cplace = phi::CPUPlace(); const auto gplace = dev_ctx.GetPlace(); diff --git a/paddle/phi/kernels/gpu/cholesky_kernel.cu b/paddle/phi/kernels/gpu/cholesky_kernel.cu index 66b3bf22b04..7c4a497703e 100644 --- a/paddle/phi/kernels/gpu/cholesky_kernel.cu +++ b/paddle/phi/kernels/gpu/cholesky_kernel.cu @@ -81,7 +81,10 @@ struct MatrixBandPartFunctor { int workspace_size = 0; \ PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf_bufferSize( \ handle, uplo, n, A, lda, &workspace_size)); \ - auto workspace = paddle::memory::Alloc(dev_ctx, workspace_size); \ + auto workspace = paddle::memory::Alloc( \ + dev_ctx.GetPlace(), \ + workspace_size, \ + phi::Stream(reinterpret_cast(dev_ctx.stream()))); \ T* workspace_ptr = reinterpret_cast(workspace->ptr()); \ PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf( \ handle, uplo, n, A, lda, workspace_ptr, workspace_size, info)); \ @@ -146,7 +149,10 @@ void CholeskyKernel(const Context& dev_ctx, for_range(matrix_band_part_functor); } - auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_count); + auto info = paddle::memory::Alloc( + dev_ctx.GetPlace(), + sizeof(int) * batch_count, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); auto* info_ptr = reinterpret_cast(info->ptr()); #if CUDA_VERSION >= 9020 && !defined(_WIN32) diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu index 08ef6ddc5b2..d9829710290 100644 --- a/paddle/phi/kernels/gpu/flip_kernel.cu +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -85,7 +85,10 @@ void FlipKernel(const Context& dev_ctx, std::vector x_stride_v = phi::vectorize(x_stride); int bytes = total_dims * sizeof(int64_t); - auto x_strides_array_tmp = paddle::memory::Alloc(dev_ctx, bytes); + auto x_strides_array_tmp = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int64_t* x_strides_array_gpu = reinterpret_cast(x_strides_array_tmp->ptr()); paddle::memory::Copy(gplace, @@ -95,7 +98,10 @@ void FlipKernel(const Context& dev_ctx, bytes, dev_ctx.stream()); - auto x_shape_array_tmp = paddle::memory::Alloc(dev_ctx, bytes); + auto x_shape_array_tmp = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int64_t* x_shape_array_gpu = reinterpret_cast(x_shape_array_tmp->ptr()); paddle::memory::Copy(gplace, @@ -106,7 +112,10 @@ void FlipKernel(const Context& dev_ctx, dev_ctx.stream()); bytes = flip_dims_size * sizeof(int); - auto flip_dims_array_tmp = paddle::memory::Alloc(dev_ctx, bytes); + auto flip_dims_array_tmp = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + 
phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* flip_dims_array_gpu = reinterpret_cast(flip_dims_array_tmp->ptr()); paddle::memory::Copy(gplace, flip_dims_array_gpu, diff --git a/paddle/phi/kernels/gpu/generate_proposals_v2_kernel.cu b/paddle/phi/kernels/gpu/generate_proposals_v2_kernel.cu index bcda357fd8f..91abb290dd8 100644 --- a/paddle/phi/kernels/gpu/generate_proposals_v2_kernel.cu +++ b/paddle/phi/kernels/gpu/generate_proposals_v2_kernel.cu @@ -303,8 +303,10 @@ static void NMS(const phi::GPUContext &ctx, const T *boxes = proposals.data(); auto place = ctx.GetPlace(); - auto mask_ptr = - paddle::memory::Alloc(ctx, boxes_num * col_blocks * sizeof(uint64_t)); + auto mask_ptr = paddle::memory::Alloc( + place, + boxes_num * col_blocks * sizeof(uint64_t), + phi::Stream(reinterpret_cast(ctx.stream()))); uint64_t *mask_dev = reinterpret_cast(mask_ptr->ptr()); NMSKernel<<>>( diff --git a/paddle/phi/kernels/gpu/lu_kernel.cu b/paddle/phi/kernels/gpu/lu_kernel.cu index 7f6070a805c..7ae27df5729 100644 --- a/paddle/phi/kernels/gpu/lu_kernel.cu +++ b/paddle/phi/kernels/gpu/lu_kernel.cu @@ -105,7 +105,10 @@ void lu_decomposed_kernel(const Context& dev_ctx, int lwork; cusolver_bufferSize(cusolverH, m, n, d_A, lda, &lwork); - auto work_buff = paddle::memory::Alloc(dev_ctx, lwork * sizeof(T)); + auto work_buff = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(T), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); T* d_work = reinterpret_cast(work_buff->ptr()); /* step 3: LU factorization */ diff --git a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu index 050c6d2faf5..48a73d143d6 100644 --- a/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu +++ b/paddle/phi/kernels/gpu/matrix_rank_tol_kernel.cu @@ -92,7 +92,10 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, ldt, &lwork, gesvdj_params)); - auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float)); + auto workspace = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(float), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); float* workspace_ptr = reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? k : m); @@ -168,7 +171,10 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, ldt, &lwork, gesvdj_params)); - auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double)); + auto workspace = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(double), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); double* workspace_ptr = reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? 
k : m); @@ -229,7 +235,10 @@ void SyevjBatched(const phi::GPUContext& dev_ctx, PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(¶ms)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize( handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float)); + auto workspace = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(float), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); float* workspace_ptr = reinterpret_cast(workspace->ptr()); for (int i = 0; i < batchSize; i++) { PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj(handle, @@ -281,7 +290,10 @@ void SyevjBatched(const phi::GPUContext& dev_ctx, PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(¶ms)); PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj_bufferSize( handle, jobz, uplo, n, A, lda, W, &lwork, params)); - auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double)); + auto workspace = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(double), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); double* workspace_ptr = reinterpret_cast(workspace->ptr()); for (int i = 0; i < batchSize; i++) { @@ -340,7 +352,10 @@ void MatrixRankTolKernel(const Context& dev_ctx, // Must Copy X once, because the gesvdj will destory the content when exit. DenseTensor x_tmp; paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), &x_tmp); - auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batches); + auto info = paddle::memory::Alloc( + dev_ctx.GetPlace(), + sizeof(int) * batches, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* info_ptr = reinterpret_cast(info->ptr()); DenseTensor eigenvalue_tensor; diff --git a/paddle/phi/kernels/gpu/nms_kernel.cu b/paddle/phi/kernels/gpu/nms_kernel.cu index 5a52cb33662..490753f1313 100644 --- a/paddle/phi/kernels/gpu/nms_kernel.cu +++ b/paddle/phi/kernels/gpu/nms_kernel.cu @@ -65,7 +65,9 @@ void NMSKernel(const Context& dev_ctx, dim3 block(threadsPerBlock); dim3 grid(blocks_per_line, blocks_per_line); auto mask_data = paddle::memory::Alloc( - dev_ctx, num_boxes * blocks_per_line * sizeof(uint64_t)); + dev_ctx.GetPlace(), + num_boxes * blocks_per_line * sizeof(uint64_t), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); uint64_t* mask_dev = reinterpret_cast(mask_data->ptr()); NMS<<>>( boxes.data(), threshold, num_boxes, mask_dev); diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index d1c8265f2fa..94f59c84693 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -127,7 +127,10 @@ void RandpermRawKernel( end_bit < 32 ? 
end_bit : 32, dev_ctx.stream()); - auto d_temp_storage = paddle::memory::Alloc(dev_ctx, temp_storage_bytes); + auto d_temp_storage = paddle::memory::Alloc( + dev_ctx.GetPlace(), + temp_storage_bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); cub::DeviceRadixSort::SortPairs(d_temp_storage->ptr(), temp_storage_bytes, key.data(), diff --git a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu index d173030a9c9..0673eda8d5f 100644 --- a/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_grad_kernel.cu @@ -219,8 +219,10 @@ void RoiAlignGradKernel(const Context& dev_ctx, } } } - auto roi_ptr = - paddle::memory::Alloc(dev_ctx, box_batch_id_list.numel() * sizeof(int)); + auto roi_ptr = paddle::memory::Alloc( + dev_ctx.GetPlace(), + box_batch_id_list.numel() * sizeof(int), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); int bytes = box_batch_id_list.numel() * sizeof(int); paddle::memory::Copy( diff --git a/paddle/phi/kernels/gpu/roi_align_kernel.cu b/paddle/phi/kernels/gpu/roi_align_kernel.cu index a1824cc8fe4..c11ad5971c3 100644 --- a/paddle/phi/kernels/gpu/roi_align_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_align_kernel.cu @@ -227,7 +227,10 @@ void RoiAlignKernel(const Context& dev_ctx, } } int bytes = roi_batch_id_list.numel() * sizeof(int); - auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes); + auto roi_ptr = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); paddle::memory::Copy( gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); diff --git a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu index 8b74e4bf2c6..6b888b200e1 100644 --- a/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_grad_kernel.cu @@ -120,7 +120,10 @@ void RoiPoolGradKernel(const Context& dev_ctx, } } int bytes = box_batch_id_list.numel() * sizeof(int); - auto roi_ptr = paddle::memory::Alloc(dev_ctx, bytes); + auto roi_ptr = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* roi_id_data = reinterpret_cast(roi_ptr->ptr()); paddle::memory::Copy(gplace, roi_id_data, diff --git a/paddle/phi/kernels/gpu/roi_pool_kernel.cu b/paddle/phi/kernels/gpu/roi_pool_kernel.cu index 03c92e8727f..084ecc5d131 100644 --- a/paddle/phi/kernels/gpu/roi_pool_kernel.cu +++ b/paddle/phi/kernels/gpu/roi_pool_kernel.cu @@ -184,7 +184,10 @@ void RoiPoolKernel(const Context& dev_ctx, } int bytes = box_batch_id_list.numel() * sizeof(int); - auto box_ptr = paddle::memory::Alloc(dev_ctx, bytes); + auto box_ptr = paddle::memory::Alloc( + dev_ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(dev_ctx.stream()))); int* box_id_data = reinterpret_cast(box_ptr->ptr()); paddle::memory::Copy(gplace, box_id_data, diff --git a/paddle/phi/kernels/gpu/stack_grad_kernel.cu b/paddle/phi/kernels/gpu/stack_grad_kernel.cu index 97923eedcfc..f99747b0593 100644 --- a/paddle/phi/kernels/gpu/stack_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/stack_grad_kernel.cu @@ -92,8 +92,10 @@ void StackGradKernel(const Context& dev_ctx, } dy_suf = out.numel() / (split_dim * dy_pre); - auto tmp_out_data = - paddle::memory::Alloc(dev_ctx, outputs.size() * sizeof(T*)); + auto tmp_out_data = paddle::memory::Alloc( + dev_ctx.GetPlace(), + outputs.size() * sizeof(T*), + 
phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_out_data->ptr(), phi::CPUPlace(), diff --git a/paddle/phi/kernels/gpu/stack_kernel.cu b/paddle/phi/kernels/gpu/stack_kernel.cu index 22e8a3e4870..e5c8d392e60 100644 --- a/paddle/phi/kernels/gpu/stack_kernel.cu +++ b/paddle/phi/kernels/gpu/stack_kernel.cu @@ -57,7 +57,10 @@ void StackKernel(const Context& dev_ctx, x_datas[i] = x[i]->data(); } - auto tmp_x_data = paddle::memory::Alloc(dev_ctx, x_datas.size() * sizeof(T*)); + auto tmp_x_data = paddle::memory::Alloc( + dev_ctx.GetPlace(), + x_datas.size() * sizeof(T*), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_x_data->ptr(), phi::CPUPlace(), diff --git a/paddle/phi/kernels/gpu/svd_kernel.cu b/paddle/phi/kernels/gpu/svd_kernel.cu index d7fd3c9dffd..4d4c19cde2b 100644 --- a/paddle/phi/kernels/gpu/svd_kernel.cu +++ b/paddle/phi/kernels/gpu/svd_kernel.cu @@ -77,7 +77,10 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, ldt, &lwork, gesvdj_params)); - auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float)); + auto workspace = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(float), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); float* workspace_ptr = reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? k : m); @@ -155,7 +158,10 @@ void GesvdjBatched(const phi::GPUContext& dev_ctx, ldt, &lwork, gesvdj_params)); - auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double)); + auto workspace = paddle::memory::Alloc( + dev_ctx.GetPlace(), + lwork * sizeof(double), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); double* workspace_ptr = reinterpret_cast(workspace->ptr()); int stride_A = lda * n; int stride_U = ldu * (thin_UV ? 
k : m); diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu index 1d8d8a6dfac..555c45b8ad3 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/sync_batch_norm_kernel.cu @@ -86,7 +86,10 @@ void SyncBatchNormKernel(const Context &ctx, // x, x^2, 1, here 1 is used to calc device num // device num also can be got from platform::DeviceContextPool const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); - alloc_ptr = paddle::memory::Alloc(ctx, bytes); + alloc_ptr = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); const int threads = 256; diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h index 27c3cd4ad44..348c4982d04 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h +++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h @@ -365,7 +365,10 @@ void SyncBatchNormGradFunctor( const auto *saved_inv_var = saved_variance.template data>(); const int bytes = (C * 2 + 1) * sizeof(BatchNormParamType); - auto alloc_ptr = paddle::memory::Alloc(ctx, bytes); + auto alloc_ptr = paddle::memory::Alloc( + ctx.GetPlace(), + bytes, + phi::Stream(reinterpret_cast(ctx.stream()))); auto *stats = reinterpret_cast *>(alloc_ptr->ptr()); const int block = 512; diff --git a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu index a48afeb2c79..4728ae38d01 100644 --- a/paddle/phi/kernels/gpu/triangular_solve_kernel.cu +++ b/paddle/phi/kernels/gpu/triangular_solve_kernel.cu @@ -94,7 +94,10 @@ void TriangularSolveKernel(const Context& dev_ctx, // Copy the addresses of A and tmp_b from host to device. paddle::memory::allocation::AllocationPtr tmp_gpu_ptrs_data = - paddle::memory::Alloc(dev_ctx, cpu_ptrs.size() * sizeof(T*)); + paddle::memory::Alloc( + dev_ctx.GetPlace(), + cpu_ptrs.size() * sizeof(T*), + phi::Stream(reinterpret_cast(dev_ctx.stream()))); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_gpu_ptrs_data->ptr(), diff --git a/paddle/phi/kernels/gpu/yolo_box_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_kernel.cu index 446a21e6a80..8baf339f0c6 100644 --- a/paddle/phi/kernels/gpu/yolo_box_kernel.cu +++ b/paddle/phi/kernels/gpu/yolo_box_kernel.cu @@ -129,9 +129,9 @@ void YoloBoxKernel(const Context& dev_ctx, int input_size_w = downsample_ratio * w; int bytes = sizeof(int) * anchors.size(); - auto anchors_ptr = - paddle::memory::Alloc(dev_ctx, sizeof(int) * anchors.size()); - int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); + DenseTensor tmp_anchors; + tmp_anchors.Resize(phi::make_dim(anchors.size())); + int* anchors_data = dev_ctx.template Alloc(&tmp_anchors); const auto gplace = dev_ctx.GetPlace(); const auto cplace = phi::CPUPlace(); paddle::memory::Copy( -- GitLab
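
The recurring change in this patch is the removal of the `memory::Alloc(dev_ctx, size)` overload: call sites either allocate a raw buffer at the context's place on the context's stream, or allocate a typed temporary `phi::DenseTensor` directly through the device context. Below is a minimal sketch of those two replacement forms, assembled from the call patterns visible in the hunks above. It is not part of the patch; the wrapper function, the header list, and the `bytes` parameter are illustrative assumptions, so treat it as a sketch of the call pattern rather than a drop-in kernel.

    // Hedged sketch (not from the patch): the two allocation forms that replace
    // the removed memory::Alloc(dev_ctx, size) overload. Header paths follow the
    // files touched above; context and size are placeholders.
    #include "paddle/fluid/memory/malloc.h"
    #include "paddle/phi/backends/gpu/gpu_context.h"
    #include "paddle/phi/core/dense_tensor.h"

    template <typename T>
    void AllocExamples(const phi::GPUContext& dev_ctx, size_t bytes) {
      // 1) Raw buffer tied to the context's place and stream, as used for the
      //    scratch buffers in the GPU kernels above.
      auto buffer = paddle::memory::Alloc(
          dev_ctx.GetPlace(),
          bytes,
          phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
      void* raw_ptr = buffer->ptr();  // lifetime managed by the AllocationPtr

      // 2) Typed temporary allocated through the device context itself, as done
      //    for the temporary scale tensors in fake_quantize_op.h.
      phi::DenseTensor tmp;
      tmp.Resize(phi::make_dim(1));
      T* tmp_data = dev_ctx.template Alloc<T>(&tmp);

      (void)raw_ptr;
      (void)tmp_data;
    }

Form (1) keeps the allocation on the caller's stream without routing through the device context, which is what lets the allocator library drop its dependency on device_context; form (2) is preferred where the buffer is naturally a tensor, as in `AllocateTmpTensor` and the quantize kernels above.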