From 34a957e3e362f5045af325c079b78d207e09fb19 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Mon, 5 Dec 2022 16:46:08 +0800 Subject: [PATCH] Replace mutable_data with DeviceContext.Alloc in phi kernels (#48500) * Replace mutable_data with DeviceContext.Alloc in phi kernels * Fix CI errors * Fix CI errors * Fix CI errors, test=kunlun * Fix CI errors, test=kunlun * Handle rnn_functor * Update approvals --- paddle/fluid/operators/math/CMakeLists.txt | 2 +- .../math/selected_rows_functor_test.cc | 25 ++++++++++++ paddle/fluid/platform/device_context.cc | 2 +- paddle/phi/core/device_context.cc | 2 +- paddle/phi/kernels/cpu/concat_kernel.cc | 2 +- paddle/phi/kernels/cpu/elementwise_grad.h | 6 +-- paddle/phi/kernels/cpu/histogram_kernel.cc | 2 +- .../kernels/cpu/masked_select_grad_kernel.cc | 3 +- .../phi/kernels/cpu/masked_select_kernel.cc | 3 +- .../kernels/cpu/put_along_axis_grad_kernel.cc | 2 +- .../kernels/cpu/temporal_shift_grad_kernel.cc | 5 ++- .../phi/kernels/cpu/temporal_shift_kernel.cc | 3 +- paddle/phi/kernels/cpu/yolo_box_kernel.cc | 13 ++++--- paddle/phi/kernels/funcs/broadcast_function.h | 2 +- paddle/phi/kernels/funcs/diagonal.h | 3 +- paddle/phi/kernels/funcs/elementwise_base.h | 2 +- .../phi/kernels/funcs/elementwise_grad_base.h | 6 ++- paddle/phi/kernels/funcs/fc_functor.cc | 7 +++- paddle/phi/kernels/funcs/math_function.cu | 8 +++- paddle/phi/kernels/funcs/math_function_impl.h | 6 +-- paddle/phi/kernels/funcs/select_impl.cu.h | 2 +- .../kernels/funcs/selected_rows_functor.cc | 37 +++++++++--------- .../kernels/funcs/selected_rows_functor.cu | 15 ++++--- .../phi/kernels/funcs/top_k_function_cuda.h | 11 +++--- paddle/phi/kernels/gpu/concat_kernel.cu | 2 +- paddle/phi/kernels/gpu/depthwise_conv.h | 12 +++--- .../kernels/gpu/depthwise_conv_grad_kernel.cu | 4 +- .../phi/kernels/gpu/depthwise_conv_kernel.cu | 2 +- paddle/phi/kernels/gpu/dropout_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/elementwise_grad.h | 39 +++++++++---------- .../phi/kernels/gpu/graph_reindex_kernel.cu | 4 +- paddle/phi/kernels/gpu/histogram_kernel.cu | 8 ++-- paddle/phi/kernels/gpu/kthvalue_kernel.cu | 11 ++++-- .../kernels/gpu/put_along_axis_grad_kernel.cu | 2 +- paddle/phi/kernels/gpu/rnn_functor.h | 10 ++--- paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc | 2 +- paddle/phi/kernels/gpu/rnn_kernel.cu.cc | 2 +- paddle/phi/kernels/gpu/sgd_kernel.cu | 12 +++--- .../kernels/gpu/temporal_shift_grad_kernel.cu | 4 +- .../phi/kernels/gpu/temporal_shift_kernel.cu | 3 +- paddle/phi/kernels/gpu/yolo_box_kernel.cu | 7 ++-- .../kernels/impl/digamma_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/digamma_kernel_impl.h | 2 +- .../impl/elementwise_grad_kernel_impl.h | 4 +- paddle/phi/kernels/memcpy_kernel.cc | 34 ---------------- .../xpu/elementwise_add_grad_kernel.cc | 2 +- paddle/phi/kernels/xpu/full_kernel.cc | 2 +- tools/check_file_diff_approvals.sh | 18 ++++----- 48 files changed, 184 insertions(+), 175 deletions(-) diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index e2a62273d0..3b06722ddf 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -50,7 +50,7 @@ math_library(tree2col DEPS math_function) cc_test( selected_rows_functor_test SRCS selected_rows_functor_test.cc - DEPS selected_rows_functor) + DEPS allocator selected_rows_functor) cc_test( im2col_test SRCS im2col_test.cc diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc 
index 49c6942531..a2c88c723f 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -15,11 +15,15 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/selected_rows_functor.h" #include "gtest/gtest.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" #include "paddle/phi/kernels/funcs/math_function.h" TEST(selected_rows_functor, cpu_add) { paddle::platform::CPUPlace cpu_place; phi::CPUContext ctx(cpu_place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(cpu_place) + .get()); phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -109,6 +113,9 @@ TEST(selected_rows_functor, cpu_add) { TEST(selected_rows_functor, cpu_add_to) { paddle::platform::CPUPlace cpu_place; phi::CPUContext ctx(cpu_place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(cpu_place) + .get()); phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -198,6 +205,9 @@ TEST(selected_rows_functor, cpu_add_to) { TEST(selected_rows_functor, cpu_merge_average_float) { paddle::platform::CPUPlace cpu_place; phi::CPUContext ctx(cpu_place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(cpu_place) + .get()); phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -233,6 +243,9 @@ TEST(selected_rows_functor, cpu_merge_average_float) { TEST(selected_rows_functor, cpu_merge_add_float) { paddle::platform::CPUPlace cpu_place; phi::CPUContext ctx(cpu_place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(cpu_place) + .get()); phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -269,6 +282,9 @@ TEST(selected_rows_functor, cpu_merge_add_float) { TEST(selected_rows_functor, cpu_merge_add_int) { paddle::platform::CPUPlace cpu_place; phi::CPUContext ctx(cpu_place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(cpu_place) + .get()); phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -305,6 +321,9 @@ TEST(selected_rows_functor, cpu_merge_add_int) { TEST(selected_rows_functor, cpu_merge_add_multi) { paddle::platform::CPUPlace cpu_place; phi::CPUContext ctx(cpu_place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(cpu_place) + .get()); phi::funcs::SetConstant set_const; int64_t height = 10; @@ -354,6 +373,9 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { paddle::platform::CPUPlace cpu_place; phi::CPUContext ctx(cpu_place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(cpu_place) + .get()); phi::funcs::SetConstant set_const; int64_t height = 10; @@ -409,6 +431,9 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { TEST(selected_rows_functor, cpu_sum_to) { paddle::platform::CPUPlace cpu_place; phi::CPUContext ctx(cpu_place); + ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(cpu_place) + .get()); phi::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a09f438c50..539bbfb87d 100644 --- a/paddle/fluid/platform/device_context.cc +++ 
b/paddle/fluid/platform/device_context.cc @@ -163,7 +163,7 @@ std::unique_ptr<DeviceContext> CreateDeviceContext( if (!disable_setting_default_stream_for_allocator) { instance.SetDefaultStream(CUDAPlace(p.GetDeviceId()), cuda_ctx->stream()); } - dev_ctx->SetAllocator(instance.GetAllocator(p).get()); + dev_ctx->SetAllocator(instance.GetAllocator(p, cuda_ctx->stream()).get()); dev_ctx->SetPinnedAllocator( instance.GetAllocator(paddle::platform::CUDAPinnedPlace()).get()); diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index d46f9250ee..a18e695cce 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -148,7 +148,7 @@ struct DeviceContext::Impl { if (tensor->initialized() && tensor->place() != place) { ClearHolder(tensor); } - auto* allocator = tensor->numel() == 0 + auto* allocator = tensor->numel() == 0 && requested_size == 0 ? zero_allocator_ : (pinned ? pinned_allocator_ : device_allocator_); #ifdef PADDLE_WITH_CUDA diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 96e02f4c42..1075cb9f77 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -44,7 +44,7 @@ void ConcatKernel(const Context& dev_ctx, phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis); out->Resize(out_dims); - out->mutable_data<T>(dev_ctx.GetPlace()); + dev_ctx.template Alloc<T>(out); // If axis is 0, the lod of the output is not the same as inputs. if (axis == 0 && x[0]->lod().size() > 0) { diff --git a/paddle/phi/kernels/cpu/elementwise_grad.h b/paddle/phi/kernels/cpu/elementwise_grad.h index 92587566eb..05c02f167b 100644 --- a/paddle/phi/kernels/cpu/elementwise_grad.h +++ b/paddle/phi/kernels/cpu/elementwise_grad.h @@ -90,13 +90,11 @@ ElementwiseAddGrad(const CPUContext& ctx, int axis = -1) { auto blas = phi::funcs::GetBlas<CPUContext, T>(ctx); if (dx) { - blas.VCOPY( dout.numel(), dout.data<T>(), dx->mutable_data<T>(ctx.GetPlace())); + blas.VCOPY(dout.numel(), dout.data<T>(), ctx.template Alloc<T>(dx)); } if (dy) { - blas.VCOPY( dout.numel(), dout.data<T>(), dy->mutable_data<T>(ctx.GetPlace())); + blas.VCOPY(dout.numel(), dout.data<T>(), ctx.template Alloc<T>(dy)); } } diff --git a/paddle/phi/kernels/cpu/histogram_kernel.cc b/paddle/phi/kernels/cpu/histogram_kernel.cc index d9c41508ef..4c04566b8b 100644 --- a/paddle/phi/kernels/cpu/histogram_kernel.cc +++ b/paddle/phi/kernels/cpu/histogram_kernel.cc @@ -34,7 +34,7 @@ void HistogramKernel(const Context& dev_ctx, const T* input_data = input.data<T>(); auto input_numel = input.numel(); - int64_t* out_data = output->mutable_data<int64_t>(dev_ctx.GetPlace()); + int64_t* out_data = dev_ctx.template Alloc<int64_t>(output); phi::funcs::SetConstant<Context, int64_t>()( dev_ctx, output, static_cast<int64_t>(0)); diff --git a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc index bbb08f0616..f615fb2e0b 100644 --- a/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/masked_select_grad_kernel.cc @@ -27,7 +27,8 @@ void MaskedSelectGradKernel(const Context& dev_ctx, DenseTensor* x_grad) { auto* mask_data = mask.data<bool>(); auto* input_data = out_grad.data<T>(); - auto* out_data = x_grad->mutable_data<T>(dev_ctx.GetPlace()); + + auto* out_data = dev_ctx.template Alloc<T>(x_grad); int mask_size = mask.numel(); int index = 0; diff --git a/paddle/phi/kernels/cpu/masked_select_kernel.cc b/paddle/phi/kernels/cpu/masked_select_kernel.cc index f377658d50..33311c26cf 100644 --- a/paddle/phi/kernels/cpu/masked_select_kernel.cc +++
b/paddle/phi/kernels/cpu/masked_select_kernel.cc @@ -48,7 +48,8 @@ void MaskedSelectKernel(const Context& dev_ctx, DDim out_dim{out_size}; out->Resize(out_dim); - auto out_data = out->mutable_data(phi::CPUPlace()); + + auto out_data = dev_ctx.template HostAlloc(out); int index = 0; for (int i = 0; i < mask_size; i++) { diff --git a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc index ca57c223be..969c5b9fe3 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_grad_kernel.cc @@ -58,7 +58,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, if (value_grad) { value_grad->Resize(index.dims()); - value_grad->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(value_grad); if (index_type == DataType::INT32) { paddle::operators::cpu_gather_kernel( out_grad, axis, index, *value_grad, dev_ctx); diff --git a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc index 9e6a0e4412..3dcd3c9eb4 100644 --- a/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/temporal_shift_grad_kernel.cc @@ -114,8 +114,9 @@ void TemporalShiftGradKernel(const Context& dev_ctx, (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w}) : phi::make_ddim({nt, h, w, c})); const T* output_grad_data = output_grad->data(); - T* input_grad_data = - input_grad->mutable_data(in_grad_dims, dev_ctx.GetPlace()); + input_grad->Resize(in_grad_dims); + + T* input_grad_data = dev_ctx.template Alloc(input_grad); if (data_layout == DataLayout::kNCHW) { TemporalShiftBwNCHW( diff --git a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc index 3d10520ec8..3edd3aa301 100644 --- a/paddle/phi/kernels/cpu/temporal_shift_kernel.cc +++ b/paddle/phi/kernels/cpu/temporal_shift_kernel.cc @@ -114,7 +114,8 @@ void TemporalShiftKernel(const Context& dev_ctx, (data_layout == DataLayout::kNCHW ? 
phi::make_ddim({nt, c, h, w}) : phi::make_ddim({nt, h, w, c})); const T* input_data = input->data<T>(); - T* output_data = output->mutable_data<T>(out_dims, dev_ctx.GetPlace()); + output->Resize(out_dims); + T* output_data = dev_ctx.template Alloc<T>(output); if (data_layout == DataLayout::kNCHW) { TemporalShiftFwNCHW<T>( diff --git a/paddle/phi/kernels/cpu/yolo_box_kernel.cc b/paddle/phi/kernels/cpu/yolo_box_kernel.cc index 6b882ad289..0c04c78214 100644 --- a/paddle/phi/kernels/cpu/yolo_box_kernel.cc +++ b/paddle/phi/kernels/cpu/yolo_box_kernel.cc @@ -51,16 +51,19 @@ void YoloBoxKernel(const Context& dev_ctx, const int an_stride = (class_num + 5) * stride; DenseTensor anchors_; - auto anchors_data = anchors_.mutable_data<int>({an_num * 2}, dev_ctx.GetPlace()); + anchors_.Resize({an_num * 2}); + auto anchors_data = dev_ctx.template Alloc<int>(&anchors_); std::copy(anchors.begin(), anchors.end(), anchors_data); const T* input_data = input->data<T>(); const int* imgsize_data = imgsize->data<int>(); - T* boxes_data = boxes->mutable_data<T>({n, box_num, 4}, dev_ctx.GetPlace()); + boxes->Resize({n, box_num, 4}); + T* boxes_data = dev_ctx.template Alloc<T>(boxes); memset(boxes_data, 0, boxes->numel() * sizeof(T)); - T* scores_data = scores->mutable_data<T>({n, box_num, class_num}, dev_ctx.GetPlace()); + + scores->Resize({n, box_num, class_num}); + T* scores_data = dev_ctx.template Alloc<T>(scores); + memset(scores_data, 0, scores->numel() * sizeof(T)); T box[4]; diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index e19735e6c1..d2c30c8fa3 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -996,7 +996,7 @@ void ElementwiseCompute(const GPUContext &dev_ctx, DenseTensor *z) { std::vector<const DenseTensor *> ins = {&x, &y}; std::vector<DenseTensor *> outs = {z}; - z->mutable_data<OutType>(dev_ctx.GetPlace()); + dev_ctx.template Alloc<OutType>(z); BroadcastKernel<ElementwiseType::kBinary, T, OutType>( dev_ctx, ins, &outs, axis, func); } diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index 81525cb254..92f970aed3 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -104,7 +104,8 @@ DenseTensor Diagonal(const DeviceContext& context, DenseTensor diag; DDim diag_dims = phi::make_ddim(ret_dims); auto dig_stride = phi::stride(diag_dims); - auto diag_data = diag.mutable_data<T>(diag_dims, context.GetPlace()); + diag.Resize(diag_dims); + auto diag_data = context.template Alloc<T>(&diag); int64_t pos = std::abs(offset) * offset_stride; int64_t dim_size = ret_strides.size(); diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 17b0a653cc..ffb3ff4ae3 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -474,7 +474,7 @@ static inline void GetDoubleGradSafeTensor(const DeviceContext &dev_ctx, } else { auto meta = phi::DenseTensorMeta(x.dtype(), x.dims(), x.layout()); *ddx_safe = phi::Empty(dev_ctx, std::move(meta)); - ddx_safe->mutable_data<T>(dev_ctx.GetPlace()); + dev_ctx.template Alloc<T>(ddx_safe); SetConstant<DeviceContext, T> set_zero; set_zero(dev_ctx, ddx_safe, static_cast<T>(0)); } } diff --git a/paddle/phi/kernels/funcs/elementwise_grad_base.h b/paddle/phi/kernels/funcs/elementwise_grad_base.h index 65f21e5b7f..b9ffb4e3f1 100644 --- a/paddle/phi/kernels/funcs/elementwise_grad_base.h +++ b/paddle/phi/kernels/funcs/elementwise_grad_base.h @@ -237,7 +237,8 @@ void CommonElementwiseBroadcastBackward(const CPUContext &ctx, // result.
if (dx && dx->IsSharedBufferWith(dout)) { dx->clear(); - dx->mutable_data(x_dims, ctx.GetPlace()); + dx->Resize(x_dims); + ctx.template Alloc(dx); } VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" @@ -1680,7 +1681,8 @@ void CommonElementwiseBroadcastBackward(const GPUContext &ctx, // result. if (dx && dx->IsSharedBufferWith(dout)) { dx->clear(); - dx->mutable_data(x_dims, ctx.GetPlace()); + dx->Resize(x_dims); + ctx.template Alloc(dx); } VLOG(3) << "CommonElementwiseBroadcastBackward xdims:" diff --git a/paddle/phi/kernels/funcs/fc_functor.cc b/paddle/phi/kernels/funcs/fc_functor.cc index f428746bc5..31212a687f 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cc +++ b/paddle/phi/kernels/funcs/fc_functor.cc @@ -39,8 +39,11 @@ void FCFunctor::operator()(const DeviceContext& context, const int NN = N + 4; const int KK = K + 4; phi::DenseTensor X1; - T* X1_data = X1.mutable_data({M * KK}, paddle::platform::CPUPlace()); - Y1_data = Y1.mutable_data({M * (N + 4)}, paddle::platform::CPUPlace()); + X1.Resize({M * KK}); + T* X1_data = context.template HostAlloc(&X1); + + Y1.Resize({M * (N + 4)}); + Y1_data = context.template HostAlloc(&Y1); #ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif diff --git a/paddle/phi/kernels/funcs/math_function.cu b/paddle/phi/kernels/funcs/math_function.cu index db4cdc57e2..a0e59f8f3f 100644 --- a/paddle/phi/kernels/funcs/math_function.cu +++ b/paddle/phi/kernels/funcs/math_function.cu @@ -319,7 +319,9 @@ void ColwiseSum::operator()( size, vector->numel())); phi::DenseTensor one; - one.mutable_data({in_dims[0]}, context.GetPlace()); + one.Resize({in_dims[0]}); + context.template Alloc(&one); + SetConstant set; set(context, &one, static_cast(1.0)); phi::funcs::GetBlas(context).GEMV( @@ -355,7 +357,9 @@ void RowwiseSum::operator()( in_dims[0], vector->numel())); phi::DenseTensor one; - one.mutable_data({size}, context.GetPlace()); + one.Resize({size}); + context.template Alloc(&one); + SetConstant set; set(context, &one, static_cast(1.0)); phi::funcs::GetBlas(context).GEMV( diff --git a/paddle/phi/kernels/funcs/math_function_impl.h b/paddle/phi/kernels/funcs/math_function_impl.h index b59a249bbb..2011523a01 100644 --- a/paddle/phi/kernels/funcs/math_function_impl.h +++ b/paddle/phi/kernels/funcs/math_function_impl.h @@ -117,7 +117,7 @@ class ColwiseSum { size, out->numel())); - T* out_buf = out->mutable_data(out->place()); + T* out_buf = context.template Alloc(out); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { @@ -185,7 +185,7 @@ class RowwiseMean { height, out->numel())); auto inv_size = 1.0 / size; - T* out_buf = out->mutable_data(out->place()); + T* out_buf = context.template Alloc(out); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { @@ -251,7 +251,7 @@ class RowwiseSum { height, out->numel())); - T* out_buf = out->mutable_data(out->place()); + T* out_buf = context.template Alloc(out); const T* in_buf = input.data(); for (size_t i = 0; i < static_cast(height); ++i) { diff --git a/paddle/phi/kernels/funcs/select_impl.cu.h b/paddle/phi/kernels/funcs/select_impl.cu.h index 4fb1bc13ae..c5ddce68e7 100644 --- a/paddle/phi/kernels/funcs/select_impl.cu.h +++ b/paddle/phi/kernels/funcs/select_impl.cu.h @@ -451,7 +451,7 @@ void SelectKernel(const KPDevice &dev_ctx, out_dim.push_back(static_cast(rank)); out->Resize(phi::make_ddim(out_dim)); } - auto out_data = out->mutable_data(cuda_place); + auto out_data = dev_ctx.template Alloc(out); // 3.2 get true data's index according to cond_data and 
cumsum_data if (total_true_num <= 0) return; SelectKernel diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cc b/paddle/phi/kernels/funcs/selected_rows_functor.cc index de362d45a8..fb08766061 100644 --- a/paddle/phi/kernels/funcs/selected_rows_functor.cc +++ b/paddle/phi/kernels/funcs/selected_rows_functor.cc @@ -542,11 +542,10 @@ struct MergeAddImpl { } out.set_height(input_height); - out.mutable_value()->mutable_data( - phi::make_ddim( - {static_cast(merged_row_set.size()), input_width}), - context.GetPlace()); - auto* out_data = out.mutable_value()->data(); + DenseTensor* out_tensor = out.mutable_value(); + out_tensor->Resize(phi::make_ddim( + {static_cast(merged_row_set.size()), input_width})); + auto* out_data = context.template Alloc(out_tensor); if (merged_row_set.size() == row_num && !sorted_result) { // no duplicated ids, just concat the result together @@ -659,9 +658,10 @@ struct MergeAdd { out.set_rows(merge_rows); out.set_height(input.height()); - out.mutable_value()->mutable_data( - phi::make_ddim({static_cast(merge_rows.size()), input_width}), - context.GetPlace()); + DenseTensor* out_tensor = out.mutable_value(); + out_tensor->Resize( + phi::make_ddim({static_cast(merge_rows.size()), input_width})); + context.template Alloc(out_tensor); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { @@ -748,12 +748,13 @@ struct MergeAdd { out.set_rows(merge_rows); out.set_height(input_height); - out.mutable_value()->mutable_data( - phi::make_ddim( - {static_cast(merged_row_set.size()), input_width}), - context.GetPlace()); - float* y_data = reinterpret_cast(out.mutable_value()->data()); + DenseTensor* out_tensor = out.mutable_value(); + out_tensor->Resize(phi::make_ddim( + {static_cast(merged_row_set.size()), input_width})); + context.template Alloc(out_tensor); + + float* y_data = reinterpret_cast(out_tensor->data()); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { @@ -856,11 +857,11 @@ struct MergeAverage { } out.set_height(input_height); - out.mutable_value()->mutable_data( - phi::make_ddim( - {static_cast(merged_row_set.size()), input_width}), - context.GetPlace()); - auto* out_data = out.mutable_value()->data(); + + DenseTensor* out_tensor = out.mutable_value(); + out_tensor->Resize(phi::make_ddim( + {static_cast(merged_row_set.size()), input_width})); + auto* out_data = context.template Alloc(out_tensor); std::vector merge_rows(merged_row_set.begin(), merged_row_set.end()); diff --git a/paddle/phi/kernels/funcs/selected_rows_functor.cu b/paddle/phi/kernels/funcs/selected_rows_functor.cu index e08fea2b35..8f409466e1 100644 --- a/paddle/phi/kernels/funcs/selected_rows_functor.cu +++ b/paddle/phi/kernels/funcs/selected_rows_functor.cu @@ -392,9 +392,10 @@ struct MergeAddImpl { out.set_rows(merge_rows); out.set_height(input.height()); - out.mutable_value()->mutable_data( - phi::make_ddim({static_cast(merge_rows.size()), input_width}), - context.GetPlace()); + DenseTensor* out_tensor = out.mutable_value(); + out_tensor->Resize( + phi::make_ddim({static_cast(merge_rows.size()), input_width})); + context.template Alloc(out_tensor); phi::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), static_cast(0)); @@ -462,9 +463,11 @@ struct MergeAddImpl { out.set_rows(merge_rows); out.set_height(input_height); - out.mutable_value()->mutable_data( - phi::make_ddim({static_cast(merge_rows.size()), input_width}), - context.GetPlace()); + + DenseTensor* out_tensor = out.mutable_value(); + 
out_tensor->Resize( + phi::make_ddim({static_cast(merge_rows.size()), input_width})); + context.template Alloc(out_tensor); phi::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), static_cast(0)); diff --git a/paddle/phi/kernels/funcs/top_k_function_cuda.h b/paddle/phi/kernels/funcs/top_k_function_cuda.h index 6c48e05c76..f04c7a8da8 100644 --- a/paddle/phi/kernels/funcs/top_k_function_cuda.h +++ b/paddle/phi/kernels/funcs/top_k_function_cuda.h @@ -941,8 +941,7 @@ bool SortTopk(const phi::GPUContext& ctx, const std::vector dims = {num_rows, num_cols}; auto dim = phi::make_ddim(dims); input_indices.Resize(dim); - // input_indices.Resize(num_rows*num_cols); - input_indices.mutable_data(ctx.GetPlace()); + ctx.template Alloc(&input_indices); size_t temp_storage_bytes = -1; auto ComputeBlockSize = [](int col) { @@ -984,7 +983,7 @@ bool SortTopk(const phi::GPUContext& ctx, const T* input = input_tensor->data(); T* values = out_tensor->data(); - int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); + int64_t* indices = ctx.template Alloc(indices_tensor); if (k == num_cols) { // Doing a full sort. @@ -993,8 +992,8 @@ bool SortTopk(const phi::GPUContext& ctx, } else { temp_values.Resize(dim); temp_indices.Resize(dim); - sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); - sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); + sorted_values_ptr = ctx.template Alloc(&temp_values); + sorted_indices_ptr = ctx.template Alloc(&temp_indices); } // Get temp storage buffer size, maybe can allocate a fixed buffer to save @@ -1067,7 +1066,7 @@ bool SortTopk(const phi::GPUContext& ctx, #endif } Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); + ctx.template Alloc(&temp_storage, temp_storage_bytes); if (largest) { auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( diff --git a/paddle/phi/kernels/gpu/concat_kernel.cu b/paddle/phi/kernels/gpu/concat_kernel.cu index 0666c60a8d..80ff71b215 100644 --- a/paddle/phi/kernels/gpu/concat_kernel.cu +++ b/paddle/phi/kernels/gpu/concat_kernel.cu @@ -43,7 +43,7 @@ void ConcatKernel(const Context& dev_ctx, phi::DDim out_dims = phi::funcs::ComputeAndCheckShape(true, x_dims, axis); out->Resize(out_dims); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); // If axis is 0, the lod of the output is not the same as inputs. 
if (axis == 0 && x[0]->lod().size() > 0) { diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index 9ed8813504..879056d67a 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -1231,7 +1231,7 @@ class DepthwiseConvFunctor { const T* input_data = input.data(); const T* filter_data = filter.data(); - T* output_data = output->mutable_data(context.GetPlace()); + T* output_data = context.template Alloc(output); phi::DenseTensor filter_hwc; if (data_layout == DataLayout::kNHWC) { @@ -1240,7 +1240,7 @@ class DepthwiseConvFunctor { filter.dims()[0], filter.dims()[1]}); filter_hwc.Resize(filter_hwc_dims); - filter_hwc.mutable_data(context.GetPlace()); + context.template Alloc(&filter_hwc); std::vector perm_axis({2, 3, 0, 1}); phi::funcs::TransposeNormal trans; trans(context, filter, &filter_hwc, perm_axis); @@ -1409,7 +1409,7 @@ class DepthwiseConvInputGradFunctor { const T* input_data = input.data(); const T* filter_data = filter.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + T* input_grad_data = context.template Alloc(input_grad); phi::DenseTensor filter_hwc; if (data_layout == DataLayout::kNHWC) { @@ -1418,7 +1418,7 @@ class DepthwiseConvInputGradFunctor { filter.dims()[0], filter.dims()[1]}); filter_hwc.Resize(filter_hwc_dims); - filter_hwc.mutable_data(context.GetPlace()); + context.template Alloc(&filter_hwc); std::vector perm_axis({2, 3, 0, 1}); phi::funcs::TransposeNormal trans; trans(context, filter, &filter_hwc, perm_axis); @@ -1584,7 +1584,7 @@ class DepthwiseConvFilterGradFunctor(); const T* output_grad_data = output_grad.data(); - T* filter_grad_data = filter_grad->mutable_data(context.GetPlace()); + T* filter_grad_data = context.template Alloc(filter_grad); int block_size = 512; int blocks; @@ -1654,7 +1654,7 @@ class DepthwiseConvFilterGradFunctordims()[0], \ filter_grad->dims()[1]}); \ filter_grad_hwc.Resize(filter_grad_hwc_dims); \ - filter_grad_hwc.mutable_data(context.GetPlace()); \ + context.template Alloc(&filter_grad_hwc); \ phi::funcs::SetConstant set_zero; \ set_zero(context, &filter_grad_hwc, static_cast(0)); \ filter_grad_data = filter_grad_hwc.data(); \ diff --git a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu index 2e815b3e45..5bb0a4946f 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv_grad_kernel.cu @@ -75,7 +75,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, phi::funcs::SetConstant set_zero; if (input_grad) { - input_grad->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(input_grad); set_zero(dev_ctx, input_grad, static_cast(0)); if (fuse_relu) { @@ -106,7 +106,7 @@ void DepthwiseConvGradKernel(const Context& dev_ctx, } if (filter_grad) { - filter_grad->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(filter_grad); set_zero(dev_ctx, filter_grad, static_cast(0)); if (fuse_relu) { paddle::operators::math::DepthwiseConvFilterGradFunctor diff --git a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu index 8617a42e4e..1cb6301dc9 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu +++ b/paddle/phi/kernels/gpu/depthwise_conv_kernel.cu @@ -32,7 +32,7 @@ void DepthwiseConvKernel(const Context& dev_ctx, const std::string& data_format, DenseTensor* out) { DenseTensor* output = out; - 
output->mutable_data<T>(dev_ctx.GetPlace()); + dev_ctx.template Alloc<T>(output); const std::vector<int> strides = strides_t; std::vector<int> dilations = dilations_t; diff --git a/paddle/phi/kernels/gpu/dropout_grad_kernel.cu b/paddle/phi/kernels/gpu/dropout_grad_kernel.cu index 4aa59cded8..cdb8d0bd27 100644 --- a/paddle/phi/kernels/gpu/dropout_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/dropout_grad_kernel.cu @@ -29,7 +29,7 @@ void DropoutGradRawKernel(const Context& dev_ctx, const std::string& mode, DenseTensor* x_grad) { bool upscale_in_train = (mode == "upscale_in_train"); - x_grad->mutable_data<T>(dev_ctx.GetPlace()); + dev_ctx.template Alloc<T>(x_grad); paddle::operators::DropoutGradGPUKernelDriver<T>(dev_ctx, is_test, p.to<float>(), diff --git a/paddle/phi/kernels/gpu/elementwise_grad.h b/paddle/phi/kernels/gpu/elementwise_grad.h index e8f01be897..84047f1473 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad.h +++ b/paddle/phi/kernels/gpu/elementwise_grad.h @@ -153,7 +153,7 @@ void DefaultElementwiseAddGrad(const GPUContext &ctx, // dx if (dx != nullptr) { - auto *dx_data = dx->mutable_data<T>(ctx.GetPlace()); + auto *dx_data = ctx.template Alloc<T>(dx); if (dx->dims() == dout.dims()) { if (dx_data != dout_data) { phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); @@ -163,7 +163,8 @@ // the result of dy wrong. if (dx->IsSharedBufferWith(dout)) { dx->clear(); - dx->mutable_data<T>(x.dims(), ctx.GetPlace()); + dx->Resize(x.dims()); + ctx.template Alloc<T>(dx); } std::vector<int> reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); @@ -173,7 +174,7 @@ } // dy if (dy != nullptr) { - auto *dy_data = dy->mutable_data<T>(ctx.GetPlace()); + auto *dy_data = ctx.template Alloc<T>(dy); if (dy->dims() == dout.dims()) { if (dy_data != dout_data) { phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); @@ -217,12 +218,11 @@ PREDEFINED_BLOCK_SIZE, 1); SimpleElemwiseAddGradCUDAKernel<T> - <<<grid_size, block_size, 0, ctx.stream()>>>( dout.data<T>(), size, vec_size, dx->mutable_data<T>(ctx.GetPlace()), dy->mutable_data<T>(ctx.GetPlace())); + <<<grid_size, block_size, 0, ctx.stream()>>>(dout.data<T>(), size, vec_size, ctx.template Alloc<T>(dx), ctx.template Alloc<T>(dy)); } else { VLOG(4) << "Special case when dy_data is the same as dout_data, " "and dx_data is the same as dout_data, do not need " @@ -264,7 +264,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, auto *dout_data = dout.data<T>(); // dx if (dx != nullptr) { - auto *dx_data = dx->mutable_data<T>(ctx.GetPlace()); + auto *dx_data = ctx.template Alloc<T>(dx); if (dx->dims() == dout.dims()) { if (dx_data != dout_data) { phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); @@ -274,7 +274,8 @@ // the result of dy wrong.
if (dx->IsSharedBufferWith(dout)) { dx->clear(); - dx->mutable_data<T>(x.dims(), ctx.GetPlace()); + dx->Resize(x.dims()); + ctx.template Alloc<T>(dx); } std::vector<int> reduce_dims = funcs::GetReduceDim(x.dims(), out.dims(), axis); @@ -284,7 +285,7 @@ } // dy if (dy != nullptr) { - auto *dy_data = dy->mutable_data<T>(ctx.GetPlace()); + auto *dy_data = ctx.template Alloc<T>(dy); if (dy->dims() == dout.dims()) { if (dy_data != dout_data) { dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1); auto size = dy->numel(); dim3 grid_size = dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); SimpleElemwiseSubGradCUDAKernel<T> <<<grid_size, block_size, 0, ctx.stream()>>>( - dout.data<T>(), size, nullptr, dy->mutable_data<T>(ctx.GetPlace())); + dout.data<T>(), size, nullptr, ctx.template Alloc<T>(dy)); } } else { std::vector<int> reduce_dims = @@ -320,11 +318,10 @@ void elementwise_sub_grad(const GPUContext &ctx, dim3 grid_size = dim3((size + PREDEFINED_BLOCK_SIZE - 1) / PREDEFINED_BLOCK_SIZE, 1); SimpleElemwiseSubGradCUDAKernel<T> - <<<grid_size, block_size, 0, ctx.stream()>>>( dout.data<T>(), size, dx->mutable_data<T>(ctx.GetPlace()), dy->mutable_data<T>(ctx.GetPlace())); + <<<grid_size, block_size, 0, ctx.stream()>>>(dout.data<T>(), size, ctx.template Alloc<T>(dx), ctx.template Alloc<T>(dy)); } /* ****************************** diff --git a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu index 046c210e18..10a5eec5b1 100644 --- a/paddle/phi/kernels/gpu/graph_reindex_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_reindex_kernel.cu @@ -314,9 +314,9 @@ void GraphReindexKernel(const Context& dev_ctx, const auto* ph_index = hashtable_index.get_ptr(); hashtable_index_out.ShareDataWith(*ph_index); int* hashtable_value_data = - hashtable_value_out.mutable_data<int>(dev_ctx.GetPlace()); + dev_ctx.template Alloc<int>(&hashtable_value_out); int* hashtable_index_data = - hashtable_index_out.mutable_data<int>(dev_ctx.GetPlace()); + dev_ctx.template Alloc<int>(&hashtable_index_out); BufferReindex(dev_ctx, x_data, src_outputs, diff --git a/paddle/phi/kernels/gpu/histogram_kernel.cu b/paddle/phi/kernels/gpu/histogram_kernel.cu index 02f5bbb530..47929e640d 100644 --- a/paddle/phi/kernels/gpu/histogram_kernel.cu +++ b/paddle/phi/kernels/gpu/histogram_kernel.cu @@ -85,7 +85,7 @@ void HistogramKernel(const Context& dev_ctx, const T* input_data = input.data<T>(); const int input_numel = input.numel(); - int64_t* out_data = output->mutable_data<int64_t>(dev_ctx.GetPlace()); + int64_t* out_data = dev_ctx.template Alloc<int64_t>(output); phi::funcs::SetConstant<Context, int64_t>()( dev_ctx, output, static_cast<int64_t>(0)); @@ -98,8 +98,10 @@ auto input_x = phi::EigenVector<T>::Flatten(input); DenseTensor input_min_t, input_max_t; - auto* input_min_data = input_min_t.mutable_data<T>({1}, dev_ctx.GetPlace()); - auto* input_max_data = input_max_t.mutable_data<T>({1}, dev_ctx.GetPlace()); + input_min_t.Resize({1}); + input_max_t.Resize({1}); + auto* input_min_data = dev_ctx.template Alloc<T>(&input_min_t); + auto* input_max_data = dev_ctx.template Alloc<T>(&input_max_t); auto input_min_scala = phi::EigenScalar<T>::From(input_min_t); auto input_max_scala = phi::EigenScalar<T>::From(input_max_t); diff --git a/paddle/phi/kernels/gpu/kthvalue_kernel.cu b/paddle/phi/kernels/gpu/kthvalue_kernel.cu index 1f6dc48969..b04cea2ceb 100644 --- a/paddle/phi/kernels/gpu/kthvalue_kernel.cu +++ b/paddle/phi/kernels/gpu/kthvalue_kernel.cu @@ -67,7 +67,7 @@ bool SortKthvalue(const phi::GPUContext& dev_ctx, DenseTensor temp_values, temp_indices; const T* input = input_tensor->data<T>(); T*
values = out_tensor->data(); - int64_t* indices = indices_tensor->mutable_data(dev_ctx.GetPlace()); + int64_t* indices = dev_ctx.template Alloc(indices_tensor); temp_values.Resize(dim); temp_indices.Resize(dim); sorted_values_ptr = dev_ctx.template Alloc(&temp_values); @@ -208,13 +208,16 @@ void KthvalueKernel(const Context& dev_ctx, } trans_out_dims[in_dims.size() - 1] = 1; DenseTensor trans_input; - trans_input.mutable_data(trans_dims, dev_ctx.GetPlace()); + trans_input.Resize(trans_dims); + dev_ctx.template Alloc(&trans_input); int ndims = trans.size(); funcs::TransCompute( ndims, dev_ctx, x, &trans_input, trans); DenseTensor trans_ind, trans_out; - trans_ind.mutable_data(trans_out_dims, dev_ctx.GetPlace()); - trans_out.mutable_data(trans_out_dims, dev_ctx.GetPlace()); + trans_ind.Resize(trans_out_dims); + trans_out.Resize(trans_out_dims); + dev_ctx.template Alloc(&trans_ind); + dev_ctx.template Alloc(&trans_out); const int64_t input_height = phi::product(phi::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); const int64_t input_width = trans_dims[trans_dims.size() - 1]; diff --git a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu index 16c32886e2..fcf43f9f42 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_grad_kernel.cu @@ -50,7 +50,7 @@ void PutAlongAxisGradKernel(const Context& dev_ctx, } if (value_grad) { value_grad->Resize(index.dims()); - value_grad->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(value_grad); if (index_type == DataType::INT32) { paddle::operators::gpu_gather_kernel( out_grad, diff --git a/paddle/phi/kernels/gpu/rnn_functor.h b/paddle/phi/kernels/gpu/rnn_functor.h index 59c5988986..3c82726662 100644 --- a/paddle/phi/kernels/gpu/rnn_functor.h +++ b/paddle/phi/kernels/gpu/rnn_functor.h @@ -58,7 +58,7 @@ class RNNDescriptors { template void Create(const gpuDnnHandle_t &handle, - const Place &place, + const DeviceContext &dev_ctx, const std::vector &sequence_length, size_t *workspace_size, size_t *reserve_size, @@ -103,17 +103,15 @@ class RNNDescriptors { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::miopenDropoutGetStatesSize(handle, &state_size)); - dropout_state->mutable_data({static_cast(state_size)}, - place); #else PADDLE_ENFORCE_GPU_SUCCESS( phi::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); - dropout_state->mutable_data({static_cast(state_size)}, - place); #endif + dropout_state->Resize({static_cast(state_size)}); + dev_ctx.template Alloc(dropout_state); } dropout_desc_.descriptor(handle, - place, + dev_ctx.GetPlace(), is_initialized, dropout_prob_, is_test_ ? 
nullptr : dropout_state, diff --git a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc index fe04463237..ff1d295b11 100644 --- a/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_grad_kernel.cu.cc @@ -248,7 +248,7 @@ void RnnGradKernel(const Context &dev_ctx, is_test); rnn.Create(handle, - dev_ctx.GetPlace(), + dev_ctx, SequenceLength, &workspace_size, &reserve_size, diff --git a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc index 079a159ee8..44fab87d91 100644 --- a/paddle/phi/kernels/gpu/rnn_kernel.cu.cc +++ b/paddle/phi/kernels/gpu/rnn_kernel.cu.cc @@ -280,7 +280,7 @@ void RnnKernel(const Context &dev_ctx, is_bidirec, is_test); rnn.Create(handle, - dev_ctx.GetPlace(), + dev_ctx, SequenceLength, &workspace_size, &reserve_size, diff --git a/paddle/phi/kernels/gpu/sgd_kernel.cu b/paddle/phi/kernels/gpu/sgd_kernel.cu index e3f0bf968c..b7cf9e5bad 100644 --- a/paddle/phi/kernels/gpu/sgd_kernel.cu +++ b/paddle/phi/kernels/gpu/sgd_kernel.cu @@ -82,9 +82,8 @@ void SGDDenseKernel(const Context& dev_ctx, const MPDType* master_in_data = multi_precision ? master_param->data() : nullptr; MPDType* master_out_data = - multi_precision - ? master_param_out->mutable_data(dev_ctx.GetPlace()) - : nullptr; + multi_precision ? dev_ctx.template Alloc(master_param_out) + : nullptr; int block = 512; int grid = (param.numel() + block - 1) / block; @@ -94,7 +93,7 @@ void SGDDenseKernel(const Context& dev_ctx, grad.data(), learning_rate.data(), param.numel(), - param_out->mutable_data(dev_ctx.GetPlace()), + dev_ctx.template Alloc(param_out), master_in_data, master_out_data); } @@ -119,9 +118,8 @@ void SGDDenseParamSparseGradKernel( const MPDType* master_in_data = multi_precision ? master_param->data() : nullptr; MPDType* master_out_data = - multi_precision - ? master_param_out->mutable_data(dev_ctx.GetPlace()) - : nullptr; + multi_precision ? dev_ctx.template Alloc(master_param_out) + : nullptr; PADDLE_ENFORCE_EQ( ¶m, diff --git a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu index b4a1574ee8..cc5d95a12f 100644 --- a/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/temporal_shift_grad_kernel.cu @@ -120,8 +120,8 @@ void TemporalShiftGradKernel(const Context& dev_ctx, (data_layout == DataLayout::kNCHW ? phi::make_ddim({nt, c, h, w}) : phi::make_ddim({nt, h, w, c})); const T* output_grad_data = output_grad->data(); - T* input_grad_data = - input_grad->mutable_data(in_grad_dims, dev_ctx.GetPlace()); + input_grad->Resize(in_grad_dims); + T* input_grad_data = dev_ctx.template Alloc(input_grad); int pixelNum = nt * chw; int threads = 1024; diff --git a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu index c69a8aa288..b321fad07a 100644 --- a/paddle/phi/kernels/gpu/temporal_shift_kernel.cu +++ b/paddle/phi/kernels/gpu/temporal_shift_kernel.cu @@ -120,7 +120,8 @@ void TemporalShiftKernel(const Context& dev_ctx, (data_layout == DataLayout::kNCHW ? 
phi::make_ddim({nt, c, h, w}) : phi::make_ddim({nt, h, w, c})); const T* input_data = input->data(); - T* output_data = output->mutable_data(out_dims, dev_ctx.GetPlace()); + output->Resize(out_dims); + T* output_data = dev_ctx.template Alloc(output); int pixelNum = nt * chw; int threads = 1024; diff --git a/paddle/phi/kernels/gpu/yolo_box_kernel.cu b/paddle/phi/kernels/gpu/yolo_box_kernel.cu index 8baf339f0c..a55834c6ae 100644 --- a/paddle/phi/kernels/gpu/yolo_box_kernel.cu +++ b/paddle/phi/kernels/gpu/yolo_box_kernel.cu @@ -139,9 +139,10 @@ void YoloBoxKernel(const Context& dev_ctx, const T* input_data = input->data(); const int* imgsize_data = img_size.data(); - T* boxes_data = boxes->mutable_data({n, box_num, 4}, dev_ctx.GetPlace()); - T* scores_data = - scores->mutable_data({n, box_num, class_num}, dev_ctx.GetPlace()); + boxes->Resize({n, box_num, 4}); + T* boxes_data = dev_ctx.template Alloc(boxes); + scores->Resize({n, box_num, class_num}); + T* scores_data = dev_ctx.template Alloc(scores); phi::funcs::SetConstant set_zero; set_zero(dev_ctx, boxes, static_cast(0)); set_zero(dev_ctx, scores, static_cast(0)); diff --git a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h index 49046dfa4d..160e100f2b 100644 --- a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h @@ -42,7 +42,7 @@ void DigammaGradKernel(const Context& ctx, const DenseTensor& x, const DenseTensor& out_grad, DenseTensor* x_grad) { - x_grad->mutable_data(ctx.GetPlace()); + ctx.template Alloc(x_grad); auto* dout_data = out_grad.data(); auto* x_data = x.data(); diff --git a/paddle/phi/kernels/impl/digamma_kernel_impl.h b/paddle/phi/kernels/impl/digamma_kernel_impl.h index 4547806a38..ded77ca5a8 100644 --- a/paddle/phi/kernels/impl/digamma_kernel_impl.h +++ b/paddle/phi/kernels/impl/digamma_kernel_impl.h @@ -38,7 +38,7 @@ struct DigammaFunctor { template void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { - out->mutable_data(ctx.GetPlace()); + ctx.template Alloc(out); auto* x_data = x.data(); auto* out_data = out->data(); auto numel = x.numel(); diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 28387975e6..396f1e9548 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -68,7 +68,7 @@ void AddDoubleGradImpl(const Context& dev_ctx, funcs::GetDoubleGradSafeTensor( dev_ctx, y, ddy.get_ptr(), &ddy_safe); - ddout->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(ddout); auto ddx_dims = ddx_safe.dims(); auto ddy_dims = ddy_safe.dims(); if (ddx_dims.size() >= ddy_dims.size()) { @@ -102,7 +102,7 @@ void SubtractDoubleGradImpl(const Context& dev_ctx, funcs::GetDoubleGradSafeTensor( dev_ctx, y, ddy.get_ptr(), &ddy_safe); - ddout->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(ddout); funcs::ElementwiseCompute, T>( dev_ctx, ddx_safe, ddy_safe, axis, funcs::SubtractFunctor(), ddout); } diff --git a/paddle/phi/kernels/memcpy_kernel.cc b/paddle/phi/kernels/memcpy_kernel.cc index acc87dc996..521edc26af 100644 --- a/paddle/phi/kernels/memcpy_kernel.cc +++ b/paddle/phi/kernels/memcpy_kernel.cc @@ -25,32 +25,6 @@ namespace phi { static constexpr size_t WAIT_THRESHOLD = 64 * 1024; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -template <> -void MemcpyH2DKernel(const GPUContext& dev_ctx, - const DenseTensor& x, - int 
dst_place_type, - DenseTensor* out) { - PADDLE_ENFORCE_GE( - dst_place_type, - 0, - errors::OutOfRange("dst_place_type only support 0-3, but got: %d", - dst_place_type)); - PADDLE_ENFORCE_LE( - dst_place_type, - 3, - errors::OutOfRange("dst_place_type only support 0-3, but got: %d", - dst_place_type)); - - auto stream = dev_ctx.stream(); - out->mutable_data(dev_ctx.GetPlace(), - x.dtype(), - phi::Stream(reinterpret_cast(stream))); - - Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); -} -#endif - template void MemcpyH2DKernel(const Context& dev_ctx, const DenseTensor& x, @@ -77,10 +51,6 @@ void MemcpyD2HKernel(const Context& dev_ctx, DenseTensor* out) { switch (dst_place_type) { case 0: - // NOTE(lvyongkang): phi::Copy will use DeviceContext.zero_allocator to - // alloc and assign DeviceContext.place to out, which causes place check - // fails. So we specify out's place here. - out->mutable_data(CPUPlace()); Copy(dev_ctx, x, CPUPlace(), false, out); // NOTE(copy from Aurelius84): host <-> device memory copies of a memory // block of 64 KB or less are asynchronous. See @@ -91,10 +61,6 @@ void MemcpyD2HKernel(const Context& dev_ctx, break; case 1: - // NOTE(lvyongkang): phi::Copy will use DeviceContext.zero_allocator to - // alloc and assign DeviceContext.place to out, which causes place check - // fails. So we specify out's place here. - out->mutable_data(GPUPinnedPlace()); Copy(dev_ctx, x, GPUPinnedPlace(), false, out); // paddle::memory::Copy use async copy for GPUPinnedPlace dev_ctx.Wait(); diff --git a/paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc b/paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc index a25cd0cd61..9dd8f7df08 100644 --- a/paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/elementwise_add_grad_kernel.cc @@ -74,7 +74,7 @@ void AddGradKernel(const Context& dev_ctx, } if (dy != nullptr) { - T* dy_data = dy->mutable_data(dev_ctx.GetPlace()); + T* dy_data = dev_ctx.template Alloc(dy); if (dy->dims() == dz_dims) { if (dy_data != dz_data) { int ret = xpu::copy(dev_ctx.x_context(), diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc index c5fca8881e..44c5842210 100644 --- a/paddle/phi/kernels/xpu/full_kernel.cc +++ b/paddle/phi/kernels/xpu/full_kernel.cc @@ -45,7 +45,7 @@ void TensorSetConstantXPU(phi::DenseTensor* tensor, template void FullValueXPU(const Context& dev_ctx, DenseTensor* tensor, VType val) { - tensor->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(tensor); PD_VISIT_ALL_TYPES(tensor->dtype(), "FullValueXPU", ([&] { TensorSetConstantXPU( diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index f1aa530e87..63c204af01 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -342,17 +342,17 @@ if [ "${PHI_INCLUDE_FLUID_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then check_approval 1 chenwhql YuanRisheng zyfncg fi -HAS_MODIFIED_PHI_KERNEL_FILES=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/kernels" || true` -PHI_USE_MUTABLE_DATA_FILES="" -for CHANGE_FILE in ${HAS_MODIFIED_PHI_KERNEL_FILES}; do - PHI_DIR_ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep -w "mutable_data" || true` - if [ "${PHI_DIR_ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - PHI_USE_MUTABLE_DATA_FILES="${PHI_USE_MUTABLE_DATA_FILES} ${CHANGE_FILE}" +HAS_MODIFIED_PHI_OR_FLUID_FILES=`git diff --name-only upstream/$BRANCH | grep -E "paddle/phi|paddle/fluid" || true` 
+USE_MUTABLE_DATA_FILES="" +for CHANGE_FILE in ${HAS_MODIFIED_PHI_OR_FLUID_FILES}; do + ADDED_LINES=`git diff -U0 upstream/$BRANCH -- ${PADDLE_ROOT}/${CHANGE_FILE} | grep "^+" | grep -w "mutable_data" || true` + if [ "${ADDED_LINES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + USE_MUTABLE_DATA_FILES="${USE_MUTABLE_DATA_FILES} ${CHANGE_FILE}" fi done -if [ "${PHI_USE_MUTABLE_DATA_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then - echo_line="You can not use the DenseTensor::mutable_data() method in paddle/phi/kernels files(${PHI_USE_MUTABLE_DATA_FILES}). If you want to alloc memory, use phi::DeviceContext::Alloc() or phi::DeviceContext::HostAlloc() instead and if you want to get mutable data, use DenseTensor::data(). If you have any questions, you can have one RD (chenwhql, Shixiaowei02, YuanRisheng or zyfncg) review and approve.\n" - check_approval 1 chenwhql Shixiaowei02 YuanRisheng zyfncg +if [ "${USE_MUTABLE_DATA_FILES}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + echo_line="You can not use the DenseTensor::mutable_data() method in files(${USE_MUTABLE_DATA_FILES}). If you want to alloc memory, use phi::DeviceContext::Alloc() or phi::DeviceContext::HostAlloc() instead and if you want to get mutable data, use DenseTensor::data(). If you have any questions, you can have one RD (chenwhql, Shixiaowei02, YuanRisheng, zyfncg or From00) review and approve.\n" + check_approval 1 chenwhql Shixiaowei02 YuanRisheng zyfncg From00 fi ALL_CHANGE_FILES=`git diff --numstat upstream/$BRANCH | awk '{print $3}' | grep ".py"` -- GitLab
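The change this patch applies across the phi kernels is mechanical: instead of calling `tensor->mutable_data<T>(dims, place)`, a kernel first fixes the shape with `Resize` and then requests memory from the allocator bound to the `DeviceContext`. A minimal sketch of the pattern, assuming the usual phi kernel template signature; the kernel name and the CPU-style copy loop here are illustrative and not part of the patch:

#include "paddle/phi/core/dense_tensor.h"

namespace phi {

// Hypothetical kernel illustrating the DeviceContext::Alloc pattern.
template <typename T, typename Context>
void ExampleCopyKernel(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) {
  // Before this patch: T* out_data = out->mutable_data<T>(x.dims(), dev_ctx.GetPlace());
  out->Resize(x.dims());                         // 1. set the output shape first
  T* out_data = dev_ctx.template Alloc<T>(out);  // 2. allocate via the context's allocator
  // Host-side buffers go through HostAlloc instead, as in the fc_functor.cc
  // and masked_select_kernel.cc hunks above:
  //   T* host_data = dev_ctx.template HostAlloc<T>(&host_tensor);
  const T* x_data = x.data<T>();
  for (int64_t i = 0; i < x.numel(); ++i) {
    out_data[i] = x_data[i];  // illustrative body; a real GPU kernel would launch here
  }
}

}  // namespace phi

Because `Alloc` now resolves the allocator from the context, the device_context.cc hunk falls back to the zero-size allocator only when both `numel()` and `requested_size` are zero, and the updated unit tests must bind a real allocator via `ctx.SetAllocator(...)` before any kernel allocates.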