From 4bbbed9a6c3f03ae6f852c77766444bff1cb96aa Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 7 Sep 2022 16:24:07 +0800 Subject: [PATCH] Fix fused cuda op's mutable data [2] (#45562) --- .../fluid/operators/fused/attn_bias_add.cu.h | 3 +- .../fluid/operators/fused/conv_fusion_op.cu | 2 +- .../fused/cudnn_bn_stats_finalize.cu.h | 19 +- .../operators/fused/cudnn_norm_conv.cu.h | 16 +- .../fused/cudnn_scale_bias_add_relu.cu.h | 20 ++- .../operators/fused/fused_attention_op.cu | 163 +++++++++++------- ...sed_bias_dropout_residual_layer_norm_op.cu | 39 +++-- .../operators/fused/fused_bn_activation_op.cu | 80 +++++---- .../fused/fused_bn_add_activation_op.cu | 56 +++--- .../operators/fused/fused_feedforward_op.cu | 72 ++++---- .../operators/fused/fused_gate_attention.h | 11 +- .../fused/fused_gate_attention_op.cu | 25 +-- .../operators/fused/fused_gemm_epilogue_op.cu | 11 +- .../fused/fused_multi_transformer_op.cu | 76 ++++---- .../operators/fused/fused_seqpool_cvm_op.cu | 18 +- .../fused/fusion_conv_inception_op.cu | 11 +- 16 files changed, 376 insertions(+), 246 deletions(-) diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index 2a0881ca093..2b8b857966d 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -326,7 +326,8 @@ void Launch2DColumnReduce(const phi::GPUContext& dev_ctx, } else { framework::Tensor tmp_sum; tmp_sum.Resize({grid.y, left_num}); - tmp_sum.mutable_data>(dev_ctx.GetPlace()); + dev_ctx.template Alloc>( + &tmp_sum, tmp_sum.numel() * sizeof(ReduceParamType)); BiasAddBw2DReduceKernel<<>>( d_out, diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 1ef834c3f7a..4f05e6b6e2f 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -49,7 +49,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto* bias = ctx.Input("Bias"); auto* residual = ctx.Input("ResidualData"); auto* output = ctx.Output("Output"); - output->mutable_data(ctx.GetPlace()); + dev_ctx.template Alloc(output, output->numel() * sizeof(T)); std::vector strides = ctx.Attr>("strides"); std::vector paddings = ctx.Attr>("paddings"); diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 628642b9563..86588331ec2 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -84,7 +84,6 @@ class CudnnBNStatsFinalize { float momentum, int64_t ele_count, bool is_train) { - auto place = ctx.GetPlace(); if (is_train) { TrainInit(ctx); } else { @@ -98,12 +97,18 @@ class CudnnBNStatsFinalize { const_cast(sum_of_squares.data()); float *scale_ptr = const_cast(scale.data()); float *bias_ptr = const_cast(bias.data()); - float *saved_mean_ptr = saved_mean->mutable_data(place); - float *saved_invstd_ptr = saved_invstd->mutable_data(place); - float *running_mean_ptr = running_mean->mutable_data(place); - float *running_var_ptr = running_var->mutable_data(place); - T *equiv_scale_ptr = equiv_scale->mutable_data(place); - T *equiv_bias_ptr = equiv_bias->mutable_data(place); + float *saved_mean_ptr = ctx.template Alloc( + saved_mean, saved_mean->numel() * sizeof(float)); + float *saved_invstd_ptr = ctx.template Alloc( + saved_invstd, saved_invstd->numel() * sizeof(float)); + float *running_mean_ptr = ctx.template Alloc( + 
running_mean, running_mean->numel() * sizeof(float)); + float *running_var_ptr = ctx.template Alloc( + running_var, running_var->numel() * sizeof(float)); + T *equiv_scale_ptr = + ctx.template Alloc(equiv_scale, equiv_scale->numel() * sizeof(T)); + T *equiv_bias_ptr = + ctx.template Alloc(equiv_bias, equiv_bias->numel() * sizeof(T)); op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr); op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_MEAN, running_mean_ptr); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 34cf677223c..cde4ed06142 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -193,7 +193,6 @@ class CudnnNormConvolution { Tensor *sum, Tensor *sum_of_squares) { auto cudnn_handle = ctx.cudnn_handle(); - auto place = ctx.GetPlace(); CudnnFusionOp *fwd_op = GetForwardOp(ctx); size_t workspace_size = RoundUp( @@ -210,9 +209,11 @@ class CudnnNormConvolution { CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size); // output ptr - T *output_ptr = output->mutable_data(place); - float *sum_ptr = sum->mutable_data(place); - float *sum_of_squares_ptr = sum_of_squares->mutable_data(place); + T *output_ptr = ctx.template Alloc(output, output->numel() * sizeof(T)); + float *sum_ptr = + ctx.template Alloc(sum, sum->numel() * sizeof(float)); + float *sum_of_squares_ptr = ctx.template Alloc( + sum_of_squares, sum_of_squares->numel() * sizeof(float)); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); @@ -311,17 +312,18 @@ class CudnnNormConvolutionGrad { Tensor *input_grad, Tensor *filter_grad, bool use_addto = false) { - auto place = ctx.GetPlace(); T *input_ptr = const_cast(input.data()); T *filter_ptr = const_cast(filter.data()); T *output_grad_ptr = const_cast(output_grad.data()); if (filter_grad) { - T *filter_grad_ptr = filter_grad->mutable_data(place); + T *filter_grad_ptr = + ctx.template Alloc(filter_grad, filter_grad->numel() * sizeof(T)); BackwardFilter(ctx, output_grad_ptr, input_ptr, filter_grad_ptr); } if (input_grad) { - T *input_grad_ptr = input_grad->mutable_data(place); + T *input_grad_ptr = + ctx.template Alloc(input_grad, input_grad->numel() * sizeof(T)); BackwardData(ctx, output_grad_ptr, filter_ptr, input_grad_ptr, use_addto); } } diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index b25605c6ca0..60cf314c5ea 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -127,7 +127,6 @@ class CudnnScaleBiasAddRelu { Tensor *bitmask) { ForwardInit(ctx); auto handle = ctx.cudnn_handle(); - auto place = ctx.GetPlace(); auto workspace_handle = ctx.cudnn_workspace_handle(); fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle); // Set variant_param @@ -156,8 +155,9 @@ class CudnnScaleBiasAddRelu { CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_); // output ptr - T *out_ptr = out->mutable_data(place); - int32_t *bitmask_ptr = bitmask->mutable_data(place); + T *out_ptr = ctx.template Alloc(out, out->numel() * sizeof(T)); + int32_t *bitmask_ptr = ctx.template Alloc( + bitmask, bitmask->numel() * sizeof(int32_t)); 
fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, out_ptr); fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr); @@ -186,7 +186,6 @@ class CudnnScaleBiasAddRelu { double eps) { BackwardInit(ctx); auto handle = ctx.cudnn_handle(); - auto place = ctx.GetPlace(); auto workspace_handle = ctx.cudnn_workspace_handle(); bwd_workspace_byte_ = bwd_op_.GetWorkspaceSizeInBytes(handle); // Set variant_param @@ -199,10 +198,15 @@ class CudnnScaleBiasAddRelu { float *saved_invstd_ptr = const_cast(saved_invstd.data()); int32_t *bitmask_ptr = bitmask ? const_cast(bitmask->data()) : nullptr; - T *dx_ptr = dx->mutable_data(place); - T *dz_ptr = dz ? dz->mutable_data(place) : nullptr; - float *dscale_ptr = dscale ? dscale->mutable_data(place) : nullptr; - float *dbias_ptr = dbias ? dbias->mutable_data(place) : nullptr; + T *dx_ptr = ctx.template Alloc(dx, dx->numel() * sizeof(T)); + T *dz_ptr = + dz ? ctx.template Alloc(dz, dz->numel() * sizeof(T)) : nullptr; + float *dscale_ptr = dscale ? ctx.template Alloc( + dscale, dscale->numel() * sizeof(float)) + : nullptr; + float *dbias_ptr = + dbias ? ctx.template Alloc(dbias, dbias->numel() * sizeof(float)) + : nullptr; bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, dy_ptr); diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index ed904df93df..059d94031ac 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -64,7 +64,7 @@ static void AllReduce(framework::Tensor &tensor, // NOLINT int64_t numel = tensor.numel(); const void *sendbuff = tensor.data(); auto place = ctx.GetPlace(); - void *recvbuff = tensor.mutable_data(place); + void *recvbuff = ctx.template Alloc(&tensor, tensor.numel() * sizeof(T)); auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); auto stream = ctx.stream(); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( @@ -83,7 +83,7 @@ class FusedAttentionOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override { using U = LayerNormParamType; auto *input_x = ctx.Input("X"); - + auto &dev_ctx = ctx.template device_context(); const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); const float epsilon = ctx.Attr("epsilon"); auto *ln_scale = ctx.Input("LnScale"); @@ -145,40 +145,53 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *x_data = input_x->data(); auto *qkv_weight_data = qkv_weight->data(); auto *qkv_bias_data = (qkv_bias == nullptr) ? nullptr : qkv_bias->data(); - auto *qkv_out_data = qkv_out->mutable_data(ctx.GetPlace()); + auto *qkv_out_data = + dev_ctx.template Alloc(qkv_out, qkv_out->numel() * sizeof(T)); auto *qkv_bias_out_data = - (qkv_bias == nullptr) ? nullptr - : qkv_bias_out->mutable_data(ctx.GetPlace()); + (qkv_bias == nullptr) + ? nullptr + : dev_ctx.template Alloc(qkv_bias_out, + qkv_bias_out->numel() * sizeof(T)); // get data ptr for FMHA. - auto *transpose_out_2_data = - transpose_out_2->mutable_data(ctx.GetPlace()); + auto *transpose_out_2_data = dev_ctx.template Alloc( + transpose_out_2, transpose_out_2->numel() * sizeof(T)); auto *cache_kv_out_data = (cache_kv_out == nullptr) ? 
nullptr - : cache_kv_out->mutable_data(ctx.GetPlace()); - auto *qk_out_data = qk_out->mutable_data(ctx.GetPlace()); - auto *qktv_out_data = qktv_out->mutable_data(ctx.GetPlace()); + : dev_ctx.template Alloc(cache_kv_out, + cache_kv_out->numel() * sizeof(T)); + auto *qk_out_data = + dev_ctx.template Alloc(qk_out, qk_out->numel() * sizeof(T)); + auto *qktv_out_data = + dev_ctx.template Alloc(qktv_out, qktv_out->numel() * sizeof(T)); auto *src_mask_out_data = - (src_mask == nullptr) ? nullptr - : src_mask_out->mutable_data(ctx.GetPlace()); - auto *softmax_out_data = softmax_out->mutable_data(ctx.GetPlace()); - auto *attn_dropout_mask_out_data = - attn_dropout_mask_out->mutable_data(ctx.GetPlace()); - auto *attn_dropout_out_data = - attn_dropout_out->mutable_data(ctx.GetPlace()); - auto *fmha_out_data = fmha_out->mutable_data(ctx.GetPlace()); + (src_mask == nullptr) + ? nullptr + : dev_ctx.template Alloc(src_mask_out, + src_mask_out->numel() * sizeof(T)); + auto *softmax_out_data = dev_ctx.template Alloc( + softmax_out, softmax_out->numel() * sizeof(T)); + auto *attn_dropout_mask_out_data = dev_ctx.template Alloc( + attn_dropout_mask_out, + attn_dropout_mask_out->numel() * sizeof(uint8_t)); + auto *attn_dropout_out_data = dev_ctx.template Alloc( + attn_dropout_out, attn_dropout_out->numel() * sizeof(T)); + auto *fmha_out_data = + dev_ctx.template Alloc(fmha_out, fmha_out->numel() * sizeof(T)); // get data ptr for out_linear. auto *out_linear_weight_data = out_linear_weight->data(); auto *out_linear_bias_data = (out_linear_bias == nullptr) ? nullptr : out_linear_bias->data(); - auto *out_linear_out_data = out_linear_out->mutable_data(ctx.GetPlace()); + auto *out_linear_out_data = dev_ctx.template Alloc( + out_linear_out, out_linear_out->numel() * sizeof(T)); // get data ptr for bias+dropout+residual+layernorm - auto *dropout_mask_out_data = - dropout_mask_out->mutable_data(ctx.GetPlace()); - auto *final_out_data = out->mutable_data(ctx.GetPlace()); + auto *dropout_mask_out_data = dev_ctx.template Alloc( + dropout_mask_out, dropout_mask_out->numel() * sizeof(uint8_t)); + auto *final_out_data = + dev_ctx.template Alloc(out, out->numel() * sizeof(T)); int batch_size = input_x_dims[0]; int max_seq_len = input_x_dims[1]; @@ -248,9 +261,12 @@ class FusedAttentionOpKernel : public framework::OpKernel { auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data()); auto *ln_bias_data = (ln_bias == nullptr ? nullptr : ln_bias->data()); - auto *ln_mean_data = ln_mean->mutable_data(ctx.GetPlace()); - auto *ln_var_data = ln_var->mutable_data(ctx.GetPlace()); - auto *ln_out_data = ln_out->mutable_data(ctx.GetPlace()); + auto *ln_mean_data = + dev_ctx.template Alloc(ln_mean, ln_mean->numel() * sizeof(U)); + auto *ln_var_data = + dev_ctx.template Alloc(ln_var, ln_var->numel() * sizeof(U)); + auto *ln_out_data = + dev_ctx.template Alloc(ln_out, ln_out->numel() * sizeof(T)); layer_norm_compute.ComputeForward(x_data, ln_scale_data, @@ -321,10 +337,13 @@ class FusedAttentionOpKernel : public framework::OpKernel { const U *ln_scale_2_ptr = ln_scale_2 ? ln_scale_2->data() : nullptr; const U *ln_bias_2_ptr = ln_bias_2 ? 
ln_bias_2->data() : nullptr; - T *bias_dropout_residual_out_ptr = - bias_dropout_residual_out->mutable_data(ctx.GetPlace()); - U *ln_mean_2_ptr = ln_mean_2->mutable_data(ctx.GetPlace()); - U *ln_var_2_ptr = ln_var_2->mutable_data(ctx.GetPlace()); + T *bias_dropout_residual_out_ptr = dev_ctx.template Alloc( + bias_dropout_residual_out, + bias_dropout_residual_out->numel() * sizeof(T)); + U *ln_mean_2_ptr = + dev_ctx.template Alloc(ln_mean_2, ln_mean_2->numel() * sizeof(U)); + U *ln_var_2_ptr = + dev_ctx.template Alloc(ln_var_2, ln_var_2->numel() * sizeof(U)); // output = layernorm(residual + dropout(input + bias)) fused_dropout_layernorm_helper.LayernormResidualDropoutBias( ctx.cuda_device_context(), @@ -352,6 +371,7 @@ class FusedAttentionGradKernel : public framework::OpKernel { const float ln2epsilon = ctx.Attr("ln_epsilon"); float attn_dropout_prob = ctx.Attr("attn_dropout_rate"); + auto &dev_ctx = ctx.template device_context(); bool is_test_1 = ctx.Attr("is_test"); auto &dropout_implementation_1 = ctx.Attr("attn_dropout_implementation"); @@ -432,29 +452,37 @@ class FusedAttentionGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("OutLinearOut")); auto *d_bias_dropout_residual_out = ctx.Output(framework::GradVarName("BiasDropoutResidualOut")); - auto *d_x_data = d_x->mutable_data(ctx.GetPlace()); + auto *d_x_data = dev_ctx.template Alloc(d_x, d_x->numel() * sizeof(T)); // when qkv_bias is not nullptr, d_qkv_out is equals to d_qkv_bias_out, the // space can be reused. auto *d_qkv_out_data = (d_qkv_bias_out != nullptr) ? nullptr - : d_qkv_out->mutable_data(ctx.GetPlace()); + : dev_ctx.template Alloc( + d_qkv_out, d_qkv_out->numel() * sizeof(T)); auto *d_qkv_bias_out_data = (d_qkv_bias_out == nullptr) ? nullptr - : d_qkv_bias_out->mutable_data(ctx.GetPlace()); - auto *d_qktv_out_data = d_qktv_out->mutable_data(ctx.GetPlace()); - auto *d_transpose_out_2_data = - d_transpose_out_2->mutable_data(ctx.GetPlace()); - auto *d_qk_out_data = d_qk_out->mutable_data(ctx.GetPlace()); - auto *d_softmax_out_data = d_softmax_out->mutable_data(ctx.GetPlace()); - auto *d_attn_dropout_out_data = - d_attn_dropout_out->mutable_data(ctx.GetPlace()); + : dev_ctx.template Alloc(d_qkv_bias_out, + d_qkv_bias_out->numel() * sizeof(T)); + auto *d_qktv_out_data = + dev_ctx.template Alloc(d_qktv_out, d_qktv_out->numel() * sizeof(T)); + auto *d_transpose_out_2_data = dev_ctx.template Alloc( + d_transpose_out_2, d_transpose_out_2->numel() * sizeof(T)); + auto *d_qk_out_data = + dev_ctx.template Alloc(d_qk_out, d_qk_out->numel() * sizeof(T)); + auto *d_softmax_out_data = dev_ctx.template Alloc( + d_softmax_out, d_softmax_out->numel() * sizeof(T)); + auto *d_attn_dropout_out_data = dev_ctx.template Alloc( + d_attn_dropout_out, d_attn_dropout_out->numel() * sizeof(T)); auto *d_src_mask_out_data = - (src_mask == nullptr) ? nullptr - : d_src_mask_out->mutable_data(ctx.GetPlace()); - auto *d_fmha_out_data = d_fmha_out->mutable_data(ctx.GetPlace()); - auto *d_out_linear_out_data = - d_out_linear_out->mutable_data(ctx.GetPlace()); + (src_mask == nullptr) + ? 
nullptr + : dev_ctx.template Alloc(d_src_mask_out, + d_src_mask_out->numel() * sizeof(T)); + auto *d_fmha_out_data = + dev_ctx.template Alloc(d_fmha_out, d_fmha_out->numel() * sizeof(T)); + auto *d_out_linear_out_data = dev_ctx.template Alloc( + d_out_linear_out, d_out_linear_out->numel() * sizeof(T)); // parameter grad auto *d_qkv_weight = ctx.Output(framework::GradVarName("QKVW")); @@ -466,16 +494,20 @@ class FusedAttentionGradKernel : public framework::OpKernel { auto *d_ln_2_scale = ctx.Output(framework::GradVarName("Ln2Scale")); auto *d_ln_2_bias = ctx.Output(framework::GradVarName("Ln2Bias")); - auto *d_qkv_weight_data = d_qkv_weight->mutable_data(ctx.GetPlace()); - auto *d_qkv_bias_data = (d_qkv_bias == nullptr) - ? nullptr - : d_qkv_bias->mutable_data(ctx.GetPlace()); - auto *d_out_linear_weight_data = - d_out_linear_weight->mutable_data(ctx.GetPlace()); + auto *d_qkv_weight_data = dev_ctx.template Alloc( + d_qkv_weight, d_qkv_weight->numel() * sizeof(T)); + auto *d_qkv_bias_data = + (d_qkv_bias == nullptr) + ? nullptr + : dev_ctx.template Alloc(d_qkv_bias, + d_qkv_bias->numel() * sizeof(T)); + auto *d_out_linear_weight_data = dev_ctx.template Alloc( + d_out_linear_weight, d_out_linear_weight->numel() * sizeof(T)); auto *d_out_linear_bias_data = (d_out_linear_bias == nullptr) ? nullptr - : d_out_linear_bias->mutable_data(ctx.GetPlace()); + : dev_ctx.template Alloc(d_out_linear_bias, + d_out_linear_bias->numel() * sizeof(T)); const auto input_x_dims = input_x->dims(); const auto qkv_w_dims = qkv_weight->dims(); @@ -496,7 +528,8 @@ class FusedAttentionGradKernel : public framework::OpKernel { T *d_residual_data = nullptr; if (add_residual) { d_residual.Resize(input_x_dims); - d_residual_data = d_residual.mutable_data(ctx.GetPlace()); + d_residual_data = dev_ctx.template Alloc( + &d_residual, d_residual.numel() * sizeof(T)); } bool transA = false; @@ -560,13 +593,16 @@ class FusedAttentionGradKernel : public framework::OpKernel { auto *d_ln_2_scale_data = (d_ln_2_scale == nullptr ? nullptr - : d_ln_2_scale->mutable_data(ctx.GetPlace())); + : dev_ctx.template Alloc(d_ln_2_scale, + d_ln_2_scale->numel() * sizeof(U))); auto *d_ln_2_bias_data = (d_ln_2_bias == nullptr ? nullptr - : d_ln_2_bias->mutable_data(ctx.GetPlace())); - auto *d_bias_dropout_residual_out_data = - d_bias_dropout_residual_out->mutable_data(ctx.GetPlace()); + : dev_ctx.template Alloc(d_ln_2_bias, + d_ln_2_bias->numel() * sizeof(U))); + auto *d_bias_dropout_residual_out_data = dev_ctx.template Alloc( + d_bias_dropout_residual_out, + d_bias_dropout_residual_out->numel() * sizeof(T)); fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad( ctx.cuda_device_context(), @@ -638,13 +674,18 @@ class FusedAttentionGradKernel : public framework::OpKernel { auto *d_ln_out = ctx.Output(framework::GradVarName("LnOut")); auto *d_ln_scale = ctx.Output(framework::GradVarName("LnScale")); auto *d_ln_bias = ctx.Output(framework::GradVarName("LnBias")); - auto *d_ln_out_data = d_ln_out->mutable_data(ctx.GetPlace()); + auto *d_ln_out_data = + dev_ctx.template Alloc(d_ln_out, d_ln_out->numel() * sizeof(T)); auto *d_ln_scale_data = - (d_ln_scale == nullptr ? nullptr - : d_ln_scale->mutable_data(ctx.GetPlace())); + (d_ln_scale == nullptr + ? nullptr + : dev_ctx.template Alloc(d_ln_scale, + d_ln_scale->numel() * sizeof(U))); auto *d_ln_bias_data = - (d_ln_bias == nullptr ? nullptr - : d_ln_bias->mutable_data(ctx.GetPlace())); + (d_ln_bias == nullptr + ? 
nullptr + : dev_ctx.template Alloc(d_ln_bias, + d_ln_bias->numel() * sizeof(U))); if (qkv_bias != nullptr) { qkv_compute.ComputeBackward(ln_out, qkv_weight, diff --git a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu index 2d6260c920d..b194f07c848 100644 --- a/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu +++ b/paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu @@ -31,6 +31,7 @@ template class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { + auto &dev_ctx = ctx.template device_context(); using U = LayerNormParamType; auto *input_x = ctx.Input("X"); auto *bias = ctx.Input("Bias"); @@ -50,12 +51,14 @@ class FusedBiasDropoutResidualLnOpKernel : public framework::OpKernel { auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data()); auto *ln_bias_data = (ln_bias == nullptr ? nullptr : ln_bias->data()); auto *bias_dropout_residual_out_data = - bias_dropout_residual_out->mutable_data(ctx.GetPlace()); - auto *ln_mean_data = ln_mean->mutable_data(ctx.GetPlace()); - auto *ln_var_data = ln_var->mutable_data(ctx.GetPlace()); - auto *dropout_mask_out_data = - dropout_mask_out->mutable_data(ctx.GetPlace()); - auto *y_data = y->mutable_data(ctx.GetPlace()); + dev_ctx.Alloc(bias_dropout_residual_out, + bias_dropout_residual_out->numel() * sizeof(T)); + auto *ln_mean_data = + dev_ctx.Alloc(ln_mean, ln_mean->numel() * sizeof(U)); + auto *ln_var_data = dev_ctx.Alloc(ln_var, ln_var->numel() * sizeof(U)); + auto *dropout_mask_out_data = dev_ctx.Alloc( + dropout_mask_out, dropout_mask_out->numel() * sizeof(uint8_t)); + auto *y_data = dev_ctx.Alloc(y, y->numel() * sizeof(T)); const auto input_x_dims = input_x->dims(); int bsz_seq = 1; @@ -92,7 +95,7 @@ class FusedBiasDropoutResidualLnGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override { using U = LayerNormParamType; const float ln_epsilon = ctx.Attr("ln_epsilon"); - + auto &dev_ctx = ctx.template device_context(); auto *d_y = ctx.Input(framework::GradVarName("Y")); auto *ln_scale = ctx.Input("LnScale"); auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); @@ -114,18 +117,24 @@ class FusedBiasDropoutResidualLnGradKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("BiasDropoutResidualOut")); auto *d_ln_scale = ctx.Output(framework::GradVarName("LnScale")); auto *d_ln_bias = ctx.Output(framework::GradVarName("LnBias")); - auto *d_x_data = d_x->mutable_data(ctx.GetPlace()); - auto *d_residual_data = d_residual->mutable_data(ctx.GetPlace()); + auto *d_x_data = dev_ctx.Alloc(d_x, d_x->numel() * sizeof(T)); + auto *d_residual_data = + dev_ctx.Alloc(d_residual, d_residual->numel() * sizeof(T)); auto *d_bias_dropout_residual_out_data = - d_bias_dropout_residual_out->mutable_data(ctx.GetPlace()); + dev_ctx.Alloc(d_bias_dropout_residual_out, + d_bias_dropout_residual_out->numel() * sizeof(T)); auto *d_bias_data = - (d_bias == nullptr ? nullptr : d_bias->mutable_data(ctx.GetPlace())); + (d_bias == nullptr + ? nullptr + : dev_ctx.Alloc(d_bias, d_bias->numel() * sizeof(T))); auto *d_ln_scale_data = - (d_ln_scale == nullptr ? nullptr - : d_ln_scale->mutable_data(ctx.GetPlace())); + (d_ln_scale == nullptr + ? nullptr + : dev_ctx.Alloc(d_ln_scale, d_ln_scale->numel() * sizeof(U))); auto *d_ln_bias_data = - (d_ln_bias == nullptr ? 
nullptr - : d_ln_bias->mutable_data(ctx.GetPlace())); + (d_ln_bias == nullptr + ? nullptr + : dev_ctx.Alloc(d_ln_bias, d_ln_bias->numel() * sizeof(U))); const auto input_x_dims = d_y->dims(); int bsz_seq = 1; diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 53984707d50..1a22de67b53 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -45,6 +45,7 @@ class FusedBatchNormActKernel platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + auto &dev_ctx = ctx.template device_context(); double epsilon = static_cast(ctx.Attr("epsilon")); float momentum = ctx.Attr("momentum"); std::string act_type = ctx.Attr("act_type"); @@ -73,22 +74,26 @@ class FusedBatchNormActKernel // initialize them. auto *mean_out = ctx.Output("MeanOut"); auto *variance_out = ctx.Output("VarianceOut"); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); + dev_ctx.Alloc>( + mean_out, mean_out->numel() * sizeof(BatchNormParamType)); + dev_ctx.Alloc>( + variance_out, variance_out->numel() * sizeof(BatchNormParamType)); auto *saved_mean = ctx.Output("SavedMean"); auto *saved_variance = ctx.Output("SavedVariance"); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); + dev_ctx.Alloc>( + saved_mean, saved_mean->numel() * sizeof(BatchNormParamType)); + dev_ctx.Alloc>( + saved_variance, + saved_variance->numel() * sizeof(BatchNormParamType)); auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); + dev_ctx.Alloc(y, y->numel() * sizeof(T)); int N, C, H, W, D; const DataLayout data_layout = DataLayout::kNHWC; ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - auto &dev_ctx = ctx.template device_context(); if ((N * H * W * D) == 1) { // Only 1 element in normalization dimension, // skip the batch norm calculation, let y = act(x). 
@@ -172,10 +177,17 @@ class FusedBatchNormActKernel /*xDesc=*/data_desc_, /*sizeInBytes=*/&reserve_space_size)); - reserve_space_ptr = reserve_space->mutable_data( - ctx.GetPlace(), x->dtype(), reserve_space_size); - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), x->dtype(), workspace_size); + reserve_space->Resize({static_cast( + (reserve_space_size + experimental::SizeOf(x->dtype()) - 1) / + experimental::SizeOf(x->dtype()))}); + reserve_space_ptr = + dev_ctx.Alloc(reserve_space, reserve_space->numel() * sizeof(T)); + workspace_tensor.Resize({static_cast( + (workspace_size + experimental::SizeOf(x->dtype()) - 1) / + experimental::SizeOf(x->dtype()))}); + workspace_ptr = dev_ctx.Alloc(&workspace_tensor, + workspace_tensor.numel() * sizeof(T)); + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, @@ -193,15 +205,18 @@ class FusedBatchNormActKernel scale->template data>(), bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), + dev_ctx.template Alloc>( + mean_out, mean_out->numel() * sizeof(BatchNormParamType)), + dev_ctx.template Alloc>( + variance_out, + variance_out->numel() * sizeof(BatchNormParamType)), epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()), + dev_ctx.template Alloc>( + saved_mean, + saved_mean->numel() * sizeof(BatchNormParamType)), + dev_ctx.template Alloc>( + saved_variance, + saved_variance->numel() * sizeof(BatchNormParamType)), activation_desc_, workspace_ptr, workspace_size, @@ -227,7 +242,7 @@ class FusedBatchNormActGradKernel platform::errors::PreconditionNotMet("It must use CUDAPlace.")); double epsilon = static_cast(ctx.Attr("epsilon")); std::string act_type = ctx.Attr("act_type"); - + auto &dev_ctx = ctx.template device_context(); const auto *x = ctx.Input("X"); const auto *y = ctx.Input("Y"); const auto *d_y = ctx.Input(framework::GradVarName("Y")); @@ -250,14 +265,16 @@ class FusedBatchNormActGradKernel auto *d_scale = ctx.Output(framework::GradVarName("Scale")); auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - d_x->mutable_data(ctx.GetPlace()); + dev_ctx.Alloc(d_x, d_x->numel() * sizeof(T)); PADDLE_ENFORCE_EQ( d_scale && d_bias, true, platform::errors::PreconditionNotMet( "Both the scale grad and the bias grad must not be null.")); - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); + dev_ctx.Alloc>( + d_scale, d_scale->numel() * sizeof(BatchNormParamType)); + dev_ctx.Alloc>( + d_bias, d_bias->numel() * sizeof(BatchNormParamType)); PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL, platform::errors::PreconditionNotMet( @@ -268,7 +285,6 @@ class FusedBatchNormActGradKernel platform::errors::PreconditionNotMet( "The size of scale is equal to the channel of Input(X).")); - auto &dev_ctx = ctx.template device_context(); if ((N * H * W * D) == 1) { if (act_type == "relu") { auto x_v = framework::EigenVector::Flatten(*x); @@ -344,8 +360,11 @@ class FusedBatchNormActGradKernel /*activationDesc=*/activation_desc_, /*sizeInBytes=*/&workspace_size)); - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), x->type(), workspace_size); + workspace_tensor.Resize({static_cast( + (workspace_size + experimental::SizeOf(x->dtype()) - 1) / + experimental::SizeOf(x->dtype()))}); + workspace_ptr = dev_ctx.Alloc(&workspace_tensor, + workspace_tensor.numel() * sizeof(T)); PADDLE_ENFORCE_GPU_SUCCESS( 
platform::dynload::cudnnBatchNormalizationBackwardEx( @@ -365,16 +384,17 @@ class FusedBatchNormActGradKernel /*dzDesc=*/nullptr, /*dzData=*/nullptr, /*dxDesc=*/data_desc_, - /*dxData=*/d_x->template mutable_data(ctx.GetPlace()), + /*dxData=*/ + dev_ctx.template Alloc(d_x, d_x->numel() * sizeof(T)), /*dBnScaleBiasDesc=*/bn_param_desc_, /*bnScaleData=*/scale->template data>(), /*bnBiasData=*/bias->template data>(), /*dBnScaleData=*/ - d_scale->template mutable_data>( - ctx.GetPlace()), + dev_ctx.template Alloc>( + d_scale, d_scale->numel() * sizeof(BatchNormParamType)), /*dBnBiasData=*/ - d_bias->template mutable_data>( - ctx.GetPlace()), + dev_ctx.template Alloc>( + d_bias, d_bias->numel() * sizeof(BatchNormParamType)), /*epsilon=*/epsilon, /*savedMean=*/saved_mean_data, /*savedInvVariance=*/saved_var_data, diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index 23dbbe2ad08..6d541f07842 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -23,6 +23,7 @@ #include "paddle/fluid/operators/norm_utils.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/common/data_type.h" #include "paddle/phi/kernels/funcs/math_function.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -44,6 +45,7 @@ class FusedBatchNormAddActKernel platform::is_gpu_place(ctx.GetPlace()), true, platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + auto &dev_ctx = ctx.template device_context(); double epsilon = static_cast(ctx.Attr("epsilon")); float momentum = ctx.Attr("momentum"); std::string act_type = ctx.Attr("act_type"); @@ -66,23 +68,26 @@ class FusedBatchNormAddActKernel auto *mean_out = ctx.Output("MeanOut"); auto *variance_out = ctx.Output("VarianceOut"); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); + dev_ctx.Alloc>( + mean_out, mean_out->numel() * sizeof(BatchNormParamType)); + dev_ctx.Alloc>( + variance_out, variance_out->numel() * sizeof(BatchNormParamType)); auto *saved_mean = ctx.Output("SavedMean"); auto *saved_variance = ctx.Output("SavedVariance"); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); + dev_ctx.Alloc>( + saved_mean, saved_mean->numel() * sizeof(BatchNormParamType)); + dev_ctx.Alloc>( + saved_variance, + saved_variance->numel() * sizeof(BatchNormParamType)); auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); + dev_ctx.Alloc(y, y->numel() * sizeof(T)); int N, C, H, W, D; const DataLayout data_layout = DataLayout::kNHWC; ExtractNCWHD(in_dims, data_layout, &N, &C, &H, &W, &D); - auto &dev_ctx = ctx.template device_context(); - // ------------------- cudnn descriptors --------------------- auto handle = dev_ctx.cudnn_handle(); cudnnTensorDescriptor_t data_desc_; @@ -149,10 +154,17 @@ class FusedBatchNormAddActKernel /*xDesc=*/data_desc_, /*sizeInBytes=*/&reserve_space_size)); - reserve_space_ptr = reserve_space->mutable_data( - ctx.GetPlace(), x->dtype(), reserve_space_size); - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), x->dtype(), workspace_size); + reserve_space->Resize({static_cast( + (reserve_space_size + experimental::SizeOf(x->dtype()) - 1) / + experimental::SizeOf(x->dtype()))}); + reserve_space_ptr = + dev_ctx.Alloc(reserve_space, reserve_space->numel() * sizeof(T)); + workspace_tensor.Resize({static_cast( + 
(workspace_size + experimental::SizeOf(x->dtype()) - 1) / + experimental::SizeOf(x->dtype()))}); + workspace_ptr = dev_ctx.Alloc(&workspace_tensor, + workspace_tensor.numel() * sizeof(T)); + PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTrainingEx( handle, @@ -170,15 +182,18 @@ class FusedBatchNormAddActKernel scale->template data>(), bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), + dev_ctx.template Alloc>( + mean_out, mean_out->numel() * sizeof(BatchNormParamType)), + dev_ctx.template Alloc>( + variance_out, + variance_out->numel() * sizeof(BatchNormParamType)), epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()), + dev_ctx.template Alloc>( + saved_mean, + saved_mean->numel() * sizeof(BatchNormParamType)), + dev_ctx.template Alloc>( + saved_variance, + saved_variance->numel() * sizeof(BatchNormParamType)), activation_desc_, workspace_ptr, workspace_size, @@ -212,6 +227,7 @@ class FusedBatchNormAddActGradKernel const auto *bias = ctx.Input("Bias"); const auto *reserve_space = ctx.Input("ReserveSpace"); + auto &dev_ctx = ctx.template device_context(); const auto &in_dims = x->dims(); int N, C, H, W, D; @@ -243,8 +259,6 @@ class FusedBatchNormAddActGradKernel platform::errors::PreconditionNotMet( "The size of scale is equal to the channel of Input(X).")); - auto &dev_ctx = ctx.template device_context(); - std::vector dims = {N, C, H, W, D}; std::vector strides = {H * W * C * D, 1, W * D * C, D * C, C}; // ------------------- cudnn descriptors --------------------- diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 60b5ecfdd74..33d1e89bf28 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -57,7 +57,7 @@ static void AllReduce(framework::Tensor& tensor, // NOLINT int64_t numel = tensor.numel(); const void* sendbuff = tensor.data(); auto place = ctx.GetPlace(); - void* recvbuff = tensor.mutable_data(place); + void* recvbuff = ctx.Alloc(&tensor, tensor.numel() * sizeof(T)); auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); auto stream = ctx.stream(); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( @@ -125,7 +125,6 @@ class FusedFeedForwardKernel : public framework::OpKernel { FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx, bsz_seq, d_model, dropout_param2, epsilon2); - auto place = ctx.GetPlace(); using U = LayerNormParamType; const framework::Tensor* in = &x; @@ -158,7 +157,8 @@ class FusedFeedForwardKernel : public framework::OpKernel { dropout1_out->data(), dropout1_mask->data()); framework::Tensor linear2_out; - linear2_out.mutable_data({bsz_seq, d_model}, place); + linear2_out.Resize({bsz_seq, d_model}); + ctx.Alloc(&linear2_out, linear2_out.numel() * sizeof(T)); MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out); // tensor model parallel @@ -203,6 +203,7 @@ class FusedFeedForwardKernel : public framework::OpKernel { auto* linear2_weight = context.Input("Linear2Weight"); auto* linear2_bias = context.Input("Linear2Bias"); const bool pre_layer_norm = context.Attr("pre_layer_norm"); + auto& dev_ctx = context.template device_context(); auto* ln1_scale = pre_layer_norm ? 
context.Input("Ln1Scale") : nullptr; @@ -245,22 +246,23 @@ class FusedFeedForwardKernel : public framework::OpKernel { DropoutParam dropout_param2(context, 2); using U = LayerNormParamType; - auto place = context.GetPlace(); - out->mutable_data(place); - dropout1_mask->mutable_data(place); - dropout2_mask->mutable_data(place); + dev_ctx.Alloc(out, out->numel() * sizeof(T)); + dev_ctx.Alloc(dropout1_mask, + dropout1_mask->numel() * sizeof(uint8_t)); + dev_ctx.Alloc(dropout2_mask, + dropout2_mask->numel() * sizeof(uint8_t)); if (pre_layer_norm) { - ln1_mean->mutable_data(place); - ln1_variance->mutable_data(place); - ln1_out->mutable_data(place); + dev_ctx.Alloc(ln1_mean, ln1_mean->numel() * sizeof(U)); + dev_ctx.Alloc(ln1_variance, ln1_variance->numel() * sizeof(U)); + dev_ctx.Alloc(ln1_out, ln1_out->numel() * sizeof(T)); } else { - ln2_mean->mutable_data(place); - ln2_variance->mutable_data(place); + dev_ctx.Alloc(ln2_mean, ln2_mean->numel() * sizeof(U)); + dev_ctx.Alloc(ln2_variance, ln2_variance->numel() * sizeof(U)); } - linear1_out->mutable_data(place); - dropout1_out->mutable_data(place); - dropout2_out->mutable_data(place); + dev_ctx.Alloc(linear1_out, linear1_out->numel() * sizeof(T)); + dev_ctx.Alloc(dropout1_out, dropout1_out->numel() * sizeof(T)); + dev_ctx.Alloc(dropout2_out, dropout2_out->numel() * sizeof(T)); auto x_dim = x->dims(); auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( @@ -374,7 +376,6 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( ctx, bsz_seq, d_model, dropout_param2, epsilon2); - auto place = ctx.GetPlace(); using U = LayerNormParamType; const U* ln1_gamma_ptr = ln1_gamma == nullptr ? nullptr : ln1_gamma->data(); @@ -396,12 +397,16 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { U* d_ln2_beta_ptr = d_ln2_beta == nullptr ? 
nullptr : d_ln2_beta->data(); framework::Tensor d_linear2_out, d_dropout2_out, d_residual; - d_linear2_out.mutable_data({bsz_seq, d_model}, place); - d_dropout2_out.mutable_data({bsz_seq, d_model}, place); + d_linear2_out.Resize({bsz_seq, d_model}); + ctx.Alloc(&d_linear2_out, d_linear2_out.numel() * sizeof(T)); + d_dropout2_out.Resize({bsz_seq, d_model}); + ctx.Alloc(&d_dropout2_out, d_dropout2_out.numel() * sizeof(T)); T* d_residual_ptr = nullptr; if (add_residual) { - d_residual_ptr = d_residual.mutable_data(d_x->dims(), place); + d_residual.Resize(d_x->dims()); + d_residual_ptr = + ctx.Alloc(&d_residual, d_residual.numel() * sizeof(T)); } if (pre_layer_norm) { fused_dropout_layernorm_helper.ResidualDropoutBiasGrad( @@ -429,7 +434,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { } framework::Tensor d_dropout1_out; - d_dropout1_out.mutable_data({bsz_seq, dim_feedforward}, place); + d_dropout1_out.Resize({bsz_seq, dim_feedforward}); + ctx.Alloc(&d_dropout1_out, d_dropout1_out.numel() * sizeof(T)); MatMulGrad(ctx, d_linear2_out, dropout1_out, @@ -438,7 +444,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { d_linear2_weight); framework::Tensor d_linear1_out; - d_linear1_out.mutable_data({bsz_seq, dim_feedforward}, place); + d_linear1_out.Resize({bsz_seq, dim_feedforward}); + ctx.Alloc(&d_linear1_out, d_linear1_out.numel() * sizeof(T)); fused_act_dropout_helper.DropoutActBiasGrad(ctx, d_dropout1_out.data(), linear1_out.data(), @@ -450,7 +457,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { if (pre_layer_norm) { framework::Tensor d_ln1_out; - d_ln1_out.mutable_data({bsz_seq, d_model}, place); + d_ln1_out.Resize({bsz_seq, d_model}); + ctx.Alloc(&d_ln1_out, d_ln1_out.numel() * sizeof(T)); MatMulGrad(ctx, d_linear1_out, *ln1_out, @@ -485,6 +493,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { using U = LayerNormParamType; + auto& dev_ctx = context.template device_context(); auto d_out = *context.Input(framework::GradVarName("Out")); auto x = *context.Input("X"); @@ -550,28 +559,27 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { DropoutParam dropout_param1(context, 1); DropoutParam dropout_param2(context, 2); - auto place = context.GetPlace(); - d_x->mutable_data(place); + dev_ctx.Alloc(d_x, d_x->numel() * sizeof(T)); if (d_ln1_scale) { - d_ln1_scale->mutable_data(place); + dev_ctx.Alloc(d_ln1_scale, d_ln1_scale->numel() * sizeof(U)); } if (d_ln1_bias) { - d_ln1_bias->mutable_data(place); + dev_ctx.Alloc(d_ln1_bias, d_ln1_bias->numel() * sizeof(U)); } if (d_ln2_scale) { - d_ln2_scale->mutable_data(place); + dev_ctx.Alloc(d_ln2_scale, d_ln2_scale->numel() * sizeof(U)); } if (d_ln2_bias) { - d_ln2_bias->mutable_data(place); + dev_ctx.Alloc(d_ln2_bias, d_ln2_bias->numel() * sizeof(U)); } if (d_linear1_bias) { - d_linear1_bias->mutable_data(place); + dev_ctx.Alloc(d_linear1_bias, d_linear1_bias->numel() * sizeof(T)); } if (d_linear2_bias) { - d_linear2_bias->mutable_data(place); + dev_ctx.Alloc(d_linear2_bias, d_linear2_bias->numel() * sizeof(T)); } - d_linear1_weight->mutable_data(place); - d_linear2_weight->mutable_data(place); + dev_ctx.Alloc(d_linear1_weight, d_linear1_weight->numel() * sizeof(T)); + dev_ctx.Alloc(d_linear2_weight, d_linear2_weight->numel() * sizeof(T)); auto x_dim = x.dims(); auto mat_dim_x = phi::funcs::CreateMatrixDescriptor( diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h 
b/paddle/fluid/operators/fused/fused_gate_attention.h index f9d9fad110e..12db3e6e0d6 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -47,7 +47,7 @@ template void AllocWithDebugInfo(const phi::GPUContext& dev_ctx, const std::string& info, Tensor* t) { - t->mutable_data(dev_ctx.GetPlace()); + dev_ctx.Alloc(t, t->numel() * sizeof(T)); VLOG(4) << info << ": " << MemoryDebugString(*t); } @@ -505,9 +505,12 @@ class FMHAGateRef { k_transpose_out_grad.Resize(config->kv_transpose_out_dims); v_transpose_out_grad.Resize(config->kv_transpose_out_dims); - q_grad_ptr = q_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); - k_grad_ptr = k_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); - v_grad_ptr = v_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + q_grad_ptr = dev_ctx_.Alloc(&q_transpose_out_grad, + q_transpose_out_grad.numel() * sizeof(T)); + k_grad_ptr = dev_ctx_.Alloc(&k_transpose_out_grad, + k_transpose_out_grad.numel() * sizeof(T)); + v_grad_ptr = dev_ctx_.Alloc(&v_transpose_out_grad, + v_transpose_out_grad.numel() * sizeof(T)); } Tensor softmax_out_grad; diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 139a365c10e..413dc41dbd1 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -90,7 +90,8 @@ void ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx, auto *qkv_weight = ctx.Input("QKVWeight"); auto *qkv_weight_grad = ctx.Output(framework::GradVarName("QKVWeight")); - qkv_weight_grad->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.template device_context(); + dev_ctx.Alloc(qkv_weight_grad, qkv_weight_grad->numel() * sizeof(T)); // Gradient of GEMM(query, qkv_weight) int m = config.batch_size * config.seq_len_m * config.seq_len_r; @@ -160,7 +161,8 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx, const auto *key_weight = ctx.Input("KeyWeight"); auto *key_weight_grad = ctx.Output(framework::GradVarName("KeyWeight")); - key_weight_grad->mutable_data(ctx.GetPlace()); + auto &dev_ctx = ctx.template device_context(); + dev_ctx.Alloc(key_weight_grad, key_weight_grad->numel() * sizeof(T)); int kv_m = config.batch_size * config.seq_len_m * config.m_size; int kv_n = config.num_heads * config.head_dim; @@ -174,7 +176,7 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx, auto *value_weight = ctx.Input("ValueWeight"); auto *value_weight_grad = ctx.Output(framework::GradVarName("ValueWeight")); - value_weight_grad->mutable_data(ctx.GetPlace()); + dev_ctx.Alloc(value_weight_grad, value_weight_grad->numel() * sizeof(T)); kv_compute.ComputeBackward(key, value_weight, @@ -188,7 +190,7 @@ void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx, const auto *query_weight = ctx.Input("QueryWeight"); auto *query_weight_grad = ctx.Output(framework::GradVarName("QueryWeight")); - query_weight_grad->mutable_data(ctx.GetPlace()); + dev_ctx.Alloc(query_weight_grad, query_weight_grad->numel() * sizeof(T)); int q_m = config.batch_size * config.seq_len_m * config.seq_len_r; int q_n = config.num_heads * config.head_dim; @@ -242,11 +244,11 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, Tensor *fmha_out_grad) { const auto *gate_weight = ctx.Input("GateWeight"); const auto *gate_bias = ctx.Input("GateBias"); - + auto &dev_ctx = ctx.template 
device_context(); // Re-compute gate_bias_out Tensor gate_bias_out; gate_bias_out.Resize(config.gate_out_dims); - gate_bias_out.mutable_data(ctx.GetPlace()); + dev_ctx.Alloc(&gate_bias_out, gate_bias_out.numel() * sizeof(T)); int m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.num_heads * config.head_dim; @@ -267,8 +269,8 @@ void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, auto *gate_weight_grad = ctx.Output(framework::GradVarName("GateWeight")); auto *gate_bias_grad = ctx.Output(framework::GradVarName("GateBias")); - gate_weight_grad->mutable_data(ctx.GetPlace()); - gate_bias_grad->mutable_data(ctx.GetPlace()); + dev_ctx.Alloc(gate_weight_grad, gate_weight_grad->numel() * sizeof(T)); + dev_ctx.Alloc(gate_bias_grad, gate_bias_grad->numel() * sizeof(T)); gate_attn_compute.ComputeBackward(query, gate_weight, @@ -301,6 +303,7 @@ void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx, const GateAttentionGradConfig &config, const Tensor *input, Tensor *input_grad) { + auto &dev_ctx = ctx.template device_context(); const auto *out_grad = ctx.Input(framework::GradVarName("Out")); const auto *out_linear_weight = ctx.Input("OutLinearWeight"); @@ -309,8 +312,10 @@ void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx, auto *out_linear_bias_grad = ctx.Output(framework::GradVarName("OutLinearBias")); - out_linear_weight_grad->mutable_data(ctx.GetPlace()); - out_linear_bias_grad->mutable_data(ctx.GetPlace()); + dev_ctx.Alloc(out_linear_weight_grad, + out_linear_weight_grad->numel() * sizeof(T)); + dev_ctx.Alloc(out_linear_bias_grad, + out_linear_bias_grad->numel() * sizeof(T)); int m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.q_dim; diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu index 05d3013da5b..aaea0b66ff5 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -46,7 +46,7 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { << " , activation = " << activation; bool enable_auxiliary = reserve_space == nullptr ? false : true; - out->mutable_data(ctx.GetPlace()); + dev_ctx->Alloc(out, out->numel() * sizeof(T)); auto* out_data = out->data(); auto x_mat_dims = @@ -110,8 +110,7 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { } else { reserve_space_size = phi::product(out->dims()) * sizeof(T); } - reserve_space->mutable_data( - ctx.GetPlace(), out->type(), reserve_space_size); + dev_ctx->Alloc(reserve_space, out->type(), reserve_space_size); void* aux_data = reinterpret_cast(reserve_space->data()); PADDLE_ENFORCE_GPU_SUCCESS( @@ -493,7 +492,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { workspace_size, phi::Stream(reinterpret_cast(dev_ctx.stream()))); - auto* dx_data = dx->mutable_data(ctx.GetPlace()); + auto* dx_data = dev_ctx->Alloc(dx, dx->numel() * sizeof(T)); const auto* y_data = y->data(); const auto* dout_data = dout->data(); const auto* a_data = kXGradAIsDZ ? 
dout_data : y_data; @@ -601,7 +600,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { sizeof(epiloque_func_for_dy))); if (dbias) { - auto* dbias_data = dbias->mutable_data(ctx.GetPlace()); + auto* dbias_data = dev_ctx->Alloc(dbias, dbias->numel() * sizeof(T)); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( dy_operation_desc, @@ -614,7 +613,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { dev_ctx.GetPlace(), workspace_size, phi::Stream(reinterpret_cast(dev_ctx.stream()))); - auto* dy_data = dy->mutable_data(ctx.GetPlace()); + auto* dy_data = dev_ctx->Alloc(dy, dy->numel() * sizeof(T)); const auto* dout_data = dout->data(); const auto* x_data = x->data(); const auto* a_data = kYGradAIsDZ ? dout_data : x_data; diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index 6414954667b..04681f3d7a3 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -70,7 +70,7 @@ static void AllReduce(framework::Tensor &tensor, // NOLINT int64_t numel = tensor.numel(); const void *sendbuff = tensor.data(); auto place = ctx.GetPlace(); - void *recvbuff = tensor.mutable_data(place); + void *recvbuff = ctx.Alloc(&tensor, tensor.numel() * sizeof(T)); auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); auto stream = ctx.stream(); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( @@ -1161,7 +1161,6 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { using U = LayerNormParamType; - auto place = ctx.GetPlace(); auto &dev_ctx = ctx.cuda_device_context(); auto *time_step = ctx.Input("TimeStep"); @@ -1181,8 +1180,11 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { auto ln_compute = AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); Tensor ln_mean, ln_var; - auto *ln_mean_data = ln_mean.mutable_data({bsz_seq}, place); - auto *ln_var_data = ln_var.mutable_data({bsz_seq}, place); + ln_mean.Resize({{bsz_seq}}); + auto *ln_mean_data = + dev_ctx.Alloc(&ln_mean, ln_mean.numel() * sizeof(U)); + ln_var.Resize({{bsz_seq}}); + auto *ln_var_data = dev_ctx.Alloc(&ln_var, ln_var.numel() * sizeof(U)); // 2. qkv // x: qkv's input [batch_size, seq_len, dim_embed] @@ -1207,8 +1209,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { input_size, compute_bias); Tensor qkv_out; + qkv_out.Resize({{bsz, seq_len, 3, num_head, dim_head}}); auto *qkv_out_data = - qkv_out.mutable_data({bsz, seq_len, 3, num_head, dim_head}, place); + dev_ctx.Alloc(&qkv_out, qkv_out.numel() * sizeof(T)); // 3. 
fmha AttnDropoutParam attn_param( @@ -1243,26 +1246,32 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { } Tensor transpose_out_2, qk_out; - auto *transpose_out_2_data = transpose_out_2.mutable_data( - {3, bsz, num_head, seq_len, dim_head}, place); - auto *qk_out_data = - qk_out.mutable_data({bsz, num_head, seq_len, out_seq_len}, place); + transpose_out_2.Resize({{3, bsz, num_head, seq_len, dim_head}}); + auto *transpose_out_2_data = + dev_ctx.Alloc(&transpose_out_2, transpose_out_2.numel() * sizeof(T)); + qk_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *qk_out_data = dev_ctx.Alloc(&qk_out, qk_out.numel() * sizeof(T)); Tensor softmax_out; Tensor attn_dropout_mask_out, attn_dropout_out; Tensor qktv_out, fmha_out; - auto *softmax_out_data = softmax_out.mutable_data( - {bsz, num_head, seq_len, out_seq_len}, place); - - auto *attn_dropout_mask_out_data = attn_dropout_mask_out.mutable_data( - {bsz, num_head, seq_len, out_seq_len}, place); - auto *attn_dropout_data_data = attn_dropout_out.mutable_data( - {bsz, num_head, seq_len, out_seq_len}, place); - + softmax_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *softmax_out_data = + dev_ctx.Alloc(&softmax_out, softmax_out.numel() * sizeof(T)); + + attn_dropout_mask_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *attn_dropout_mask_out_data = dev_ctx.Alloc( + &attn_dropout_mask_out, attn_dropout_mask_out.numel() * sizeof(T)); + attn_dropout_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *attn_dropout_data_data = dev_ctx.Alloc( + &attn_dropout_out, attn_dropout_out.numel() * sizeof(T)); + + qktv_out.Resize({{bsz, num_head, seq_len, dim_head}}); auto *qktv_out_data = - qktv_out.mutable_data({bsz, num_head, seq_len, dim_head}, place); + dev_ctx.Alloc(&qktv_out, qktv_out.numel() * sizeof(T)); + fmha_out.Resize({{bsz, seq_len, num_head, dim_head}}); auto *fmha_out_data = - fmha_out.mutable_data({bsz, seq_len, num_head, dim_head}, place); + dev_ctx.Alloc(&fmha_out, fmha_out.numel() * sizeof(T)); // 4. out_linear auto out_linear_weights = ctx.MultiInput("OutLinearW"); @@ -1281,12 +1290,14 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { Tensor bias_dropout_residual_out, dropout_mask_out; T *bias_dropout_residual_out_data = nullptr; if (pre_layer_norm) { + bias_dropout_residual_out.Resize({{bsz, seq_len, dim_embed}}); bias_dropout_residual_out_data = - bias_dropout_residual_out.mutable_data({bsz, seq_len, dim_embed}, - place); + dev_ctx.Alloc(&bias_dropout_residual_out, + bias_dropout_residual_out.numel() * sizeof(T)); } - auto *dropout_mask_out_data = dropout_mask_out.mutable_data( - {bsz, seq_len, dim_embed}, place); + dropout_mask_out.Resize({{bsz, seq_len, dim_embed}}); + auto *dropout_mask_out_data = dev_ctx.Alloc( + &dropout_mask_out, dropout_mask_out.numel() * sizeof(uint8_t)); // 6. ffn matmul1 auto ffn1_weights = ctx.MultiInput("FFN1Weight"); @@ -1297,17 +1308,21 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { auto ffn1_linear_compute = AttnMatMul( dev_ctx, false, false, bsz_seq, dim_ffn, dim_embed, false); Tensor ffn1_out; - auto *ffn1_out_data = ffn1_out.mutable_data({bsz_seq, dim_ffn}, place); + ffn1_out.Resize({{bsz_seq, dim_ffn}}); + auto *ffn1_out_data = + dev_ctx.Alloc(&ffn1_out, ffn1_out.numel() * sizeof(T)); // 7. 
ffn act + bias DropoutParam ffn1_dropout_param(true, 0, true, true, 0.0, nullptr, 0); FusedDropoutHelper fused_act_dropout_helper( dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param); Tensor ffn1_dropout_out, ffn1_dropout_mask; - auto *ffn1_dropout_out_data = - ffn1_dropout_out.mutable_data({bsz_seq, dim_ffn}, place); - auto *ffn1_dropout_mask_data = - ffn1_dropout_mask.mutable_data({bsz_seq, dim_ffn}, place); + ffn1_dropout_out.Resize({{bsz_seq, dim_ffn}}); + auto *ffn1_dropout_out_data = dev_ctx.Alloc( + &ffn1_dropout_out, ffn1_dropout_out.numel() * sizeof(T)); + ffn1_dropout_mask.Resize({{bsz_seq, dim_ffn}}); + auto *ffn1_dropout_mask_data = dev_ctx.Alloc( + &ffn1_dropout_mask, ffn1_dropout_mask.numel() * sizeof(uint8_t)); // 8. ffn2 matmul auto ffn2_weights = ctx.MultiInput("FFN2Weight"); @@ -1322,11 +1337,12 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { // calc auto *out = ctx.Output("Out"); - auto *from_data = out->mutable_data(place); + auto *from_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); Tensor *from_tensor = out; Tensor tmp_out; + tmp_out.Resize({{bsz, seq_len, dim_embed}}); auto *tmp_out_data = - tmp_out.mutable_data({bsz, seq_len, dim_embed}, place); + dev_ctx.Alloc(&tmp_out, tmp_out.numel() * sizeof(T)); auto *x_data = input_x->data(); Tensor *buf0 = nullptr; diff --git a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu index a6a49b7ac62..dbfabe07f47 100644 --- a/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu +++ b/paddle/fluid/operators/fused/fused_seqpool_cvm_op.cu @@ -426,7 +426,7 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext &ctx) const override { auto inputs = ctx.MultiInput("X"); auto outputs = ctx.MultiOutput("Out"); - + auto &dev_ctx = ctx.template device_context(); const auto slot_size = inputs.size(); std::vector input_data(slot_size); std::vector lods_data(slot_size); @@ -478,13 +478,13 @@ class FusedSeqpoolCVMCUDAKernel : public framework::OpKernel { } else { output->Resize({batch_size, embedding_size - cvm_offset}); } - output_data[i] = - reinterpret_cast(output->mutable_data(ctx.GetPlace())); + output_data[i] = reinterpret_cast( + dev_ctx.Alloc(output, output->numel() * sizeof(T))); mix_lods_v[i] = new paddle::framework::MixVector(&lods); lods_data[i] = mix_lods_v[i]->CUDAData(ctx.GetPlace()); - seqpool_output_data[i] = - reinterpret_cast(seqpool_outputs[i].mutable_data( - {batch_size, embedding_size}, ctx.GetPlace())); + seqpool_outputs[i].Resize({batch_size, embedding_size}); + seqpool_output_data[i] = reinterpret_cast(dev_ctx.Alloc( + &seqpool_outputs[i], seqpool_outputs[i].numel() * sizeof(T))); } FusedSeqpoolCVM(ctx, @@ -512,7 +512,7 @@ class FusedSeqpoolCVMGradCUDAKernel : public framework::OpKernel { auto out_grads = ctx.MultiInput(framework::GradVarName("Out")); auto in_grads = ctx.MultiOutput(framework::GradVarName("X")); auto *cvm = ctx.Input("CVM"); - + auto &dev_ctx = ctx.template device_context(); std::string pooltype = ctx.Attr("pooltype"); auto use_cvm = ctx.Attr("use_cvm"); const int cvm_offset = ctx.Attr("cvm_offset"); @@ -559,8 +559,8 @@ class FusedSeqpoolCVMGradCUDAKernel : public framework::OpKernel { auto *out_grad = out_grads[i]; out_grads_data[i] = reinterpret_cast(out_grad->data()); - in_grads_data[i] = - reinterpret_cast(in_grad->mutable_data(ctx.GetPlace())); + in_grads_data[i] = reinterpret_cast( + dev_ctx.Alloc(in_grad, in_grad->numel() * sizeof(T))); mix_lods_v[i] = new 
paddle::framework::MixVector(&lods); lods_data[i] = mix_lods_v[i]->CUDAData(ctx.GetPlace()); cvm_data[i] = reinterpret_cast(cvm->data()); diff --git a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu index ce892024d8d..194d171c46e 100644 --- a/paddle/fluid/operators/fused/fusion_conv_inception_op.cu +++ b/paddle/fluid/operators/fused/fusion_conv_inception_op.cu @@ -55,8 +55,10 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { static_cast(ctx.Attr("workspace_size_MB")); const T* input_data = input->data(); - T* output_data = output->mutable_data(ctx.GetPlace()); - T* temp_data = temp_outs[0]->mutable_data(input->dims(), ctx.GetPlace()); + T* output_data = dev_ctx.Alloc(output, output->numel() * sizeof(T)); + temp_outs[0]->Resize(input->dims()); + T* temp_data = + dev_ctx.Alloc(temp_outs[0], temp_outs[0]->numel() * sizeof(T)); DataLayout layout = DataLayout::kNCHW; std::vector in_dim = phi::vectorize(input->dims()); @@ -254,8 +256,9 @@ class CUDNNConvInceptionFusionOpKernel : public framework::OpKernel { in_datas.push_back(static_cast(input_data)); in_datas.push_back( static_cast(output_data + (oc0 + oc1) * h * w)); - T* temp2_data = temp_outs[1]->mutable_data(phi::make_ddim(out_dims[2]), - ctx.GetPlace()); + temp_outs[1]->Resize(phi::make_ddim(out_dims[2])); + T* temp2_data = + dev_ctx.Alloc(temp_outs[1], temp_outs[1]->numel() * sizeof(T)); in_datas.push_back(static_cast(temp2_data + oc2 * h * w)); std::vector out_datas; -- GitLab
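
Every hunk above applies the same mechanical substitution: output buffers that were obtained through Tensor::mutable_data are now allocated through the device context, with the requested size passed explicitly in bytes and any shape change done beforehand via Tensor::Resize. A minimal sketch of the before/after pattern, assuming the framework::Tensor and phi::GPUContext types used by these kernels (dev_ctx, out, and T stand in for the per-kernel names; this illustration is not part of the patch):

    // Before: the tensor allocated its own storage from a Place.
    //   T *out_ptr = out->mutable_data<T>(ctx.GetPlace());
    //
    // After: the device context performs the allocation, and the requested
    // size is given explicitly in bytes. When the old call also passed a
    // shape, call Resize() first so numel() reflects the final shape.
    T *out_ptr = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
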