diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
index 219a517315b52f9c6aa6258883ff983a4b638298..05d3013da5b009101072fae05af787957135f7d4 100644
--- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
+++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
@@ -150,8 +150,10 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
     // "enough". I just followed the settings from the NVIDIA MLPerf BERT code.
     size_t workspace_size = static_cast<size_t>(4) * 1024 * 1024;
     cudaStream_t stream = dev_ctx.stream();
-    memory::allocation::AllocationPtr workspace =
-        memory::Alloc(dev_ctx, workspace_size);
+    memory::allocation::AllocationPtr workspace = memory::Alloc(
+        dev_ctx.GetPlace(),
+        workspace_size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
 
     double alpha64 = 1.0, beta64 = 0.0;
     float alpha32 = 1.0f, beta32 = 0.0f;
@@ -486,7 +488,10 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
                                      sizeof(aux_ld)));
     }
 
-    auto dx_workspace = memory::Alloc(dev_ctx, workspace_size);
+    auto dx_workspace = memory::Alloc(
+        dev_ctx.GetPlace(),
+        workspace_size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
 
     auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
     const auto* y_data = y->data<T>();
@@ -605,7 +610,10 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
                                      sizeof(dbias_data)));
     }
 
-    auto dy_workspace = memory::Alloc(dev_ctx, workspace_size);
+    auto dy_workspace = memory::Alloc(
+        dev_ctx.GetPlace(),
+        workspace_size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
     const auto* dout_data = dout->data<T>();
     const auto* x_data = x->data<T>();
diff --git a/paddle/fluid/operators/math/eigen_values_vectors.h b/paddle/fluid/operators/math/eigen_values_vectors.h
index 61682a95c132955df12b094ad08bdd85ffa44780..1ba2d8f18ca1c53fd471a985cd8ad30281bd1955 100644
--- a/paddle/fluid/operators/math/eigen_values_vectors.h
+++ b/paddle/fluid/operators/math/eigen_values_vectors.h
@@ -207,7 +207,10 @@ struct MatrixEighFunctor<platform::CUDADeviceContext, T> {
     auto vector_stride = dims[dim_size - 1] * dims[dim_size - 2];
     auto values_stride = dims[dim_size - 1];
     int lwork = 0;
-    auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_size);
+    auto info = memory::Alloc(
+        dev_ctx.GetPlace(),
+        sizeof(int) * batch_size,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     auto *info_ptr = reinterpret_cast<int *>(info->ptr());
 
     // When the input type is float32, and the feature value input dimension is
@@ -240,7 +243,10 @@ struct MatrixEighFunctor<platform::CUDADeviceContext, T> {
                               out_value,
                               &lwork);
     }
-    auto work = memory::Alloc(dev_ctx, sizeof(T) * lwork);
+    auto work = memory::Alloc(
+        dev_ctx.GetPlace(),
+        sizeof(T) * lwork,
+        phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
     auto *work_ptr = reinterpret_cast<T *>(work->ptr());
     for (auto i = 0; i < batch_size; i++) {
       auto *input_data = input_vector + i * vector_stride;
diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu
index 95c0562eb7272012798b8fb3ef8c76dea2c6d41d..fd2ccfdea33cf933e77210c9e6ce40af8d24d25b 100644
--- a/paddle/fluid/operators/sparse_attention_op.cu
+++ b/paddle/fluid/operators/sparse_attention_op.cu
@@ -522,7 +522,10 @@ void DotSdd(const phi::GPUContext& ctx,
                                         gpu_type,
                                         CUSPARSE_SDDMM_ALG_DEFAULT,
                                         &buffer_size);
-  auto d_buffer_ptr = paddle::memory::Alloc(ctx, buffer_size);
+  auto d_buffer_ptr = paddle::memory::Alloc(
+      ctx.GetPlace(),
+      buffer_size,
+      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
   void* d_buffer = static_cast<void*>(d_buffer_ptr->ptr());
 
   platform::dynload::cusparseSDDMM(handle,
@@ -616,7 +619,10 @@ void DotDsd(const phi::GPUContext& ctx,
                                        gpu_type,
                                        CUSPARSE_SPMM_ALG_DEFAULT,
                                        &buffer_size);
-  auto d_buffer_ptr = paddle::memory::Alloc(ctx, buffer_size);
+  auto d_buffer_ptr = paddle::memory::Alloc(
+      ctx.GetPlace(),
+      buffer_size,
+      phi::Stream(reinterpret_cast<phi::StreamId>(ctx.stream())));
   void* d_buffer = static_cast<void*>(d_buffer_ptr->ptr());
 
   platform::dynload::cusparseSpMM(handle,
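Every hunk above makes the same substitution: the context-based `memory::Alloc(dev_ctx, size)` overload is replaced by the place- and stream-aware overload, so the allocator knows both which device to allocate on and which stream's queued work must complete before the block can be reused. A minimal sketch of the new call shape, where `dev_ctx` (a device context such as `phi::GPUContext`) and `n_bytes` are stand-in names for whatever a call site has, not identifiers from this diff:

```cpp
// Illustrative sketch only; relies on Paddle-internal headers:
//   #include "paddle/fluid/memory/malloc.h"  // paddle::memory::Alloc
//   #include "paddle/phi/core/stream.h"      // phi::Stream, phi::StreamId

// Old overload (being removed by this diff):
//   auto buf = paddle::memory::Alloc(dev_ctx, n_bytes);

// New overload (used throughout this diff): the place selects the device
// allocator, and the phi::Stream records which CUDA stream the allocation
// belongs to, so a stream-aware allocator can defer reusing the block until
// work already queued on that stream has finished.
auto buf = paddle::memory::Alloc(
    dev_ctx.GetPlace(),
    n_bytes,
    phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
void* raw = buf->ptr();  // raw device pointer, valid while `buf` is alive
```

The `reinterpret_cast<phi::StreamId>` simply passes the raw `cudaStream_t` handle through `phi::Stream`'s type-erased ID, which is why the same pattern appears verbatim at each call site in the diff.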