From a79d4a7506d2e44cf010412644208183081ccd21 Mon Sep 17 00:00:00 2001
From: Wilber
Date: Wed, 17 Aug 2022 16:11:11 +0800
Subject: [PATCH] fix multi stream error. (#45196)

* fix multi stream error.
---
 .../allocation/stream_safe_cuda_allocator.cc       |  6 ++++--
 paddle/fluid/operators/fc_op.h                     |  6 ++++--
 .../fused_embedding_eltwise_layernorm_op.cu        | 16 +++++++++++-----
 .../fused/fused_fc_elementwise_layernorm_op.cu     | 12 ++++++++----
 .../operators/fused/multihead_matmul_op.cu         | 17 +++++++++++------
 paddle/fluid/operators/matmul_op.cc                |  4 +++-
 6 files changed, 41 insertions(+), 20 deletions(-)

diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
index d540f907a6..b62ba99df7 100644
--- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
+++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
+#include <thread>
 
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 
@@ -194,8 +195,9 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
       static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
       default_stream_,
       this);
-  VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
-          << allocation->ptr() << " , stream: " << default_stream_;
+  VLOG(8) << "Thread " << std::this_thread::get_id() << " Allocate "
+          << allocation->size() << " bytes at address " << allocation->ptr()
+          << " , stream: " << default_stream_;
   return allocation;
 }
 
diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h
index 52b2a98a9d..24380b29ee 100644
--- a/paddle/fluid/operators/fc_op.h
+++ b/paddle/fluid/operators/fc_op.h
@@ -69,6 +69,8 @@ class FCOpKernel : public framework::OpKernel<T> {
     auto w_dims = w->dims();
     bool padding_weights = ctx.Attr<bool>("padding_weights");
 
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
     std::vector<int64_t> output_dims;
     FCOutputSize(
         input->dims(), w_dims, output_dims, in_num_col_dims, padding_weights);
@@ -82,9 +84,9 @@ class FCOpKernel : public framework::OpKernel<T> {
 
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    auto* output_data =
+        dev_ctx.template Alloc<T>(output, output->numel() * sizeof(T));
 
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
     phi::funcs::FCFunctor<DeviceContext, T> fc;
     fc(dev_ctx,
        M,
diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
index ac09d337e9..75e131b2de 100644
--- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu
@@ -15,6 +15,7 @@
 #include <paddle/fluid/platform/device_context.h>
 
 #include <algorithm>
+#include <type_traits>
 #include <vector>
 
 #include "paddle/fluid/framework/convert_utils.h"
@@ -49,12 +50,16 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
 #else
     cudaGetDevice(&device_id);
 #endif
+
+    auto &dev_ctx = context.template device_context<phi::GPUContext>();
+
     in_ids_.Resize(in_dim);
     in_embs_.Resize(in_dim);
-    int64_t *in_ids_d =
-        in_ids_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
-    int64_t *in_embs_d =
-        in_embs_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
+
+    int64_t *in_ids_d = dev_ctx.template Alloc<int64_t>(
+        &in_ids_, in_ids_.numel() * sizeof(int64_t));
+    int64_t *in_embs_d = dev_ctx.template Alloc<int64_t>(
+        &in_embs_, in_embs_.numel() * sizeof(int64_t));
 
     std::vector<int64_t> in1s, in2s;
     for (int i = 0; i < input_num; ++i) {
@@ -99,7 +104,8 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
 
     auto *bias_d = bias->data<float>();
    auto *scale_d = scale->data<float>();
-    auto *output_d = out->mutable_data<T>(context.GetPlace());
+    auto *output_d = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
+
     float eps = context.Attr<float>("epsilon");
 
     if (std::is_same<T, paddle::platform::float16>::value) {
diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
index 3e117c4535..758fb8a23f 100644
--- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
+++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu
@@ -395,9 +395,10 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
 
     const T* x_data = x->data<T>();
     const T* w_data = w->data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
 
     auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
+    auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
+
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
     blas.GEMM(false,
               false,
@@ -425,9 +426,12 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
 
     auto* mean = ctx.Output<framework::Tensor>("Mean");
     auto* variance = ctx.Output<framework::Tensor>("Variance");
-    T* mean_data = mean ? mean->mutable_data<T>(ctx.GetPlace()) : nullptr;
-    T* variance_data =
-        variance ? variance->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* mean_data =
+        mean ? dev_ctx.template Alloc<T>(mean, mean->numel() * sizeof(T))
+             : nullptr;
+    T* variance_data = variance ? dev_ctx.template Alloc<T>(
+                                      variance, variance->numel() * sizeof(T))
+                                : nullptr;
 
     bool with_relu =
         (ctx.Attr<std::string>("activation_type") == "relu") ? true : false;
diff --git a/paddle/fluid/operators/fused/multihead_matmul_op.cu b/paddle/fluid/operators/fused/multihead_matmul_op.cu
index 16ab0d916d..c2e2754830 100644
--- a/paddle/fluid/operators/fused/multihead_matmul_op.cu
+++ b/paddle/fluid/operators/fused/multihead_matmul_op.cu
@@ -287,7 +287,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     // if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted
     if (bias_qk && bias_qk->numel() == (batch * seq_len)) {
       temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len});
-      auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+      auto *temp_qk_bias = device_ctx.template Alloc<T>(
+          &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
       int grid = batch * head_number * seq_len;
       int block = round_up(seq_len);
       broadcast<<<grid, block, 0, stream>>>(
@@ -297,7 +298,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     if (!bias_qk) {
       int size = batch * head_number * seq_len * seq_len;
       temp_bias_tensor.Resize({size});
-      auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+      auto *temp_qk_bias = device_ctx.template Alloc<T>(
+          &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
 #ifdef PADDLE_WITH_HIP
       hipMemset(temp_qk_bias, 0, sizeof(float) * size);
 #else
@@ -310,7 +312,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
 
     auto *out = context.Output<framework::Tensor>("Out");
     out->Resize({batch, seq_len, all_head_size});
-    auto *output_d = out->mutable_data<T>(context.GetPlace());
+    auto *output_d =
+        device_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
 
     // (B*S, hidden)
     const Tensor input_matrix =
@@ -324,7 +327,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
         phi::make_ddim({batch, seq_len, 3, head_number, head_size});
     temp_out_tensor.Resize(
         {batch * seq_len, phi::product(temp_out_dims) / (batch * seq_len)});
-    auto *temp_out_data = temp_out_tensor.mutable_data<T>(context.GetPlace());
+    auto *temp_out_data = device_ctx.template Alloc<T>(
+        &temp_out_tensor, temp_out_tensor.numel() * sizeof(T));
 
     // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(device_ctx);
@@ -336,8 +340,9 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     // B * head_number * S * S * 1 + B * S * 3 * N * H
     int scratch_size = batch * head_number * seq_len * seq_len * 1;
     multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()});
-    auto *multihead_temp_data =
-        multihead_temp_tensor.mutable_data<T>(context.GetPlace());
+    auto *multihead_temp_data = device_ctx.template Alloc<T>(
+        &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T));
+
     auto *qkptr = multihead_temp_data;
     auto *tptr = multihead_temp_data + scratch_size;
 
diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc
index 80018ddb1c..a31c218307 100644
--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -65,7 +65,9 @@ class MatMulKernel : public framework::OpKernel<T> {
     auto &y = GET_DATA_SAFELY(
         context.Input<framework::Tensor>("Y"), "Input", "Y", "MatMul");
     auto *out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
+
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
 
     auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
     auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
-- 
GitLab