Unverified commit a79d4a75, authored by Wilber, committed by GitHub

fix multi stream error. (#45196)

* fix multi stream error.
Parent: f59c666c
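
Every hunk below applies the same change: the per-tensor `mutable_data<T>(place)` call is dropped and the buffer is allocated through the kernel's device context instead, which ties the allocation to the stream held by that context; this is presumably what addresses the multi-stream error named in the title. A minimal before/after sketch of the pattern, using identifiers exactly as they appear in the hunks (`ctx`, `out`, `T`); it is an illustrative fragment of a GPU kernel body, not a standalone file:

    // Before this commit: the allocation only names a place, not a stream.
    //   T* out_data = out->mutable_data<T>(ctx.GetPlace());
    // After this commit: fetch the device context bound to the kernel's
    // stream and allocate through it, as every touched kernel now does.
    auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
    auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));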
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
+#include <thread>
 #include "paddle/fluid/platform/profiler/event_tracing.h"

@@ -194,8 +195,9 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
       static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
       default_stream_,
       this);
-  VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
-          << allocation->ptr() << " , stream: " << default_stream_;
+  VLOG(8) << "Thread " << std::this_thread::get_id() << " Allocate "
+          << allocation->size() << " bytes at address " << allocation->ptr()
+          << " , stream: " << default_stream_;
   return allocation;
 }
...
@@ -69,6 +69,8 @@ class FCOpKernel : public framework::OpKernel<T> {
     auto w_dims = w->dims();
     bool padding_weights = ctx.Attr<bool>("padding_weights");
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
     std::vector<int64_t> output_dims;
     FCOutputSize(
         input->dims(), w_dims, output_dims, in_num_col_dims, padding_weights);

@@ -82,9 +84,9 @@ class FCOpKernel : public framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    auto* output_data =
+        dev_ctx.template Alloc<T>(output, output->numel() * sizeof(T));
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
     phi::funcs::FCFunctor<DeviceContext, T> fc;
     fc(dev_ctx,
        M,
...
@@ -15,6 +15,7 @@
 #include <paddle/fluid/platform/device_context.h>
 #include <algorithm>
+#include <cstdint>
 #include <type_traits>
 #include "paddle/fluid/framework/convert_utils.h"

@@ -49,12 +50,16 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
 #else
       cudaGetDevice(&device_id);
 #endif
+      auto &dev_ctx = context.template device_context<phi::GPUContext>();
       in_ids_.Resize(in_dim);
       in_embs_.Resize(in_dim);
-      int64_t *in_ids_d =
-          in_ids_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
-      int64_t *in_embs_d =
-          in_embs_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
+      int64_t *in_ids_d = dev_ctx.template Alloc<int64_t>(
+          &in_ids_, in_ids_.numel() * sizeof(int64_t));
+      int64_t *in_embs_d = dev_ctx.template Alloc<int64_t>(
+          &in_embs_, in_embs_.numel() * sizeof(int64_t));
       std::vector<int64_t> in1s, in2s;
       for (int i = 0; i < input_num; ++i) {

@@ -99,7 +104,8 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
     auto *bias_d = bias->data<T>();
     auto *scale_d = scale->data<T>();
-    auto *output_d = out->mutable_data<T>(context.GetPlace());
+    auto *output_d = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
     float eps = context.Attr<float>("epsilon");
     if (std::is_same<T, paddle::platform::float16>::value) {
...
@@ -395,9 +395,10 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     const T* x_data = x->data<T>();
     const T* w_data = w->data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
     auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
+    auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
     blas.GEMM(false,
               false,

@@ -425,9 +426,12 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     auto* mean = ctx.Output<framework::Tensor>("Mean");
     auto* variance = ctx.Output<framework::Tensor>("Variance");
-    T* mean_data = mean ? mean->mutable_data<T>(ctx.GetPlace()) : nullptr;
-    T* variance_data =
-        variance ? variance->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* mean_data =
+        mean ? dev_ctx.template Alloc<T>(mean, mean->numel() * sizeof(T))
+             : nullptr;
+    T* variance_data = variance ? dev_ctx.template Alloc<T>(
+                                      variance, variance->numel() * sizeof(T))
+                                : nullptr;
     bool with_relu =
         (ctx.Attr<std::string>("activation_type") == "relu") ? true : false;
...
@@ -287,7 +287,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     // if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted
     if (bias_qk && bias_qk->numel() == (batch * seq_len)) {
       temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len});
-      auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+      auto *temp_qk_bias = device_ctx.template Alloc<T>(
+          &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
       int grid = batch * head_number * seq_len;
       int block = round_up(seq_len);
       broadcast<<<grid, block, 0, stream>>>(

@@ -297,7 +298,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     if (!bias_qk) {
       int size = batch * head_number * seq_len * seq_len;
       temp_bias_tensor.Resize({size});
-      auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+      auto *temp_qk_bias = device_ctx.template Alloc<T>(
+          &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
 #ifdef PADDLE_WITH_HIP
       hipMemset(temp_qk_bias, 0, sizeof(float) * size);
 #else

@@ -310,7 +312,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     auto *out = context.Output<framework::Tensor>("Out");
     out->Resize({batch, seq_len, all_head_size});
-    auto *output_d = out->mutable_data<T>(context.GetPlace());
+    auto *output_d =
+        device_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
     // (B*S, hidden)
     const Tensor input_matrix =

@@ -324,7 +327,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
         phi::make_ddim({batch, seq_len, 3, head_number, head_size});
     temp_out_tensor.Resize(
         {batch * seq_len, phi::product(temp_out_dims) / (batch * seq_len)});
-    auto *temp_out_data = temp_out_tensor.mutable_data<T>(context.GetPlace());
+    auto *temp_out_data = device_ctx.template Alloc<T>(
+        &temp_out_tensor, temp_out_tensor.numel() * sizeof(T));
     // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(device_ctx);

@@ -336,8 +340,9 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     // B * head_number * S * S * 1 + B * S * 3 * N * H
     int scratch_size = batch * head_number * seq_len * seq_len * 1;
     multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()});
-    auto *multihead_temp_data =
-        multihead_temp_tensor.mutable_data<T>(context.GetPlace());
+    auto *multihead_temp_data = device_ctx.template Alloc<T>(
+        &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T));
     auto *qkptr = multihead_temp_data;
     auto *tptr = multihead_temp_data + scratch_size;
...
@@ -65,7 +65,9 @@ class MatMulKernel : public framework::OpKernel<T> {
     auto &y = GET_DATA_SAFELY(
         context.Input<framework::Tensor>("Y"), "Input", "Y", "MatMul");
     auto *out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
     auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
     auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
...