Unverified commit a79d4a75, authored by Wilber, committed by GitHub

fix multi stream error. (#45196)

* fix multi stream error.
Parent: f59c666c
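
Every hunk below applies the same change: the per-tensor `mutable_data<T>(place)` call is dropped and the buffer is allocated through the kernel's device context instead, which ties the allocation to the stream held by that context; this is presumably what addresses the multi-stream error named in the title. A minimal before/after sketch of the pattern, using identifiers exactly as they appear in the hunks (`ctx`, `out`, `T`); it is an illustrative fragment of a GPU kernel body, not a standalone file:

    // Before this commit: the allocation only names a place, not a stream.
    //   T* out_data = out->mutable_data<T>(ctx.GetPlace());
    // After this commit: fetch the device context bound to the kernel's
    // stream and allocate through it, as every touched kernel now does.
    auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
    auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));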
@@ -13,6 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
+#include <thread>
 #include "paddle/fluid/platform/profiler/event_tracing.h"

@@ -194,8 +195,9 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
       static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
       default_stream_,
       this);
-  VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
-          << allocation->ptr() << " , stream: " << default_stream_;
+  VLOG(8) << "Thread " << std::this_thread::get_id() << " Allocate "
+          << allocation->size() << " bytes at address " << allocation->ptr()
+          << " , stream: " << default_stream_;
   return allocation;
 }
...
@@ -69,6 +69,8 @@ class FCOpKernel : public framework::OpKernel<T> {
     auto w_dims = w->dims();
     bool padding_weights = ctx.Attr<bool>("padding_weights");
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
     std::vector<int64_t> output_dims;
     FCOutputSize(
         input->dims(), w_dims, output_dims, in_num_col_dims, padding_weights);

@@ -82,9 +84,9 @@ class FCOpKernel : public framework::OpKernel<T> {
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    auto* output_data =
+        dev_ctx.template Alloc<T>(output, output->numel() * sizeof(T));
-    auto& dev_ctx = ctx.template device_context<DeviceContext>();
     phi::funcs::FCFunctor<DeviceContext, T> fc;
     fc(dev_ctx,
        M,
...
@@ -15,6 +15,7 @@
 #include <paddle/fluid/platform/device_context.h>
 #include <algorithm>
+#include <cstdint>
 #include <type_traits>
 #include "paddle/fluid/framework/convert_utils.h"

@@ -49,12 +50,16 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
 #else
       cudaGetDevice(&device_id);
 #endif
+      auto &dev_ctx = context.template device_context<phi::GPUContext>();
       in_ids_.Resize(in_dim);
       in_embs_.Resize(in_dim);
-      int64_t *in_ids_d =
-          in_ids_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
-      int64_t *in_embs_d =
-          in_embs_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
+      int64_t *in_ids_d = dev_ctx.template Alloc<int64_t>(
+          &in_ids_, in_ids_.numel() * sizeof(int64_t));
+      int64_t *in_embs_d = dev_ctx.template Alloc<int64_t>(
+          &in_embs_, in_embs_.numel() * sizeof(int64_t));
       std::vector<int64_t> in1s, in2s;
       for (int i = 0; i < input_num; ++i) {

@@ -99,7 +104,8 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
     auto *bias_d = bias->data<T>();
     auto *scale_d = scale->data<T>();
-    auto *output_d = out->mutable_data<T>(context.GetPlace());
+    auto *output_d = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
     float eps = context.Attr<float>("epsilon");
     if (std::is_same<T, paddle::platform::float16>::value) {
...
@@ -395,9 +395,10 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     const T* x_data = x->data<T>();
     const T* w_data = w->data<T>();
-    T* out_data = out->mutable_data<T>(ctx.GetPlace());
     auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
+    auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
     blas.GEMM(false,
               false,

@@ -425,9 +426,12 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
     auto* mean = ctx.Output<framework::Tensor>("Mean");
     auto* variance = ctx.Output<framework::Tensor>("Variance");
-    T* mean_data = mean ? mean->mutable_data<T>(ctx.GetPlace()) : nullptr;
-    T* variance_data =
-        variance ? variance->mutable_data<T>(ctx.GetPlace()) : nullptr;
+    T* mean_data =
+        mean ? dev_ctx.template Alloc<T>(mean, mean->numel() * sizeof(T))
+             : nullptr;
+    T* variance_data = variance ? dev_ctx.template Alloc<T>(
+                                      variance, variance->numel() * sizeof(T))
+                                : nullptr;
     bool with_relu =
         (ctx.Attr<std::string>("activation_type") == "relu") ? true : false;
...
@@ -287,7 +287,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     // if bias_qk is[batch, 1, 1, seq_len], the bias_qk_d need to be broadcasted
     if (bias_qk && bias_qk->numel() == (batch * seq_len)) {
       temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len});
-      auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+      auto *temp_qk_bias = device_ctx.template Alloc<T>(
+          &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
       int grid = batch * head_number * seq_len;
       int block = round_up(seq_len);
       broadcast<<<grid, block, 0, stream>>>(

@@ -297,7 +298,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     if (!bias_qk) {
       int size = batch * head_number * seq_len * seq_len;
       temp_bias_tensor.Resize({size});
-      auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+      auto *temp_qk_bias = device_ctx.template Alloc<T>(
+          &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
 #ifdef PADDLE_WITH_HIP
       hipMemset(temp_qk_bias, 0, sizeof(float) * size);
 #else

@@ -310,7 +312,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     auto *out = context.Output<framework::Tensor>("Out");
     out->Resize({batch, seq_len, all_head_size});
-    auto *output_d = out->mutable_data<T>(context.GetPlace());
+    auto *output_d =
+        device_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
     // (B*S, hidden)
     const Tensor input_matrix =

@@ -324,7 +327,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
         phi::make_ddim({batch, seq_len, 3, head_number, head_size});
     temp_out_tensor.Resize(
         {batch * seq_len, phi::product(temp_out_dims) / (batch * seq_len)});
-    auto *temp_out_data = temp_out_tensor.mutable_data<T>(context.GetPlace());
+    auto *temp_out_data = device_ctx.template Alloc<T>(
+        &temp_out_tensor, temp_out_tensor.numel() * sizeof(T));
     // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)
     auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(device_ctx);

@@ -336,8 +340,9 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
     // B * head_number * S * S * 1 + B * S * 3 * N * H
     int scratch_size = batch * head_number * seq_len * seq_len * 1;
     multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()});
-    auto *multihead_temp_data =
-        multihead_temp_tensor.mutable_data<T>(context.GetPlace());
+    auto *multihead_temp_data = device_ctx.template Alloc<T>(
+        &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T));
     auto *qkptr = multihead_temp_data;
     auto *tptr = multihead_temp_data + scratch_size;
...
@@ -65,7 +65,9 @@ class MatMulKernel : public framework::OpKernel<T> {
     auto &y = GET_DATA_SAFELY(
         context.Input<framework::Tensor>("Y"), "Input", "Y", "MatMul");
     auto *out = context.Output<framework::Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
+    auto &dev_ctx = context.template device_context<DeviceContext>();
+    dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
     auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
     auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
...