Unverified commit a79d4a75, authored by Wilber, committed by GitHub

fix multi stream error. (#45196)

* fix multi stream error.
Parent f59c666c
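Every hunk below applies the same refactor: instead of asking the tensor to allocate its own buffer with `mutable_data<T>(place)`, the kernel first obtains its device context and requests the buffer through `dev_ctx.template Alloc<T>(tensor, nbytes)`, so the allocation is made by the context that knows which CUDA stream the kernel runs on (which is what the stream-safe allocator needs under multi-stream execution). Below is a minimal, self-contained sketch of that call-shape change; the `Tensor` and `DeviceContext` types here are hypothetical host-side stand-ins, not the real framework::Tensor or phi::GPUContext.

// Sketch only: hypothetical stand-ins mimicking the pattern in this commit
// (tensor->mutable_data<T>(place)  ->  dev_ctx.Alloc<T>(tensor, nbytes)).
// The real Paddle classes allocate device memory and have other signatures.
#include <cstddef>
#include <iostream>
#include <vector>

struct Tensor {
  std::vector<unsigned char> buf;  // pretend device buffer
  std::size_t numel_ = 0;
  void Resize(std::size_t n) { numel_ = n; }
  std::size_t numel() const { return numel_; }
  // Old style: the tensor allocates for itself and has no knowledge of the
  // stream the calling kernel will use.
  template <typename T>
  T* mutable_data() {
    buf.resize(numel_ * sizeof(T));
    return reinterpret_cast<T*>(buf.data());
  }
};

struct DeviceContext {
  // New style: the kernel's device context hands out the buffer, so the
  // allocation can be associated with the context's stream.
  template <typename T>
  T* Alloc(Tensor* t, std::size_t requested_bytes) {
    t->buf.resize(requested_bytes);
    return reinterpret_cast<T*>(t->buf.data());
  }
};

int main() {
  Tensor out;
  out.Resize(256);
  DeviceContext dev_ctx;
  // Before: float* out_data = out.mutable_data<float>();
  // After (the pattern repeated throughout this diff):
  float* out_data = dev_ctx.Alloc<float>(&out, out.numel() * sizeof(float));
  std::cout << "allocated " << out.numel() * sizeof(float) << " bytes at "
            << static_cast<void*>(out_data) << "\n";
  return 0;
}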
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/memory/allocation/stream_safe_cuda_allocator.h"
+ #include <thread>
#include "paddle/fluid/platform/profiler/event_tracing.h"
......@@ -194,8 +195,9 @@ phi::Allocation* StreamSafeCUDAAllocator::AllocateImpl(size_t size) {
static_unique_ptr_cast<Allocation>(std::move(underlying_allocation)),
default_stream_,
this);
VLOG(8) << "Allocate " << allocation->size() << " bytes at address "
<< allocation->ptr() << " , stream: " << default_stream_;
VLOG(8) << "Thread " << std::this_thread::get_id() << " Allocate "
<< allocation->size() << " bytes at address " << allocation->ptr()
<< " , stream: " << default_stream_;
return allocation;
}
......
......@@ -69,6 +69,8 @@ class FCOpKernel : public framework::OpKernel<T> {
auto w_dims = w->dims();
bool padding_weights = ctx.Attr<bool>("padding_weights");
+ auto& dev_ctx = ctx.template device_context<DeviceContext>();
std::vector<int64_t> output_dims;
FCOutputSize(
input->dims(), w_dims, output_dims, in_num_col_dims, padding_weights);
......@@ -82,9 +84,9 @@ class FCOpKernel : public framework::OpKernel<T> {
const T* input_data = input->data<T>();
const T* w_data = w->data<T>();
- T* output_data = output->mutable_data<T>(ctx.GetPlace());
+ auto* output_data =
+     dev_ctx.template Alloc<T>(output, output->numel() * sizeof(T));
- auto& dev_ctx = ctx.template device_context<DeviceContext>();
phi::funcs::FCFunctor<DeviceContext, T> fc;
fc(dev_ctx,
M,
......
......@@ -15,6 +15,7 @@
#include <paddle/fluid/platform/device_context.h>
#include <algorithm>
#include <cstdint>
#include <type_traits>
#include "paddle/fluid/framework/convert_utils.h"
......@@ -49,12 +50,16 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
#else
cudaGetDevice(&device_id);
#endif
+ auto &dev_ctx = context.template device_context<phi::GPUContext>();
in_ids_.Resize(in_dim);
in_embs_.Resize(in_dim);
- int64_t *in_ids_d =
-     in_ids_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
- int64_t *in_embs_d =
-     in_embs_.mutable_data<int64_t>(platform::CUDAPlace(device_id));
+ int64_t *in_ids_d = dev_ctx.template Alloc<int64_t>(
+     &in_ids_, in_ids_.numel() * sizeof(int64_t));
+ int64_t *in_embs_d = dev_ctx.template Alloc<int64_t>(
+     &in_embs_, in_embs_.numel() * sizeof(int64_t));
std::vector<int64_t> in1s, in2s;
for (int i = 0; i < input_num; ++i) {
......@@ -99,7 +104,8 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel<T> {
auto *bias_d = bias->data<T>();
auto *scale_d = scale->data<T>();
- auto *output_d = out->mutable_data<T>(context.GetPlace());
+ auto *output_d = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
float eps = context.Attr<float>("epsilon");
if (std::is_same<T, paddle::platform::float16>::value) {
......
......@@ -395,9 +395,10 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
const T* x_data = x->data<T>();
const T* w_data = w->data<T>();
- T* out_data = out->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<phi::GPUContext>();
+ auto* out_data = dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(dev_ctx);
blas.GEMM(false,
false,
......@@ -425,9 +426,12 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
auto* mean = ctx.Output<framework::Tensor>("Mean");
auto* variance = ctx.Output<framework::Tensor>("Variance");
- T* mean_data = mean ? mean->mutable_data<T>(ctx.GetPlace()) : nullptr;
- T* variance_data =
-     variance ? variance->mutable_data<T>(ctx.GetPlace()) : nullptr;
+ T* mean_data =
+     mean ? dev_ctx.template Alloc<T>(mean, mean->numel() * sizeof(T))
+          : nullptr;
+ T* variance_data = variance ? dev_ctx.template Alloc<T>(
+                                   variance, variance->numel() * sizeof(T))
+                             : nullptr;
bool with_relu =
(ctx.Attr<std::string>("activation_type") == "relu") ? true : false;
......
......@@ -287,7 +287,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
// if bias_qk is [batch, 1, 1, seq_len], the bias_qk_d needs to be broadcast
if (bias_qk && bias_qk->numel() == (batch * seq_len)) {
temp_bias_tensor.Resize({batch * head_number * seq_len * seq_len});
- auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+ auto *temp_qk_bias = device_ctx.template Alloc<T>(
+     &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
int grid = batch * head_number * seq_len;
int block = round_up(seq_len);
broadcast<<<grid, block, 0, stream>>>(
......@@ -297,7 +298,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
if (!bias_qk) {
int size = batch * head_number * seq_len * seq_len;
temp_bias_tensor.Resize({size});
- auto *temp_qk_bias = temp_bias_tensor.mutable_data<T>(context.GetPlace());
+ auto *temp_qk_bias = device_ctx.template Alloc<T>(
+     &temp_bias_tensor, temp_bias_tensor.numel() * sizeof(T));
#ifdef PADDLE_WITH_HIP
hipMemset(temp_qk_bias, 0, sizeof(float) * size);
#else
......@@ -310,7 +312,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
auto *out = context.Output<framework::Tensor>("Out");
out->Resize({batch, seq_len, all_head_size});
- auto *output_d = out->mutable_data<T>(context.GetPlace());
+ auto *output_d =
+     device_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
// (B*S, hidden)
const Tensor input_matrix =
......@@ -324,7 +327,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
phi::make_ddim({batch, seq_len, 3, head_number, head_size});
temp_out_tensor.Resize(
{batch * seq_len, phi::product(temp_out_dims) / (batch * seq_len)});
- auto *temp_out_data = temp_out_tensor.mutable_data<T>(context.GetPlace());
+ auto *temp_out_data = device_ctx.template Alloc<T>(
+     &temp_out_tensor, temp_out_tensor.numel() * sizeof(T));
// (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)
auto blas = phi::funcs::GetBlas<phi::GPUContext, T>(device_ctx);
......@@ -336,8 +340,9 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
// B * head_number * S * S * 1 + B * S * 3 * N * H
int scratch_size = batch * head_number * seq_len * seq_len * 1;
multihead_temp_tensor.Resize({scratch_size + temp_out_tensor.numel()});
- auto *multihead_temp_data =
-     multihead_temp_tensor.mutable_data<T>(context.GetPlace());
+ auto *multihead_temp_data = device_ctx.template Alloc<T>(
+     &multihead_temp_tensor, multihead_temp_tensor.numel() * sizeof(T));
auto *qkptr = multihead_temp_data;
auto *tptr = multihead_temp_data + scratch_size;
......
......@@ -65,7 +65,9 @@ class MatMulKernel : public framework::OpKernel<T> {
auto &y = GET_DATA_SAFELY(
context.Input<framework::Tensor>("Y"), "Input", "Y", "MatMul");
auto *out = context.Output<framework::Tensor>("Out");
- out->mutable_data<T>(context.GetPlace());
+ auto &dev_ctx = context.template device_context<DeviceContext>();
+ dev_ctx.template Alloc<T>(out, out->numel() * sizeof(T));
auto blas = phi::funcs::GetBlas<DeviceContext, T>(context);
auto mat_dim_a = phi::funcs::CreateMatrixDescriptor(
......