From 13baef487d2dbe333491ae8eb939463a0fbaefc2 Mon Sep 17 00:00:00 2001 From: ZhangDY-6483 <64682152+ZhangDY-6483@users.noreply.github.com> Date: Mon, 27 Mar 2023 17:19:26 +0800 Subject: [PATCH] edit formate of mea (#52147) --- paddle/phi/infermeta/backward.cc | 170 ++++++++--------- paddle/phi/infermeta/backward.h | 40 ++-- paddle/phi/infermeta/multiary.cc | 178 +++++++++--------- paddle/phi/infermeta/multiary.h | 36 ++-- .../cutlass/memory_efficient_attention.cu | 2 +- .../memory_efficient_attention_backward.cu | 2 +- 6 files changed, 214 insertions(+), 214 deletions(-) diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index a9d3eafdad6..8acd927f473 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -633,6 +633,91 @@ void MaxPoolWithIndexGradInferMeta(const MetaTensor& x, dx->share_meta(x); } +void MemoryEfficientAttentionGradInferMeta(const MetaTensor& query, + const MetaTensor& key, + const MetaTensor& value, + const MetaTensor& bias, + const MetaTensor& cu_seqlens_q, + const MetaTensor& cu_seqlens_k, + const MetaTensor& output, + const MetaTensor& logsumexp, + const MetaTensor& seed_and_offset, + const MetaTensor& output_grad, + const Scalar& max_seqlen_q, + const Scalar& max_seqlen_k, + const bool causal, + const double dropout_p, + const float scale, + MetaTensor* query_grad, + MetaTensor* key_grad, + MetaTensor* value_grad, + MetaTensor* bias_grad) { + PADDLE_ENFORCE_EQ( + output_grad.dims().size(), + 4, + phi::errors::InvalidArgument("Key should be a 4-D tensor" + "But received Key dimension(%s)", + output_grad.dims().size())); + PADDLE_ENFORCE_EQ( + output.dims().size(), + 4, + phi::errors::InvalidArgument("Key should be a 4-D tensor" + "But received Key dimension(%s)", + output_grad.dims().size())); + + const int64_t query_batch_size = query.dims()[0]; + const int64_t query_seq_length = query.dims()[1]; + const int64_t query_num_head = query.dims()[2]; + const int64_t query_head_size = query.dims()[3]; + + const int64_t key_batch_size = key.dims()[0]; + const int64_t key_seq_length = key.dims()[1]; + const int64_t key_num_head = key.dims()[2]; + const int64_t key_head_size = key.dims()[3]; + + const int64_t value_batch_size = value.dims()[0]; + const int64_t value_seq_length = value.dims()[1]; + const int64_t value_num_head = value.dims()[2]; + const int64_t value_head_size = value.dims()[3]; + + std::vector query_grad_dims( + {query_batch_size, query_seq_length, query_num_head, query_head_size}); + std::vector key_grad_dims( + {key_batch_size, key_seq_length, key_num_head, key_head_size}); + std::vector value_grad_dims( + {value_batch_size, value_seq_length, value_num_head, value_head_size}); + + query_grad->set_dims(phi::make_ddim(query_grad_dims)); + query_grad->share_lod(query); + query_grad->set_dtype(query.dtype()); + query_grad->set_layout(query.layout()); + + key_grad->set_dims(phi::make_ddim(key_grad_dims)); + key_grad->share_lod(key); + key_grad->set_dtype(key.dtype()); + key_grad->set_layout(key.layout()); + + value_grad->set_dims(phi::make_ddim(value_grad_dims)); + value_grad->share_lod(value); + value_grad->set_dtype(value.dtype()); + value_grad->set_layout(value.layout()); + + if (bias) { + const int64_t bias_batch_size = bias.dims()[0]; + const int64_t bias_seq_length = bias.dims()[1]; + const int64_t bias_num_head = bias.dims()[2]; + const int64_t bias_head_size = bias.dims()[3]; + + std::vector bias_grad_dims( + {bias_batch_size, bias_seq_length, bias_num_head, bias_head_size}); + + bias_grad->set_dims(phi::make_ddim(bias_grad_dims)); + bias_grad->share_lod(bias); + bias_grad->set_dtype(bias.dtype()); + bias_grad->set_layout(bias.layout()); + } +} + void MeshgridGradInferMeta(const std::vector& inputs, const std::vector& outputs_grad, std::vector inputs_grad) { @@ -1052,89 +1137,4 @@ void IndexAddGradInferMeta(const MetaTensor& index, } } -void MemoryEfficientAttentionGradInferMeta(const MetaTensor& query, - const MetaTensor& key, - const MetaTensor& value, - const MetaTensor& bias, - const MetaTensor& cu_seqlens_q, - const MetaTensor& cu_seqlens_k, - const MetaTensor& output, - const MetaTensor& logsumexp, - const MetaTensor& seed_and_offset, - const MetaTensor& output_grad, - const Scalar& max_seqlen_q, - const Scalar& max_seqlen_k, - const bool causal, - const double dropout_p, - const float scale, - MetaTensor* query_grad, - MetaTensor* key_grad, - MetaTensor* value_grad, - MetaTensor* bias_grad) { - PADDLE_ENFORCE_EQ( - output_grad.dims().size(), - 4, - phi::errors::InvalidArgument("Key should be a 4-D tensor" - "But received Key dimension(%s)", - output_grad.dims().size())); - PADDLE_ENFORCE_EQ( - output.dims().size(), - 4, - phi::errors::InvalidArgument("Key should be a 4-D tensor" - "But received Key dimension(%s)", - output_grad.dims().size())); - - const int64_t query_batch_size = query.dims()[0]; - const int64_t query_seq_length = query.dims()[1]; - const int64_t query_num_head = query.dims()[2]; - const int64_t query_head_size = query.dims()[3]; - - const int64_t key_batch_size = key.dims()[0]; - const int64_t key_seq_length = key.dims()[1]; - const int64_t key_num_head = key.dims()[2]; - const int64_t key_head_size = key.dims()[3]; - - const int64_t value_batch_size = value.dims()[0]; - const int64_t value_seq_length = value.dims()[1]; - const int64_t value_num_head = value.dims()[2]; - const int64_t value_head_size = value.dims()[3]; - - std::vector query_grad_dims( - {query_batch_size, query_seq_length, query_num_head, query_head_size}); - std::vector key_grad_dims( - {key_batch_size, key_seq_length, key_num_head, key_head_size}); - std::vector value_grad_dims( - {value_batch_size, value_seq_length, value_num_head, value_head_size}); - - query_grad->set_dims(phi::make_ddim(query_grad_dims)); - query_grad->share_lod(query); - query_grad->set_dtype(query.dtype()); - query_grad->set_layout(query.layout()); - - key_grad->set_dims(phi::make_ddim(key_grad_dims)); - key_grad->share_lod(key); - key_grad->set_dtype(key.dtype()); - key_grad->set_layout(key.layout()); - - value_grad->set_dims(phi::make_ddim(value_grad_dims)); - value_grad->share_lod(value); - value_grad->set_dtype(value.dtype()); - value_grad->set_layout(value.layout()); - - if (bias) { - const int64_t bias_batch_size = bias.dims()[0]; - const int64_t bias_seq_length = bias.dims()[1]; - const int64_t bias_num_head = bias.dims()[2]; - const int64_t bias_head_size = bias.dims()[3]; - - std::vector bias_grad_dims( - {bias_batch_size, bias_seq_length, bias_num_head, bias_head_size}); - - bias_grad->set_dims(phi::make_ddim(bias_grad_dims)); - bias_grad->share_lod(bias); - bias_grad->set_dtype(bias.dtype()); - bias_grad->set_layout(bias.layout()); - } -} - } // namespace phi diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 8f095220655..e65ba2085e6 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -294,6 +294,26 @@ void MeshgridGradInferMeta(const std::vector& inputs, const std::vector& outputs_grad, std::vector inputs_grad); +void MemoryEfficientAttentionGradInferMeta(const MetaTensor& query, + const MetaTensor& key, + const MetaTensor& value, + const MetaTensor& bias, + const MetaTensor& cu_seqlens_q, + const MetaTensor& cu_seqlens_k, + const MetaTensor& output, + const MetaTensor& logsumexp, + const MetaTensor& seed_and_offset, + const MetaTensor& output_grad, + const Scalar& max_seqlen_q, + const Scalar& max_seqlen_k, + const bool causal, + const double dropout_p, + const float scale, + MetaTensor* query_grad, + MetaTensor* key_grad, + MetaTensor* value_grad, + MetaTensor* bias_grad); + void MultiDotGradInferMeta(const std::vector& x, const MetaTensor& out_grad, std::vector x_grad); @@ -418,24 +438,4 @@ void IndexAddGradInferMeta(const MetaTensor& index, MetaTensor* x_grad, MetaTensor* add_tensor_grad); -void MemoryEfficientAttentionGradInferMeta(const MetaTensor& query, - const MetaTensor& key, - const MetaTensor& value, - const MetaTensor& bias, - const MetaTensor& cu_seqlens_q, - const MetaTensor& cu_seqlens_k, - const MetaTensor& output, - const MetaTensor& logsumexp, - const MetaTensor& seed_and_offset, - const MetaTensor& output_grad, - const Scalar& max_seqlen_q, - const Scalar& max_seqlen_k, - const bool causal, - const double dropout_p, - const float scale, - MetaTensor* query_grad, - MetaTensor* key_grad, - MetaTensor* value_grad, - MetaTensor* bias_grad); - } // namespace phi diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 73e0ff975d9..14a1cba1f33 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -2112,6 +2112,95 @@ void MergedMomentumInferMeta( std::vector velocity_out, std::vector master_param_out) {} +void MemoryEfficientAttentionInferMeta(const MetaTensor& query, + const MetaTensor& key, + const MetaTensor& value, + const MetaTensor& bias, + const MetaTensor& cu_seqlens_q, + const MetaTensor& cu_seqlens_k, + const MetaTensor& causal_diagonal, + const MetaTensor& seqlen_k, + const Scalar& max_seqlen_q, + const Scalar& max_seqlen_k, + const bool causal, + const double dropout_p, + const float scale, + const bool is_test, + MetaTensor* output, + MetaTensor* logsumexp, + MetaTensor* seed_and_offset) { + PADDLE_ENFORCE_EQ( + query.dims().size(), + 4, + phi::errors::InvalidArgument("Query should be a 4-D tensor" + "But received Query dimension(%s)", + query.dims().size())); + PADDLE_ENFORCE_EQ( + key.dims().size(), + 4, + phi::errors::InvalidArgument("Key should be a 4-D tensor" + "But received Key dimension(%s)", + key.dims().size())); + PADDLE_ENFORCE_EQ( + value.dims().size(), + 4, + phi::errors::InvalidArgument("Value should be a 4-D tensor" + "But received Value dimension(%s)", + value.dims().size())); + + const int64_t query_batch_size = query.dims()[0]; + const int64_t query_seq_length = query.dims()[1]; + const int64_t query_num_head = query.dims()[2]; + const int64_t query_head_size = query.dims()[3]; + + const int64_t key_batch_size = key.dims()[0]; + const int64_t key_seq_length = key.dims()[1]; + const int64_t key_num_head = key.dims()[2]; + const int64_t key_head_size = key.dims()[3]; + + const int64_t value_batch_size = value.dims()[0]; + const int64_t value_seq_length = value.dims()[1]; + const int64_t value_num_head = value.dims()[2]; + const int64_t value_head_size = value.dims()[3]; + + PADDLE_ENFORCE_EQ(((query_batch_size == key_batch_size) && + (key_batch_size == value_batch_size)), + true, + phi::errors::InvalidArgument( + "The batchsize of Query, Key, Value should be equal.")); + + PADDLE_ENFORCE_EQ( + ((query_num_head == key_num_head) && (key_num_head == value_num_head)), + true, + phi::errors::InvalidArgument( + "The head number of Query, Key, Value should be equal.")); + + PADDLE_ENFORCE_EQ(query_head_size == key_head_size, + true, + phi::errors::InvalidArgument( + "The head size of Query, Key should be equal.")); + + PADDLE_ENFORCE_EQ(key_seq_length == value_seq_length, + true, + phi::errors::InvalidArgument( + "The seq length of Key, Value should be equal.")); + std::vector out_dims( + {query_batch_size, query_seq_length, query_num_head, value_head_size}); + std::vector logsumexp_dims({query_num_head, query_batch_size}); + std::vector seed_and_offset_dims({2}); + + output->set_dims(phi::make_ddim(out_dims)); + output->share_lod(query); + output->set_dtype(query.dtype()); + output->set_layout(query.layout()); + + logsumexp->set_dims(phi::make_ddim(logsumexp_dims)); + logsumexp->set_dtype(phi::DataType::FLOAT32); + + seed_and_offset->set_dims(phi::make_ddim(seed_and_offset_dims)); + seed_and_offset->set_dtype(phi::DataType::INT64); +} + void MeshgridInferMeta(const std::vector& inputs, std::vector outputs) { const size_t inputs_num = inputs.size(); @@ -3129,94 +3218,5 @@ void MoeInferMeta(const MetaTensor& x, out->set_layout(x.layout()); } -void MemoryEfficientAttentionInferMeta(const MetaTensor& query, - const MetaTensor& key, - const MetaTensor& value, - const MetaTensor& bias, - const MetaTensor& cu_seqlens_q, - const MetaTensor& cu_seqlens_k, - const MetaTensor& causal_diagonal, - const MetaTensor& seqlen_k, - const Scalar& max_seqlen_q, - const Scalar& max_seqlen_k, - const bool causal, - const double dropout_p, - const float scale, - const bool is_test, - MetaTensor* output, - MetaTensor* logsumexp, - MetaTensor* seed_and_offset) { - PADDLE_ENFORCE_EQ( - query.dims().size(), - 4, - phi::errors::InvalidArgument("Query should be a 4-D tensor" - "But received Query dimension(%s)", - query.dims().size())); - PADDLE_ENFORCE_EQ( - key.dims().size(), - 4, - phi::errors::InvalidArgument("Key should be a 4-D tensor" - "But received Key dimension(%s)", - key.dims().size())); - PADDLE_ENFORCE_EQ( - value.dims().size(), - 4, - phi::errors::InvalidArgument("Value should be a 4-D tensor" - "But received Value dimension(%s)", - value.dims().size())); - - const int64_t query_batch_size = query.dims()[0]; - const int64_t query_seq_length = query.dims()[1]; - const int64_t query_num_head = query.dims()[2]; - const int64_t query_head_size = query.dims()[3]; - - const int64_t key_batch_size = key.dims()[0]; - const int64_t key_seq_length = key.dims()[1]; - const int64_t key_num_head = key.dims()[2]; - const int64_t key_head_size = key.dims()[3]; - - const int64_t value_batch_size = value.dims()[0]; - const int64_t value_seq_length = value.dims()[1]; - const int64_t value_num_head = value.dims()[2]; - const int64_t value_head_size = value.dims()[3]; - - PADDLE_ENFORCE_EQ(((query_batch_size == key_batch_size) && - (key_batch_size == value_batch_size)), - true, - phi::errors::InvalidArgument( - "The batchsize of Query, Key, Value should be equal.")); - - PADDLE_ENFORCE_EQ( - ((query_num_head == key_num_head) && (key_num_head == value_num_head)), - true, - phi::errors::InvalidArgument( - "The head number of Query, Key, Value should be equal.")); - - PADDLE_ENFORCE_EQ(query_head_size == key_head_size, - true, - phi::errors::InvalidArgument( - "The head size of Query, Key should be equal.")); - - PADDLE_ENFORCE_EQ(key_seq_length == value_seq_length, - true, - phi::errors::InvalidArgument( - "The seq length of Key, Value should be equal.")); - std::vector out_dims( - {query_batch_size, query_seq_length, query_num_head, value_head_size}); - std::vector logsumexp_dims({query_num_head, query_batch_size}); - std::vector seed_and_offset_dims({2}); - - output->set_dims(phi::make_ddim(out_dims)); - output->share_lod(query); - output->set_dtype(query.dtype()); - output->set_layout(query.layout()); - - logsumexp->set_dims(phi::make_ddim(logsumexp_dims)); - logsumexp->set_dtype(phi::DataType::FLOAT32); - - seed_and_offset->set_dims(phi::make_ddim(seed_and_offset_dims)); - seed_and_offset->set_dtype(phi::DataType::INT64); -} - } // namespace phi PD_REGISTER_INFER_META_FN(batch_norm_infer, phi::BatchNormInferInferMeta); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index baf7ec6c956..a8a74fdb5d3 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -398,6 +398,24 @@ void MergedMomentumInferMeta( std::vector velocity_out, std::vector master_param_out); +void MemoryEfficientAttentionInferMeta(const MetaTensor& query, + const MetaTensor& key, + const MetaTensor& value, + const MetaTensor& bias, + const MetaTensor& cu_seqlens_q, + const MetaTensor& cu_seqlens_k, + const MetaTensor& causal_diagonal, + const MetaTensor& seqlen_k, + const Scalar& max_seqlen_q, + const Scalar& max_seqlen_k, + const bool causal, + const double dropout_p, + const float scale, + const bool is_test, + MetaTensor* output, + MetaTensor* logsumexp, + MetaTensor* seed_and_offset); + void MeshgridInferMeta(const std::vector& inputs, std::vector outputs); @@ -587,22 +605,4 @@ void MoeInferMeta(const MetaTensor& x, const std::string& act_type, MetaTensor* out); -void MemoryEfficientAttentionInferMeta(const MetaTensor& query, - const MetaTensor& key, - const MetaTensor& value, - const MetaTensor& bias, - const MetaTensor& cu_seqlens_q, - const MetaTensor& cu_seqlens_k, - const MetaTensor& causal_diagonal, - const MetaTensor& seqlen_k, - const Scalar& max_seqlen_q, - const Scalar& max_seqlen_k, - const bool causal, - const double dropout_p, - const float scale, - const bool is_test, - MetaTensor* output, - MetaTensor* logsumexp, - MetaTensor* seed_and_offset); - } // namespace phi diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu index e1a91a8a8bb..62ef34e00d9 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_backward.cu b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_backward.cu index ac9eb64c120..3f529d32b93 100644 --- a/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_backward.cu +++ b/paddle/phi/kernels/fusion/cutlass/memory_efficient_attention_backward.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. -- GitLab