From da963eab7aa891e3758bbf38e00f567ec76b6ebb Mon Sep 17 00:00:00 2001
From: Yuang Liu
Date: Sat, 6 May 2023 12:30:06 +0800
Subject: [PATCH] use int64 to calc dim for c softmax (#53541)

* use int64 to calc dim for c softmax

* fix compile bug
---
 .../c_softmax_with_cross_entropy_op.cu       | 50 +++++++++----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
index 114c52f608f..5a786898197 100644
--- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
@@ -30,9 +30,9 @@ namespace paddle {
 namespace operators {
 
 static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaxinumNumBlocks = 4096;
+static constexpr int64_t kNumMaxinumNumBlocks = 4096;
 
-static inline int NumBlocks(const int N) {
+static inline int64_t NumBlocks(const int64_t N) {
   return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                   kNumMaxinumNumBlocks);
 }
@@ -42,12 +42,12 @@ __global__ void MaskLabelByIndex(T* predicted_logits,
                                  const T* logit,
                                  const IndexT* label,
                                  const IndexT ignore_index,
-                                 const int start_index,
-                                 const int end_index,
+                                 const int64_t start_index,
+                                 const int64_t end_index,
                                  const int64_t N,
                                  const int64_t D,
                                  const int nranks) {
-  CUDA_KERNEL_LOOP(i, N) {
+  CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) {
     auto real_label = label[i];
     PADDLE_ENFORCE(((real_label < D * nranks) && (real_label >= 0)) ||
                        (real_label == ignore_index),
@@ -71,8 +71,8 @@ __global__ void CaculateLoss(T* loss,
                              const T* sum_exp_logits,
                              const IndexT* label,
                              const int64_t ignore_index,
-                             const int N) {
-  CUDA_KERNEL_LOOP(i, N) {
+                             const int64_t N) {
+  CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) {
     auto real_label = static_cast<int64_t>(label[i]);
     loss[i] = ignore_index == real_label
                   ? static_cast<T>(0)
@@ -87,12 +87,12 @@ template <typename T, typename IndexT>
 __global__ void MaskLabelByIndexGrad(T* logits_grad,
                                      const T* loss_grad,
                                      const IndexT* labels,
-                                     const int start_index,
-                                     const int end_index,
+                                     const int64_t start_index,
+                                     const int64_t end_index,
                                      const int64_t N,
                                      const int64_t D,
                                      const int64_t ignore_index) {
-  CUDA_KERNEL_LOOP(i, N * D) {
+  CUDA_KERNEL_LOOP_TYPE(i, N * D, int64_t) {
     auto row = i / D;
     auto col = i % D;
     auto lbl = static_cast<int64_t>(labels[row]);
@@ -152,8 +152,8 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     const auto& labels_dims = labels->dims();
 
     const int axis = logits_dims.size() - 1;
-    const int N = phi::funcs::SizeToAxis(axis, logits_dims);
-    const int D = phi::funcs::SizeFromAxis(axis, logits_dims);
+    const int64_t N = phi::funcs::SizeToAxis(axis, logits_dims);
+    const int64_t D = phi::funcs::SizeFromAxis(axis, logits_dims);
 
     phi::DenseTensor logits_2d, softmax_2d, loss_2d;
     logits_2d.ShareDataWith(*logits).Resize({N, D});
@@ -200,10 +200,10 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     auto t = framework::EigenVector<T>::Flatten(predicted_logits);
     t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
 
-    const int start_index = rank * D;
-    const int end_index = start_index + D;
+    const int64_t start_index = rank * D;
+    const int64_t end_index = start_index + D;
 
-    int blocks = NumBlocks(N);
+    int64_t blocks = NumBlocks(N);
     int threads = kNumCUDAThreads;
     const auto& label_type = framework::TransToProtoVarType(labels->dtype());
 
@@ -318,8 +318,8 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     const auto& labels_dims = labels->dims();
 
     const int axis = logits_dims.size() - 1;
-    const int N = phi::funcs::SizeToAxis(axis, logits_dims);
-    const int D = phi::funcs::SizeFromAxis(axis, logits_dims);
+    const int64_t N = phi::funcs::SizeToAxis(axis, logits_dims);
+    const int64_t D = phi::funcs::SizeFromAxis(axis, logits_dims);
 
     phi::DenseTensor logits_2d, softmax_2d, loss_2d;
     logits_2d.ShareDataWith(*logits).Resize({N, D});
@@ -358,10 +358,10 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     auto t = framework::EigenVector<T>::Flatten(predicted_logits);
     t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
 
-    const int start_index = rank * D;
-    const int end_index = start_index + D;
+    const int64_t start_index = rank * D;
+    const int64_t end_index = start_index + D;
 
-    int blocks = NumBlocks(N);
+    int64_t blocks = NumBlocks(N);
     int threads = kNumCUDAThreads;
     const auto& label_type = framework::TransToProtoVarType(labels->dtype());
 
@@ -454,17 +454,17 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
     }
     const auto sofrmax_dims = softmax->dims();
     const int axis = sofrmax_dims.size() - 1;
-    const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims);
-    const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims);
+    const int64_t N = phi::funcs::SizeToAxis(axis, sofrmax_dims);
+    const int64_t D = phi::funcs::SizeFromAxis(axis, sofrmax_dims);
 
     phi::DenseTensor logit_grad_2d;
     logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D});
 
-    int blocks = NumBlocks(N * D);
+    int64_t blocks = NumBlocks(N * D);
     int threads = kNumCUDAThreads;
     const auto& label_type = framework::TransToProtoVarType(labels->dtype());
-    const int start_index = rank * D;
-    const int end_index = start_index + D;
+    const int64_t start_index = rank * D;
+    const int64_t end_index = start_index + D;
 
     if (label_type == framework::proto::VarType::INT32) {
       MaskLabelByIndexGrad<T, int32_t>
--
GitLab
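
For context on why the patch moves this arithmetic to int64_t: the operator flattens the logits into an N x D matrix, and the backward pass loops over all N * D elements. The minimal, self-contained host-side sketch below is not part of the patch; the sizes are hypothetical, and only NumBlocks and the two constants mirror the diff above. It shows that for a realistic per-rank vocabulary slice the product N * D already exceeds INT32_MAX, which is the overflow the switch to 64-bit indices guards against.

// A minimal sketch of the overflow the patch guards against. The sizes are
// hypothetical; NumBlocks and the two constants mirror the diff above.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static constexpr int kNumCUDAThreads = 512;
static constexpr int64_t kNumMaxinumNumBlocks = 4096;

// Same shape as the patched helper: the division happens in int64_t.
static inline int64_t NumBlocks(const int64_t n) {
  return std::min((n + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaxinumNumBlocks);
}

int main() {
  const int64_t N = 64 * 1024;  // rows after flattening, e.g. batch * seq_len
  const int64_t D = 40000;      // per-rank slice of the vocabulary

  // N * D = 2,621,440,000 > INT32_MAX (2,147,483,647), so computing the grad
  // kernel's loop bound or NumBlocks(N * D) in 32-bit int would overflow.
  const int64_t total = N * D;
  std::printf("N * D = %lld, blocks = %lld\n",
              static_cast<long long>(total),
              static_cast<long long>(NumBlocks(total)));
  return 0;
}

Because the block count is capped at 4096, the kernel strides over the remaining elements, which is also why the loop index itself is typed as int64_t via CUDA_KERNEL_LOOP_TYPE in the patched kernels.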