Unverified commit da963eab, authored by Yuang Liu, committed by GitHub

use int64 to calc dim for c softmax (#53541)

* use int64 to calc dim for c softmax

* fix compile bug
Parent 03fe3ce5
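For context on why the widening matters: after flattening, the per-rank logits live in an N x D buffer, and with a large batch * sequence length (N) and a large per-rank vocabulary shard (D), N * D can exceed INT_MAX, so 32-bit loop indices, offsets, and block counts overflow. Below is a minimal, self-contained sketch of the 64-bit grid-stride-loop pattern the diff moves to; the names (NumBlocks64, ScaleKernel, kThreads, kMaxBlocks) are illustrative stand-ins, not Paddle's helpers (the diff itself uses Paddle's CUDA_KERNEL_LOOP_TYPE macro).

#include <algorithm>
#include <cstdint>

// Illustrative constants mirroring the diff: a fixed thread count and a
// capped grid size, with the remaining work covered by a grid-stride loop.
static constexpr int kThreads = 512;
static constexpr int64_t kMaxBlocks = 4096;

static inline int64_t NumBlocks64(int64_t n) {
  return std::min((n + kThreads - 1) / kThreads, kMaxBlocks);
}

// Grid-stride loop with 64-bit indexing: correct even when n (here, N * D for
// an N x D logits slab) exceeds INT_MAX and a 32-bit index would wrap around.
__global__ void ScaleKernel(float* data, int64_t n, float factor) {
  for (int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       i < n;
       i += static_cast<int64_t>(gridDim.x) * blockDim.x) {
    data[i] *= factor;
  }
}

// Hypothetical launch site: the block count is derived from the 64-bit
// element count but stays capped at kMaxBlocks.
// ScaleKernel<<<NumBlocks64(n), kThreads>>>(d_ptr, n, 0.5f);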
@@ -30,9 +30,9 @@ namespace paddle {
 namespace operators {
 static constexpr int kNumCUDAThreads = 512;
-static constexpr int kNumMaxinumNumBlocks = 4096;
+static constexpr int64_t kNumMaxinumNumBlocks = 4096;
-static inline int NumBlocks(const int N) {
+static inline int64_t NumBlocks(const int64_t N) {
   return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                   kNumMaxinumNumBlocks);
 }
@@ -42,12 +42,12 @@ __global__ void MaskLabelByIndex(T* predicted_logits,
                                  const T* logit,
                                  const IndexT* label,
                                  const IndexT ignore_index,
-                                 const int start_index,
-                                 const int end_index,
+                                 const int64_t start_index,
+                                 const int64_t end_index,
                                  const int64_t N,
                                  const int64_t D,
                                  const int nranks) {
-  CUDA_KERNEL_LOOP(i, N) {
+  CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) {
     auto real_label = label[i];
     PADDLE_ENFORCE(((real_label < D * nranks) && (real_label >= 0)) ||
                        (real_label == ignore_index),
@@ -71,8 +71,8 @@ __global__ void CaculateLoss(T* loss,
                              const T* sum_exp_logits,
                              const IndexT* label,
                              const int64_t ignore_index,
-                             const int N) {
+                             const int64_t N) {
-  CUDA_KERNEL_LOOP(i, N) {
+  CUDA_KERNEL_LOOP_TYPE(i, N, int64_t) {
     auto real_label = static_cast<int64_t>(label[i]);
     loss[i] = ignore_index == real_label
                   ? static_cast<T>(0)
@@ -87,12 +87,12 @@ template <typename T, typename IndexT>
 __global__ void MaskLabelByIndexGrad(T* logits_grad,
                                      const T* loss_grad,
                                      const IndexT* labels,
-                                     const int start_index,
-                                     const int end_index,
+                                     const int64_t start_index,
+                                     const int64_t end_index,
                                      const int64_t N,
                                      const int64_t D,
                                      const int64_t ignore_index) {
-  CUDA_KERNEL_LOOP(i, N * D) {
+  CUDA_KERNEL_LOOP_TYPE(i, N * D, int64_t) {
     auto row = i / D;
     auto col = i % D;
     auto lbl = static_cast<int64_t>(labels[row]);
@@ -152,8 +152,8 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     const auto& labels_dims = labels->dims();
     const int axis = logits_dims.size() - 1;
-    const int N = phi::funcs::SizeToAxis(axis, logits_dims);
-    const int D = phi::funcs::SizeFromAxis(axis, logits_dims);
+    const int64_t N = phi::funcs::SizeToAxis<int64_t>(axis, logits_dims);
+    const int64_t D = phi::funcs::SizeFromAxis<int64_t>(axis, logits_dims);
     phi::DenseTensor logits_2d, softmax_2d, loss_2d;
     logits_2d.ShareDataWith(*logits).Resize({N, D});
@@ -200,10 +200,10 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     auto t = framework::EigenVector<T>::Flatten(predicted_logits);
     t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
-    const int start_index = rank * D;
-    const int end_index = start_index + D;
+    const int64_t start_index = rank * D;
+    const int64_t end_index = start_index + D;
-    int blocks = NumBlocks(N);
+    int64_t blocks = NumBlocks(N);
     int threads = kNumCUDAThreads;
     const auto& label_type = framework::TransToProtoVarType(labels->dtype());
@@ -318,8 +318,8 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     const auto& labels_dims = labels->dims();
     const int axis = logits_dims.size() - 1;
-    const int N = phi::funcs::SizeToAxis(axis, logits_dims);
-    const int D = phi::funcs::SizeFromAxis(axis, logits_dims);
+    const int64_t N = phi::funcs::SizeToAxis<int64_t>(axis, logits_dims);
+    const int64_t D = phi::funcs::SizeFromAxis<int64_t>(axis, logits_dims);
     phi::DenseTensor logits_2d, softmax_2d, loss_2d;
     logits_2d.ShareDataWith(*logits).Resize({N, D});
@@ -358,10 +358,10 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     auto t = framework::EigenVector<T>::Flatten(predicted_logits);
     t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
-    const int start_index = rank * D;
-    const int end_index = start_index + D;
+    const int64_t start_index = rank * D;
+    const int64_t end_index = start_index + D;
-    int blocks = NumBlocks(N);
+    int64_t blocks = NumBlocks(N);
     int threads = kNumCUDAThreads;
     const auto& label_type = framework::TransToProtoVarType(labels->dtype());
@@ -454,17 +454,17 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
     }
     const auto sofrmax_dims = softmax->dims();
     const int axis = sofrmax_dims.size() - 1;
-    const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims);
-    const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims);
+    const int64_t N = phi::funcs::SizeToAxis<int64_t>(axis, sofrmax_dims);
+    const int64_t D = phi::funcs::SizeFromAxis<int64_t>(axis, sofrmax_dims);
     phi::DenseTensor logit_grad_2d;
     logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D});
-    int blocks = NumBlocks(N * D);
+    int64_t blocks = NumBlocks(N * D);
     int threads = kNumCUDAThreads;
     const auto& label_type = framework::TransToProtoVarType(labels->dtype());
-    const int start_index = rank * D;
-    const int end_index = start_index + D;
+    const int64_t start_index = rank * D;
+    const int64_t end_index = start_index + D;
     if (label_type == framework::proto::VarType::INT32) {
       MaskLabelByIndexGrad<T, int32_t>
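A plausible reading of the "fix compile bug" follow-up (an inference, not stated in the commit): once NumBlocks takes an int64_t N, the rounded-up division yields an int64_t, and std::min cannot deduce a common template argument if the block cap stays a plain int, so kNumMaxinumNumBlocks is widened as well. The post-change helper, reproduced with a comment to that effect:

#include <algorithm>
#include <cstdint>

static constexpr int kNumCUDAThreads = 512;            // unchanged in the diff
static constexpr int64_t kNumMaxinumNumBlocks = 4096;  // widened to int64_t

static inline int64_t NumBlocks(const int64_t N) {
  // (N + 511) / 512 is int64_t; if the cap were still a plain `int`,
  // std::min(const T&, const T&) could not deduce T from {int64_t, int}
  // and the call would not compile. Widening both operands to int64_t
  // keeps the call well-formed.
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaxinumNumBlocks);
}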