diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index 3ac7a5a127b37972c0209cd12a16634ef83b5b7c..f86f02544dc980e7235aeaf3733d560b581ba6dc 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -37,11 +37,17 @@ __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
 
 template <typename T>
 __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
-                      const int d, const int remain) {
+                      const int d, const int remain, const int64_t* labels,
+                      const int ignore_index) {
   CUDA_KERNEL_LOOP(index, num) {
     int idx_n = index / d;
     int idx_remain = index % remain;
-    logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
+    int idx_lbl = idx_n * remain + idx_remain;
+    if (labels[idx_lbl] == ignore_index) {
+      logit_grad[index] = static_cast<T>(0.);
+    } else {
+      logit_grad[index] *= loss_grad[idx_lbl];
+    }
   }
 }
 
@@ -260,6 +266,7 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor {
     int idx_remain = idx % remain;
     // labels, loss view as [n, remain]
     int idx_lbl = idx_n * remain + idx_remain;
+    // Labels that fall outside [0, class_num) are also ignored.
     if (idx_axis != labels_[idx_lbl]) {
       log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
     } else {
@@ -513,7 +520,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
       int num = n * d;
       grid = (num + block - 1) / block;
       Scale<T><<<grid, block, 0, stream>>>(logit_grad_data, loss_grad_data, num,
-                                           d, remain);
+                                           d, remain, label_data, ignore_index);
     }
   }
 };
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
index cebd466f361d1e12cb927689b47d2579b5085dc3..93f2552c3cee90a3eb1a948494fb231a41f6f74d 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
@@ -82,6 +82,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
     }
     const bool soft_label = context.Attr<bool>("soft_label");
+    auto ignore_index = context.Attr<int>("ignore_index");
 
     const int rank = logit_grad->dims().size();
     const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
@@ -115,8 +116,14 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
       for (int i = 0; i < n; ++i) {
         for (int j = 0; j < remain; j++) {
           int idx = i * remain + j;
-          logit_grad_data[i * d + label_data[idx] * remain + j] -=
-              out_grad_data[idx];
+          if (label_data[idx] == ignore_index) {
+            for (int k = 0; k < axis_dim; ++k) {
+              logit_grad_data[i * d + k * remain + j] = 0;
+            }
+          } else {
+            logit_grad_data[i * d + label_data[idx] * remain + j] -=
+                out_grad_data[idx];
+          }
         }
       }
     }
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
index df2a0a523ad1ef05b462c8f8044c83e76d91f8a3..0ee58d5be15e60f50b0d6f4d0fc7c55075b81aea 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
@@ -83,9 +83,9 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         self.attrs = {
             "numeric_stable_mode": self.numeric_stable_mode,
             "soft_label": self.soft_label,
+            "ignore_index": self.ignore_index,
         }
-        if self.ignore_index >= 0:
-            self.attrs['ignore_index'] = self.ignore_index
+
         if self.axis != -1:
             self.attrs['axis'] = self.axis
 
@@ -93,7 +93,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
+        self.check_grad(["Logits"], "Loss", max_relative_error=5e-5)
 
 
 class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
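
For readers following the patch, below is a minimal NumPy sketch of the backward behavior the CUDA Scale kernel and the CPU loop now share: a sample whose label equals ignore_index gets an all-zero gradient row instead of softmax - one_hot. The helper name and the ignore_index value of -1 are illustrative, not part of Paddle's API.

# Reference sketch (assumed shapes: logits [n, class_num], hard labels [n]).
import numpy as np

def grad_with_ignore_index(logits, labels, loss_grad, ignore_index=-1):
    # Numerically stable softmax over the class axis.
    shifted = logits - logits.max(axis=1, keepdims=True)
    softmax = np.exp(shifted)
    softmax /= softmax.sum(axis=1, keepdims=True)

    # dLoss/dLogits = softmax - one_hot(label), scaled by the upstream grad.
    grad = softmax
    valid = labels != ignore_index
    rows = np.flatnonzero(valid)
    grad[rows, labels[rows]] -= 1.0
    grad *= loss_grad[:, np.newaxis]

    # Ignored samples contribute a zero gradient, as in the patch.
    grad[~valid] = 0.0
    return grad

logits = np.random.randn(4, 5)
labels = np.array([2, -1, 0, 3])  # the second sample is ignored
print(grad_with_ignore_index(logits, labels, np.ones(4))[1])  # all zeros

The tightened tolerance in test_check_grad (0.05 down to 5e-5) is consistent with this: once ignored samples are zeroed analytically rather than left with stale values, the kernel's gradient matches the numeric reference closely enough for a much stricter check.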