未验证 提交 110febdc 编写于 作者: G Guo Sheng 提交者: GitHub

Fix gradients with ignore_idx in softmax_with_cross_entropy (#28622)

* Fix gradients with ignore_idx in softmax_with_cross_entropy.
test=develop

* Fix gradients with ignore_idx in softmax_with_cross_entropy on cpu.
Remove softmax_with_cross_entropy from op_threshold_white_list.
test=develop

* Fix test_softmax_cross_entropy_op.py.
test=develop
上级 a3bc3bcd
@@ -37,11 +37,17 @@ __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
// Scale: multiplies each logit gradient element by the loss gradient of its
// sample, except for samples whose label equals `ignore_index`, whose
// gradients are forced to zero (the fix this commit introduces).
//
// Views (per the indexing below): logit_grad is [n, d] flattened to `num`
// elements; loss_grad and labels are [n, remain].
// NOTE(review): `remain` is presumably d / axis_dim (inner dims after the
// class axis) — confirm against the launching Compute() method.
// Launch: 1-D grid; CUDA_KERNEL_LOOP is Paddle's grid-stride loop macro, so
// any grid size covers all `num` elements.
template <typename T>
__global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
                      const int d, const int remain, const int64_t* labels,
                      const int ignore_index) {
  CUDA_KERNEL_LOOP(index, num) {
    const int row = index / d;            // sample index in [0, n)
    const int inner = index % remain;     // position within the inner dims
    const int label_idx = row * remain + inner;
    const bool ignored = labels[label_idx] == ignore_index;
    // Ignored samples must contribute no gradient.
    logit_grad[index] =
        ignored ? static_cast<T>(0.) : logit_grad[index] * loss_grad[label_idx];
  }
}
@@ -260,6 +266,7 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor {
int idx_remain = idx % remain; int idx_remain = idx % remain;
// labels, loss view as [n, remain] // labels, loss view as [n, remain]
int idx_lbl = idx_n * remain + idx_remain; int idx_lbl = idx_n * remain + idx_remain;
// It also would ignore labels not in range(class_num).
if (idx_axis != labels_[idx_lbl]) { if (idx_axis != labels_[idx_lbl]) {
log_softmax_[idx] = exp_on_device(log_softmax_[idx]); log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
} else { } else {
@@ -513,7 +520,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
int num = n * d; int num = n * d;
grid = (num + block - 1) / block; grid = (num + block - 1) / block;
Scale<T><<<grid, block, 0, stream>>>(logit_grad_data, loss_grad_data, num, Scale<T><<<grid, block, 0, stream>>>(logit_grad_data, loss_grad_data, num,
d, remain); d, remain, label_data, ignore_index);
} }
} }
}; };
......
@@ -82,6 +82,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
} }
const bool soft_label = context.Attr<bool>("soft_label"); const bool soft_label = context.Attr<bool>("soft_label");
auto ignore_index = context.Attr<int>("ignore_index");
const int rank = logit_grad->dims().size(); const int rank = logit_grad->dims().size();
const int axis = CanonicalAxis(context.Attr<int>("axis"), rank); const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
@@ -115,12 +116,18 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
for (int i = 0; i < n; ++i) { for (int i = 0; i < n; ++i) {
for (int j = 0; j < remain; j++) { for (int j = 0; j < remain; j++) {
int idx = i * remain + j; int idx = i * remain + j;
if (label_data[idx] == ignore_index) {
for (int k = 0; k < axis_dim; ++k) {
logit_grad_data[i * d + k * remain + j] = 0;
}
} else {
logit_grad_data[i * d + label_data[idx] * remain + j] -= logit_grad_data[i * d + label_data[idx] * remain + j] -=
out_grad_data[idx]; out_grad_data[idx];
} }
} }
} }
} }
}
}; };
} // namespace operators } // namespace operators
......
@@ -83,9 +83,9 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
self.attrs = { self.attrs = {
"numeric_stable_mode": self.numeric_stable_mode, "numeric_stable_mode": self.numeric_stable_mode,
"soft_label": self.soft_label, "soft_label": self.soft_label,
"ignore_index": self.ignore_index,
} }
if self.ignore_index >= 0:
self.attrs['ignore_index'] = self.ignore_index
if self.axis != -1: if self.axis != -1:
self.attrs['axis'] = self.axis self.attrs['axis'] = self.axis
@@ -93,7 +93,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
self.check_output() self.check_output()
def test_check_grad(self):
    """Numeric-vs-analytic gradient check for the Logits input.

    Uses the tightened tolerance (5e-5) from this commit.
    """
    inputs_to_check = ["Logits"]
    self.check_grad(inputs_to_check, "Loss", max_relative_error=5e-5)
class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp): class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册