diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index 3ac7a5a127b37972c0209cd12a16634ef83b5b7c..f86f02544dc980e7235aeaf3733d560b581ba6dc 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -37,11 +37,17 @@ __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
 
 template <typename T>
 __global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
-                      const int d, const int remain) {
+                      const int d, const int remain, const int64_t* labels,
+                      const int ignore_index) {
   CUDA_KERNEL_LOOP(index, num) {
     int idx_n = index / d;
     int idx_remain = index % remain;
-    logit_grad[index] *= loss_grad[idx_n * remain + idx_remain];
+    int idx_lbl = idx_n * remain + idx_remain;
+    if (labels[idx_lbl] == ignore_index) {
+      logit_grad[index] = static_cast<T>(0.);
+    } else {
+      logit_grad[index] *= loss_grad[idx_lbl];
+    }
   }
 }
 
@@ -260,6 +266,7 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor {
     int idx_remain = idx % remain;
     // labels, loss view as [n, remain]
     int idx_lbl = idx_n * remain + idx_remain;
+    // Labels that fall outside [0, class_num) are also ignored.
     if (idx_axis != labels_[idx_lbl]) {
       log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
     } else {
@@ -513,7 +520,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
       int num = n * d;
       grid = (num + block - 1) / block;
       Scale<T><<<grid, block, 0, stream>>>(logit_grad_data, loss_grad_data, num,
-                                           d, remain);
+                                           d, remain, label_data, ignore_index);
     }
   }
 };
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
index cebd466f361d1e12cb927689b47d2579b5085dc3..93f2552c3cee90a3eb1a948494fb231a41f6f74d 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
@@ -82,6 +82,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
     }
     const bool soft_label = context.Attr<bool>("soft_label");
+    auto ignore_index = context.Attr<int>("ignore_index");
 
     const int rank = logit_grad->dims().size();
     const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
@@ -115,8 +116,14 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
       for (int i = 0; i < n; ++i) {
         for (int j = 0; j < remain; j++) {
           int idx = i * remain + j;
-          logit_grad_data[i * d + label_data[idx] * remain + j] -=
-              out_grad_data[idx];
+          if (label_data[idx] == ignore_index) {
+            for (int k = 0; k < axis_dim; ++k) {
+              logit_grad_data[i * d + k * remain + j] = 0;
+            }
+          } else {
+            logit_grad_data[i * d + label_data[idx] * remain + j] -=
+                out_grad_data[idx];
+          }
         }
       }
     }
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
index df2a0a523ad1ef05b462c8f8044c83e76d91f8a3..0ee58d5be15e60f50b0d6f4d0fc7c55075b81aea 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
@@ -83,9 +83,9 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         self.attrs = {
             "numeric_stable_mode": self.numeric_stable_mode,
             "soft_label": self.soft_label,
+            "ignore_index": self.ignore_index,
         }
-        if self.ignore_index >= 0:
-            self.attrs['ignore_index'] = self.ignore_index
+
         if self.axis != -1:
             self.attrs['axis'] = self.axis
 
@@ -93,7 +93,7 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(["Logits"], "Loss", max_relative_error=0.05)
+        self.check_grad(["Logits"], "Loss", max_relative_error=5e-5)
 
 
 class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
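
For readers following the patch, below is a minimal NumPy sketch of the backward behavior the CUDA Scale kernel and the CPU loop now share: a sample whose label equals ignore_index gets an all-zero gradient row instead of softmax - one_hot. The helper name and the ignore_index value of -1 are illustrative, not part of Paddle's API.

# Reference sketch (assumed shapes: logits [n, class_num], hard labels [n]).
import numpy as np

def grad_with_ignore_index(logits, labels, loss_grad, ignore_index=-1):
    # Numerically stable softmax over the class axis.
    shifted = logits - logits.max(axis=1, keepdims=True)
    softmax = np.exp(shifted)
    softmax /= softmax.sum(axis=1, keepdims=True)

    # dLoss/dLogits = softmax - one_hot(label), scaled by the upstream grad.
    grad = softmax
    valid = labels != ignore_index
    rows = np.flatnonzero(valid)
    grad[rows, labels[rows]] -= 1.0
    grad *= loss_grad[:, np.newaxis]

    # Ignored samples contribute a zero gradient, as in the patch.
    grad[~valid] = 0.0
    return grad

logits = np.random.randn(4, 5)
labels = np.array([2, -1, 0, 3])  # the second sample is ignored
print(grad_with_ignore_index(logits, labels, np.ones(4))[1])  # all zeros

The tightened tolerance in test_check_grad (0.05 down to 5e-5) is consistent with this: once ignored samples are zeroed analytically rather than left with stale values, the kernel's gradient matches the numeric reference closely enough for a much stricter check.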