diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
index 1a4559d5cd6b58c274b1e7aee565e818d8d39ef2..9c40fdd3f27787f36e53b77767a33172d34bf34d 100644
--- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
+++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu
@@ -185,9 +185,10 @@ __global__ void CrossEntropyHardLabel(T* loss,
   // thread ids compute loss[ids] using softmax[idx]
   if (ids < n * d) {
     auto lbl = static_cast<int64_t>(labels[ids]);
-    if (lbl < 0) {  // label is negative
+    assert(lbl >= 0 && lbl < dim || lbl == ignore_idx);
+    if (lbl < 0 || lbl >= dim) {  // label is out of bound
       loss[ids] = static_cast<T>(0.0);
-    } else {  // label is positive of zero
+    } else {
       int64_t idx = idx_n * dim * d + lbl * d + idx_d;
       if (IgnoreIndex == true) {
         // IgnoreIndex is true
@@ -225,6 +226,7 @@ __global__ void CrossEntropyExpHardLabel(T* loss,
 
   if (idx < n * dim * d) {
     auto lbl = static_cast<int64_t>(labels[ids]);
+    assert(lbl >= 0 && lbl < dim || lbl == ignore_idx);
     if (IgnoreIndex == true) {
       // IgnoreIndex is true
       if (idx_dim == lbl) {
@@ -333,6 +335,7 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl(
   int tid = threadIdx.x;
   int label_id = blockIdx.x;
   auto label_value = static_cast<int64_t>(label[label_id]);
+  assert(label_value >= 0 && label_value < size || label_value == ignore_index);
   const bool label_valid = label_value >= 0 && label_value < size;
   int loss_id_offset = 0;
 
@@ -438,6 +441,7 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl(
   int remain = size % (VecSize * blockDim.x);
   int label_id = blockIdx.x;
   auto label_value = static_cast<int64_t>(label[label_id]);
+  assert(label_value >= 0 && label_value < size || label_value == ignore_index);
   const bool label_valid = label_value >= 0 && label_value < size;
 
   // main part
@@ -1029,6 +1033,7 @@ __global__ void WarpSoftmaxForward(T* loss,
       // label
       int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize;
       auto lbl = static_cast<int64_t>(label[first_batch + i]);
+      assert(lbl >= 0 && lbl < element_count || lbl == ignore_index);
       if (IgnoreIndex == true) {
         // IgnoreIndex is true
         if (lbl == loss_idx) {
@@ -1072,6 +1077,7 @@ __global__ void WarpSoftmaxForward(T* loss,
         // label
         int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize + s;
         auto lbl = static_cast<int64_t>(label[first_batch + i]);
+        assert(lbl >= 0 && lbl < element_count || lbl == ignore_index);
         if (IgnoreIndex == true) {
           // IgnoreIndex is true
           if (lbl == loss_idx && lbl != ignore_index) {
diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py
index 31022690da39bbbd5ba0015d4765c0f852175afa..bc12bba94cce4e31c4a0e79c33e9531cb8e88e51 100755
--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -2382,14 +2382,6 @@ def cross_entropy(input,
         if soft_label == False:
             valid_label = paddle.cast(
                 label != ignore_index, dtype=label.dtype) * label
-            label_min = paddle.min(valid_label)
-            label_max = paddle.max(valid_label)
-            if label_min < 0:
-                raise ValueError("Target {} is out of lower bound.".format(
-                    label_min.item()))
-            if label_max >= input.shape[axis]:
-                raise ValueError("Target {} is out of upper bound.".format(
-                    label_max.item()))
         if core.is_compiled_with_npu() or core.is_compiled_with_mlu():
             if soft_label == False:
                 _, _, out = _legacy_C_ops.softmax_with_cross_entropy(
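
For reference, the kernel-side pattern the patch converges on can be exercised in isolation. The sketch below is illustrative only, not Paddle code: CheckLabelsSketch and its parameter names are made up, and it assumes a debug build (NDEBUG not defined) so the device-side assert is active. It shows the same two-layer guard the patch adds to each kernel: an assert that fails fast on a genuinely invalid label, plus an explicit bounds check so that even release builds never index the softmax buffer out of range.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Illustrative kernel (not a Paddle identifier): one thread per sample,
// mirroring the patch's guard. The assert aborts the launch on a label that
// is neither in [0, dim) nor equal to ignore_index; the explicit branch keeps
// even NDEBUG builds from reading out of bounds.
__global__ void CheckLabelsSketch(const int64_t* labels,
                                  float* loss,
                                  int64_t n,
                                  int64_t dim,
                                  int64_t ignore_index) {
  int64_t i = blockIdx.x * static_cast<int64_t>(blockDim.x) + threadIdx.x;
  if (i < n) {
    int64_t lbl = labels[i];
    // Same predicate the patch adds (parenthesized here for clarity).
    assert((lbl >= 0 && lbl < dim) || lbl == ignore_index);
    if (lbl < 0 || lbl >= dim) {
      loss[i] = 0.0f;  // ignored or out-of-bound label contributes no loss
    }
    // ...a real kernel would otherwise read softmax[i * dim + lbl] here.
  }
}

int main() {
  const int64_t n = 4, dim = 10, ignore_index = -100;
  int64_t h_labels[4] = {3, 9, ignore_index, 0};  // all pass the assert
  int64_t* d_labels;
  float* d_loss;
  cudaMalloc(&d_labels, n * sizeof(int64_t));
  cudaMalloc(&d_loss, n * sizeof(float));
  cudaMemcpy(d_labels, h_labels, n * sizeof(int64_t), cudaMemcpyHostToDevice);
  CheckLabelsSketch<<<1, 32>>>(d_labels, d_loss, n, dim, ignore_index);
  // An invalid label (e.g. 12 with dim == 10) would surface here as
  // cudaErrorAssert instead of a silent out-of-bound read.
  printf("sync: %s\n", cudaGetErrorString(cudaDeviceSynchronize()));
  cudaFree(d_labels);
  cudaFree(d_loss);
  return 0;
}

This also motivates the Python-side removal above: paddle.min/paddle.max followed by .item() synchronized host and device on every cross_entropy call, whereas the in-kernel assert compiles away under NDEBUG and the remaining branch is cheap on the device.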