diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op_xpu.cc
index 908018d7550802590a3a02e8ddab361310a2fd2f..e8944040751fc802ca630ea6c0124e4258cc3d30 100644
--- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op_xpu.cc
+++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op_xpu.cc
@@ -33,12 +33,12 @@ template <typename DeviceContext, typename T>
 class CSoftmaxWithCrossEntropyOp : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    const int ignore_index = ctx.Attr<int>("ignore_index");
+    const int64_t ignore_index = ctx.Attr<int64_t>("ignore_index");
     PADDLE_ENFORCE_LT(ignore_index,
                       0,
                       platform::errors::InvalidArgument(
                           "When SoftmaxWithCrossEntropy run on XPU, "
-                          "ignore_index should be <=0, however it's %d",
+                          "ignore_index should be <0, however it's %ld",
                           ignore_index));
     const int rid = ctx.Attr<int>("ring_id");
     auto map = distributed::ProcessGroupMapFromGid::getInstance();
@@ -460,12 +460,12 @@ class CSoftmaxWithCrossEntropyGrad : public framework::OpKernel<T> {
         context.Output<phi::DenseTensor>(framework::GradVarName("Logits"));
     const phi::DenseTensor* softmax =
         context.Input<phi::DenseTensor>("Softmax");
-    const int ignore_index = context.Attr<int>("ignore_index");
+    const int64_t ignore_index = context.Attr<int64_t>("ignore_index");
     PADDLE_ENFORCE_LT(ignore_index,
                       0,
                       platform::errors::InvalidArgument(
                           "When SoftmaxWithCrossEntropy run on XPU, "
-                          "ignore_index should be <=0, however it's %d",
+                          "ignore_index should be <0, however it's %ld",
                           ignore_index));
     const int rank = context.Attr<int>("rank");
     auto& dev_ctx = context.template device_context<DeviceContext>();
diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py
index f820acfa8f112054d7d6501f128d7ce5adda245b..55bec32bb1a5cfdfdafe27f5472194520488a089 100644
--- a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py
+++ b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py
@@ -529,7 +529,7 @@ class ParallelCrossEntropy(paddle.nn.Layer):
         mp_group(Group): The tensor parallel group.
         name(str, optional): Normally there is no need for user to set this parameter.
             For detailed information, please refer to :ref:`api_guide_Name` .
-        ignore_index (int, optional):  Specifies a target value that is ignored and
+        ignore_index (int64, optional): Specifies a target value that is ignored and
             does not contribute to the loss. A negative value means that no label value
             needs to be ignored. Default is -100 .