fix weighted CE loss's bug

12bcd023 · HydrogenSulfate · chajchaj · 1506d266 · 12bcd023 · 12bcd023
Showing with 377 additions and 76 deletions

python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +252 -12

python/paddle/nn/functional/loss.py +125 -64

File not found.
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py
@@ -50,7 +50,7 @@ def cross_entropy_loss_1d(input,
        total_weight += cur_weight
        out[i] = -log_softmax_out[i][cur_target] * cur_weight

-    ###2. deal with reduction 
+    ###2. deal with reduction
    if reduction == 'sum':
        return np.sum(out), np.array([total_weight]).astype('float64')
    elif reduction == 'mean':
@@ -434,7 +434,7 @@ class CrossEntropyLoss(unittest.TestCase):

        paddle.set_device("cpu")

-        #2 dygraph 
+        #2 dygraph
        paddle.disable_static()
        paddle_loss_mean = paddle.nn.functional.cross_entropy(
            fluid.dygraph.to_variable(self.logits),
@@ -841,6 +841,55 @@ class CrossEntropyLoss(unittest.TestCase):
        self.assertTrue(np.allclose(static_ret, expected))
        self.assertTrue(np.allclose(dy_ret_value, expected))

+    def test_cross_entropy_loss_1d_with_weight_mean_ignore_exceedlabel(self):
+        N = 100
+        C = 200
+        input_np = np.random.random([N, C]).astype(self.dtype)
+        label_np = np.random.randint(0, C, size=(N)).astype(np.int64)
+        label_np[0] = 255
+        weight_np = np.random.random([C]).astype(self.dtype)
+        paddle.enable_static()
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(name='input', shape=[N, C], dtype=self.dtype)
+            label = fluid.data(name='label', shape=[N], dtype='int64')
+            weight = fluid.data(
+                name='weight', shape=[C],
+                dtype=self.dtype)  #weight for each class
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                weight=weight, ignore_index=255)
+            ret = cross_entropy_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+
+        with fluid.dygraph.guard():
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                weight=fluid.dygraph.to_variable(weight_np),
+                axis=1,
+                ignore_index=255)
+            dy_ret = cross_entropy_loss(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_1d(
+            input_np, label_np, weight=weight_np, ignore_index=255)[0]
+
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
    def test_cross_entropy_loss_1d_with_weight_mean(self):
        input_np = np.random.random([2, 4]).astype(self.dtype)
        label_np = np.random.randint(0, 4, size=(2)).astype(np.int64)
@@ -1013,7 +1062,7 @@ class CrossEntropyLoss(unittest.TestCase):
    def test_cross_entropy_loss_1d_mean(self):
        input_np = np.random.random([100, 200]).astype(self.dtype)  #N,C
        label_np = np.random.randint(0, 100, size=(100)).astype(np.int64)  #N,1
-        weight_np = np.random.random([200]).astype(self.dtype)  #C
+        # weight_np = np.random.random([200]).astype(self.dtype)  #C
        paddle.enable_static()
        prog = fluid.Program()
        startup_prog = fluid.Program()
@@ -1022,7 +1071,7 @@ class CrossEntropyLoss(unittest.TestCase):
        with fluid.program_guard(prog, startup_prog):
            input = fluid.data(name='input', shape=[100, 200], dtype=self.dtype)
            label = fluid.data(name='label', shape=[100], dtype='int64')
-            weight = fluid.data(name='weight', shape=[100], dtype=self.dtype)
+            # weight = fluid.data(name='weight', shape=[100], dtype=self.dtype)
            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss()
            ret = cross_entropy_loss(input, label)
            exe = fluid.Executor(place)
@@ -1156,6 +1205,58 @@ class CrossEntropyLoss(unittest.TestCase):
        self.assertTrue(np.allclose(static_ret, expected))
        self.assertTrue(np.allclose(dy_ret_value, expected))

+    def test_cross_entropy_loss_2d_with_weight_mean_ignore_exceedlabel(self):
+        N = 4
+        C = 3
+        H = 512
+        W = 512
+        input_np = np.random.random([N, H, W, C]).astype(self.dtype)
+        label_np = np.random.randint(0, C, size=(N, H, W)).astype(np.int64)
+        label_np[0, 0, 0] = 255
+        weight_np = np.random.random([C]).astype(self.dtype)
+        paddle.enable_static()
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.program_guard(prog, startup_prog):
+            input = fluid.data(
+                name='input', shape=[N, H, W, C], dtype=self.dtype)
+            label = fluid.data(name='label', shape=[N, H, W], dtype='int64')
+            weight = fluid.data(
+                name='weight', shape=[C],
+                dtype=self.dtype)  #weight for each class
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                weight=weight, ignore_index=255)
+            ret = cross_entropy_loss(input, label)
+
+            exe = fluid.Executor(place)
+            static_ret = exe.run(prog,
+                                 feed={
+                                     'input': input_np,
+                                     'label': label_np,
+                                     "weight": weight_np
+                                 },
+                                 fetch_list=[ret])
+            self.assertIsNotNone(static_ret)
+
+        with fluid.dygraph.guard():
+            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                weight=fluid.dygraph.to_variable(weight_np),
+                axis=1,
+                ignore_index=255)
+            dy_ret = cross_entropy_loss(
+                fluid.dygraph.to_variable(input_np),
+                fluid.dygraph.to_variable(label_np))
+            dy_ret_value = dy_ret.numpy()
+            self.assertIsNotNone(dy_ret_value)
+        expected = cross_entropy_loss_2d(
+            input_np, label_np, weight=weight_np, ignore_index=255)[0]
+
+        self.assertTrue(np.allclose(static_ret, dy_ret_value))
+        self.assertTrue(np.allclose(static_ret, expected))
+        self.assertTrue(np.allclose(dy_ret_value, expected))
+
    def test_cross_entropy_loss_2d_with_weight_mean(self):
        input_np = np.random.random(size=(2, 2, 2, 3)).astype(self.dtype)  #NHWC
        label_np = np.random.randint(
@@ -1362,21 +1463,62 @@ class TestCrossEntropyFAPIError(unittest.TestCase):
    def test_errors(self):
        with program_guard(Program(), Program()):

-            def test_LabelValue():
+            # def test_LabelValue():
+            #     input_data = paddle.rand(shape=[20, 100])
+            #     label_data = paddle.randint(
+            #         0, 100, shape=[20, 1], dtype="int64")
+            #     label_data[0] = 255
+            #     weight_data = paddle.rand([100])
+            #     paddle.nn.functional.cross_entropy(
+            #         input=input_data,
+            #         label=label_data,
+            #         weight=weight_data,
+            #         ignore_index=255)
+
+            # self.assertRaises(ValueError, test_LabelValue)
+
+            # def test_LabelValueNeg():
+            #     input_data = paddle.rand(shape=[20, 100])
+            #     label_data = paddle.randint(
+            #         0, 100, shape=[20, 1], dtype="int64")
+            #     label_data[0] = -1
+            #     weight_data = paddle.rand([100])
+            #     paddle.nn.functional.cross_entropy(
+            #         input=input_data,
+            #         label=label_data,
+            #         weight=weight_data,
+            #         ignore_index=-1)
+
+            # self.assertRaises(ValueError, test_LabelValueNeg)
+
+            def test_WeightLength_NotEqual():
                input_data = paddle.rand(shape=[20, 100])
                label_data = paddle.randint(
                    0, 100, shape=[20, 1], dtype="int64")
-                label_data[0] = 255
+                weight_data = paddle.rand([100 + 1])
+                paddle.nn.functional.cross_entropy(
+                    input=input_data,
+                    label=label_data,
+                    weight=weight_data,
+                    ignore_index=-100)
+
+            self.assertRaises(ValueError, test_WeightLength_NotEqual)
+
+            def test_LabelValue_ExceedMax():
+                input_data = paddle.rand(shape=[20, 100])
+                label_data = paddle.randint(
+                    0, 100, shape=[20, 1], dtype="int64")
+                label_data[0] = 100
                weight_data = paddle.rand([100])
                paddle.nn.functional.cross_entropy(
                    input=input_data,
                    label=label_data,
                    weight=weight_data,
-                    ignore_index=255)
+                    ignore_index=-100)

-            self.assertRaises(ValueError, test_LabelValue)
+            self.assertRaises(ValueError, test_LabelValue_ExceedMax)

-            def test_LabelValueNeg():
+            def test_LabelValue_ExceedMin():
                input_data = paddle.rand(shape=[20, 100])
                label_data = paddle.randint(
                    0, 100, shape=[20, 1], dtype="int64")
@@ -1386,9 +1528,107 @@ class TestCrossEntropyFAPIError(unittest.TestCase):
                    input=input_data,
                    label=label_data,
                    weight=weight_data,
-                    ignore_index=-1)
-
-            self.assertRaises(ValueError, test_LabelValueNeg)
+                    ignore_index=-100)
+
+            self.assertRaises(ValueError, test_LabelValue_ExceedMin)
+
+            def static_test_WeightLength_NotEqual():
+                input_np = np.random.random([2, 4]).astype(self.dtype)
+                label_np = np.random.randint(0, 4, size=(2)).astype(np.int64)
+                weight_np = np.random.random([3]).astype(self.dtype)  #shape:C
+                paddle.enable_static()
+                prog = fluid.Program()
+                startup_prog = fluid.Program()
+                place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+                ) else fluid.CPUPlace()
+                with fluid.program_guard(prog, startup_prog):
+                    input = fluid.data(
+                        name='input', shape=[2, 4], dtype=self.dtype)
+                    label = fluid.data(name='label', shape=[2], dtype='int64')
+                    weight = fluid.data(
+                        name='weight', shape=[3],
+                        dtype=self.dtype)  #weight for each class
+                    cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                        weight=weight)
+                    ret = cross_entropy_loss(input, label)
+
+                    exe = fluid.Executor(place)
+                    static_ret = exe.run(prog,
+                                         feed={
+                                             'input': input_np,
+                                             'label': label_np,
+                                             "weight": weight_np
+                                         },
+                                         fetch_list=[ret])
+                    self.assertIsNotNone(static_ret)
+
+            self.assertRaises(ValueError, static_test_WeightLength_NotEqual)
+
+            def static_test_LabelValue_ExceedMax():
+                input_np = np.random.random([2, 4]).astype(self.dtype)
+                label_np = np.random.randint(0, 4, size=(2)).astype(np.int64)
+                label_np[0] = 255
+                weight_np = np.random.random([4]).astype(self.dtype)  #shape:C
+                paddle.enable_static()
+                prog = fluid.Program()
+                startup_prog = fluid.Program()
+                place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+                ) else fluid.CPUPlace()
+                with fluid.program_guard(prog, startup_prog):
+                    input = fluid.data(
+                        name='input', shape=[2, 4], dtype=self.dtype)
+                    label = fluid.data(name='label', shape=[2], dtype='int64')
+                    weight = fluid.data(
+                        name='weight', shape=[4],
+                        dtype=self.dtype)  #weight for each class
+                    cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                        weight=weight)
+                    ret = cross_entropy_loss(input, label)
+
+                    exe = fluid.Executor(place)
+                    static_ret = exe.run(prog,
+                                         feed={
+                                             'input': input_np,
+                                             'label': label_np,
+                                             "weight": weight_np
+                                         },
+                                         fetch_list=[ret])
+                    self.assertIsNotNone(static_ret)
+
+            self.assertRaises(ValueError, static_test_LabelValue_ExceedMax)
+
+            def static_test_LabelValue_ExceedMin():
+                input_np = np.random.random([2, 4]).astype(self.dtype)
+                label_np = np.random.randint(0, 4, size=(2)).astype(np.int64)
+                label_np[0] = -1
+                weight_np = np.random.random([4]).astype(self.dtype)  #shape:C
+                paddle.enable_static()
+                prog = fluid.Program()
+                startup_prog = fluid.Program()
+                place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda(
+                ) else fluid.CPUPlace()
+                with fluid.program_guard(prog, startup_prog):
+                    input = fluid.data(
+                        name='input', shape=[2, 4], dtype=self.dtype)
+                    label = fluid.data(name='label', shape=[2], dtype='int64')
+                    weight = fluid.data(
+                        name='weight', shape=[4],
+                        dtype=self.dtype)  #weight for each class
+                    cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
+                        weight=weight)
+                    ret = cross_entropy_loss(input, label)
+
+                    exe = fluid.Executor(place)
+                    static_ret = exe.run(prog,
+                                         feed={
+                                             'input': input_np,
+                                             'label': label_np,
+                                             "weight": weight_np
+                                         },
+                                         fetch_list=[ret])
+                    self.assertIsNotNone(static_ret)
+
+            self.assertRaises(ValueError, static_test_LabelValue_ExceedMin)


 if __name__ == "__main__":

--- a/python/paddle/nn/functional/loss.py
+++ b/python/paddle/nn/functional/loss.py
@@ -1389,18 +1389,18 @@ def cross_entropy(input,
                  use_softmax=True,
                  name=None):
    r"""
-    By default, this operator implements the cross entropy loss function with softmax. This function 
-    combines the calculation of the softmax operation and the cross entropy loss function 
-    to provide a more numerically stable computing. 
+    By default, this operator implements the cross entropy loss function with softmax. This function
+    combines the calculation of the softmax operation and the cross entropy loss function
+    to provide a more numerically stable computing.

    This operator will calculate the cross entropy loss function without softmax when use_softmax=False.

-    By default, this operator will calculate the mean of the result, and you can also affect 
-    the default behavior by using the reduction parameter. Please refer to the part of 
+    By default, this operator will calculate the mean of the result, and you can also affect
+    the default behavior by using the reduction parameter. Please refer to the part of
    parameters for details.

    This operator can be used to calculate the softmax cross entropy loss with soft and hard labels.
-    Where, the hard labels mean the actual label value, 0, 1, 2, etc.  And the soft labels 
+    Where, the hard labels mean the actual label value, 0, 1, 2, etc.  And the soft labels
    mean the probability of the actual label, 0.6, 0.8, 0.2, etc.

    The calculation of this operator includes the following two steps.
@@ -1455,7 +1455,7 @@ def cross_entropy(input,
            1.1. Hard labels (soft_label = False)

            .. math::
-                \\loss_j=loss_j*weight[label_j] 
+                \\loss_j=loss_j*weight[label_j]


            1.2. Soft labels (soft_label = True)
@@ -1465,21 +1465,21 @@ def cross_entropy(input,

        2. reduction

-            2.1 if the ``reduction`` parameter is ``none`` 
+            2.1 if the ``reduction`` parameter is ``none``

                Return the previous result directly

-            2.2 if the ``reduction`` parameter is ``sum`` 
+            2.2 if the ``reduction`` parameter is ``sum``

                Return the sum of the previous results

            .. math::
               \\loss=\sum_{j}loss_j

-            2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to 
-            the ``weight`` parameter as follows. 
+            2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to
+            the ``weight`` parameter as follows.

-            2.3.1. If the  ``weight``  parameter is ``None`` 
+            2.3.1. If the  ``weight``  parameter is ``None``

                   Return the average value of the previous results

@@ -1493,48 +1493,48 @@ def cross_entropy(input,
            1. Hard labels (soft_label = False)

             .. math::
-                \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] 
+                \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j]

            2. Soft labels (soft_label = True)

             .. math::
                \\loss=\sum_{j}loss_j/\sum_{j}\left(\sum_{i}weight[label_i]\right)
- 
- 
+
+
    Parameters:

        - **input** (Tensor)

            Input tensor, the data type is float32, float64. Shape is
-	    :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes ,  ``k >= 1`` . 
+	    :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes ,  ``k >= 1`` .

-            Note: 
+            Note:

-                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the 
+                1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the
                output of softmax operator, which will produce incorrect results.

                2. when use_softmax=False, it expects the output of softmax operator.
- 
+
        - **label** (Tensor)

            1. If soft_label=False, the shape is
            :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1.
            the data type is int32, int64, float32, float64, where each value is [0, C-1].

-            2. If soft_label=True, the shape and data type should be same with ``input`` , 
+            2. If soft_label=True, the shape and data type should be same with ``input`` ,
            and the sum of the labels for each sample should be 1.

        - **weight** (Tensor, optional)

-            a manual rescaling weight given to each class. 
-            If given, has to be a Tensor of size C and the data type is float32, float64. 
+            a manual rescaling weight given to each class.
+            If given, has to be a Tensor of size C and the data type is float32, float64.
            Default is ``'None'`` .

        - **ignore_index** (int64, optional)

            Specifies a target value that is ignored
-            and does not contribute to the loss. A negative value means that no label 
-            value needs to be ignored. Only valid when soft_label = False.  
+            and does not contribute to the loss. A negative value means that no label
+            value needs to be ignored. Only valid when soft_label = False.
            Default is ``-100`` .

        - **reduction** (str, optional)
@@ -1548,14 +1548,14 @@ def cross_entropy(input,

        - **soft_label** (bool, optional)

-            Indicate whether label is soft. 
+            Indicate whether label is soft.
            Default is ``False``.

        - **axis** (int, optional)

-            The index of dimension to perform softmax calculations. 
-            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the 
-            number of dimensions of input :attr:`input`. 
+            The index of dimension to perform softmax calculations.
+            It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the
+            number of dimensions of input :attr:`input`.
            Default is ``-1`` .

        - **use_softmax** (bool, optional)
@@ -1577,24 +1577,24 @@ def cross_entropy(input,

        If :attr:`reduction` is ``'none'``:

-        1. If soft_label = False, the dimension of return value is the same with ``label`` . 
+        1. If soft_label = False, the dimension of return value is the same with ``label`` .

-        2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . 
+        2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` .


     Example1(hard labels):

        .. code-block:: python
-            
+
            import paddle
            paddle.seed(99999)
            N=100
            C=200
            reduction='mean'
-            input =  paddle.rand([N, C], dtype='float64')  
+            input =  paddle.rand([N, C], dtype='float64')
            label =  paddle.randint(0, C, shape=[N], dtype='int64')
-            weight = paddle.rand([C], dtype='float64') 
-            
+            weight = paddle.rand([C], dtype='float64')
+
            cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(
                weight=weight, reduction=reduction)
            dy_ret = cross_entropy_loss(
@@ -1606,7 +1606,7 @@ def cross_entropy(input,
    Example2(soft labels):

        .. code-block:: python
-            
+
            import paddle
            paddle.seed(99999)
            axis = -1
@@ -1620,9 +1620,9 @@ def cross_entropy(input,
            labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0)
            labels /= paddle.sum(labels, axis=axis, keepdim=True)
            paddle_loss_mean = paddle.nn.functional.cross_entropy(
-                                                                  logits,  
-                                                                  labels, 
-                                                                  soft_label=True, 
+                                                                  logits,
+                                                                  labels,
+                                                                  soft_label=True,
                                                                  axis=axis,
                                                                  weight=weight,
                                                                  reduction=reduction)
@@ -1657,7 +1657,7 @@ def cross_entropy(input,

        if weight is not None:

-            #trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases.
+            # trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases.
            if soft_label == True:
                # chajchaj:
                # weight's shape is C, where C is class num.
@@ -1675,14 +1675,43 @@ def cross_entropy(input,
                out = _C_ops.elementwise_mul(out, weight_gather_reshape)

            else:
-                label_min = paddle.min(label)
-                label_max = paddle.max(label)
-                if label_min < 0 or label_max >= input.shape[-1]:
+                if input.shape[-1] != weight.shape[-1]:
                    raise ValueError(
-                        'Expected 0 <= label_value < class_dimension({}), but got {} <= label_value <= {} '.
-                        format(input.shape[-1],
-                               label_min.numpy(), label_max.numpy()))
-                weight_gather = _C_ops.gather_nd(weight, label)
+                        "input's class_dimension({}) must equal to \
+                        weight's class_dimension({}) \
+                            when weight is provided"
+                        .format(input.shape[-1], weight.shape[-1]))
+                valid_label = paddle.where(
+                    label == ignore_index,
+                    paddle.to_tensor(
+                        0, dtype=label.dtype),
+                    label)
+
+                if (len(paddle.nonzero(valid_label < 0)) > 0) or (
+                        len(paddle.nonzero(valid_label >= input.shape[-1])) > 0
+                ):
+                    invalid_label = paddle.gather_nd(
+                        input, paddle.nonzero(valid_label < 0))
+                    if invalid_label.numel() > 0:
+                        raise ValueError(
+                            "Target({}) is out of class_dimension's lower bound({})".
+                            format(invalid_label[0], 0))
+                    invalid_label = paddle.gather_nd(
+                        input, paddle.nonzero(valid_label >= input.shape[-1]))
+                    if invalid_label.numel() > 0:
+                        raise ValueError(
+                            "Target({}) is out of class_dimension's upper bound({})".
+                            format(invalid_label[0], input.shape[-1]))
+
+                ignore_weight_mask = paddle.cast((label != ignore_index),
+                                                 out.dtype)
+                if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[
+                        -1] == 1:
+                    ignore_weight_mask.squeeze_(-1)
+                weight_gather = _C_ops.gather_nd(
+                    weight, valid_label)  # ignored positions temporarily take label 0's weight; they are zeroed by ignore_weight_mask below
+                weight_gather = _C_ops.elementwise_mul(weight_gather,
+                                                       ignore_weight_mask)
                input_shape = list(label.shape)
                weight_gather_reshape = reshape(
                    weight_gather, shape=input_shape)
@@ -1690,22 +1719,22 @@ def cross_entropy(input,
                out = _C_ops.elementwise_mul(out, weight_gather_reshape)

        if reduction == "sum":
-            #   because of fluid_softmax_with_cross_entropy op's inner logic, 
+            #   because of fluid_softmax_with_cross_entropy op's inner logic,
            #   in the out tensor of this op, the loss of sample with class_index==ignore_index is 0
            #   so, reduce_sum all directly is ok
            return _C_ops.reduce_sum(out, 'reduce_all', True)
        elif reduction == "mean":
-            #1. if weight==none, 
-            #    numerator: reduce_sum all loss directly is ok causeof fluid_softmax_with_cross_entropy's inner logic
-            #    denominator: count sample num with class_index!=ignore_index
-            #2. else
-            #    numerator: loss's weighted sum 
-            #    denominator: cal the sum of weight where the sample's class_index!=ignore_index
+            # 1. if weight==none,
+            #     numerator: reduce_sum all loss directly is ok because of fluid_softmax_with_cross_entropy's inner logic
+            #     denominator: count sample num with class_index!=ignore_index
+            # 2. else
+            #     numerator: loss's weighted sum
+            #     denominator: cal the sum of weight where the sample's class_index!=ignore_index
            if ignore_index != -100:
                out_sum = _C_ops.reduce_sum(out, 'reduce_all', True)
-                #for each label[i],set 1 or 0, according to ignore_index
-                #mask[i]=0, if label[i]==ignore_index
-                #mask[i]=1, otherwise 
+                # for each label[i],set 1 or 0, according to ignore_index
+                # mask[i]=0, if label[i]==ignore_index
+                # mask[i]=1, otherwise
                mask = (label != ignore_index)
                if weight is None:
                    mask = paddle.cast(mask, dtype=out_sum.dtype)
@@ -1761,7 +1790,7 @@ def cross_entropy(input,
        weight_name = name if reduction == 'none' else None
        if soft_label == True:
            # chajchaj:
-            #trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases.
+            # trans weight from class to sample, shape:N or [N,H,W] for 1d and 2d cases.
            # weight's shape is C, where C is class num.
            # for 1d case: label's shape is [N,C], weight_gather's shape is N.
            # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W].
@@ -1775,8 +1804,40 @@ def cross_entropy(input,
            weight_gather_reshape = reshape(weight_gather, shape=out_shape)
            out = paddle.cast(out, weight_gather_reshape.dtype)
        else:
+            if input.shape[-1] != weight.shape[-1]:
+                raise ValueError("input's class_dimension({}) must equal to \
+                        weight's class_dimension({}) \
+                            when weight is provided"
+                                 .format(input.shape[-1], weight.shape[-1]))
+            valid_label = paddle.where(
+                label == ignore_index,
+                paddle.to_tensor(
+                    0, dtype=label.dtype),
+                label)
+            if (len(paddle.nonzero(valid_label < 0)) > 0) or (
+                    len(paddle.nonzero(valid_label >= input.shape[-1])) > 0):
+                invalid_label = paddle.gather_nd(
+                    input, paddle.nonzero(valid_label < 0))
+                if invalid_label.numel() > 0:
+                    raise ValueError(
+                        "Target({}) is out of class_dimension's lower bound({})".
+                        format(invalid_label[0], 0))
+                invalid_label = paddle.gather_nd(
+                    input, paddle.nonzero(valid_label >= input.shape[-1]))
+                if invalid_label.numel() > 0:
+                    raise ValueError(
+                        "Target({}) is out of class_dimension's upper bound({})".
+                        format(invalid_label[0], input.shape[-1]))
+
+            ignore_weight_mask = paddle.cast((label != ignore_index), out.dtype)
+            if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[
+                    -1] == 1:
+                ignore_weight_mask = paddle.squeeze(ignore_weight_mask, -1)
+
            weight_gather = paddle.gather_nd(
-                weight, label)  #trans weight from class to sample, shape:N
+                weight,
+                valid_label)  #trans weight from class to sample, shape:N
+            weight_gather = paddle.multiply(weight_gather, ignore_weight_mask)
            input_shape = list(label.shape)
            weight_gather_reshape = reshape(weight_gather, shape=input_shape)
        out = paddle.multiply(out, weight_gather_reshape, name=weight_name)
@@ -1786,9 +1847,9 @@ def cross_entropy(input,
    elif reduction == "mean":
        if ignore_index != -100:
            out_sum = paddle.sum(out, name=name)
-            #for each label[i],set 1 or 0, according to ignore_index
-            #mask[i]=0, if label[i]==ignore_index
-            #mask[i]=1, otherwise 
+            # for each label[i],set 1 or 0, according to ignore_index
+            # mask[i]=0, if label[i]==ignore_index
+            # mask[i]=1, otherwise
            mask = (label != ignore_index)
            if (weight is None):
                mask = paddle.cast(mask, dtype=out_sum.dtype)
@@ -1828,12 +1889,12 @@ def sigmoid_focal_loss(logit,
    it is used in one-stage object detection where the foreground-background class
    imbalance is extremely high.

-    This operator measures focal loss function as follows: 
+    This operator measures focal loss function as follows:

    .. math::
           Out = -Labels * alpha * {(1 - \sigma(Logit))}^{gamma}\log(\sigma(Logit)) - (1 - Labels) * (1 - alpha) * {\sigma(Logit)}^{gamma}\log(1 - \sigma(Logit))

-    We know that :math:`\sigma(Logit) = \frac{1}{1 + \exp(-Logit)}`. 
+    We know that :math:`\sigma(Logit) = \frac{1}{1 + \exp(-Logit)}`.

    Then, if :attr:`normalizer` is not None, this operator divides the
    normalizer tensor on the loss `Out`:
@@ -1860,7 +1921,7 @@ def sigmoid_focal_loss(logit,
            For object detection task, it is the the number of positive samples.
            If set to None, the focal loss will not be normalized. Default is None.
        alpha(int|float, optional): Hyper-parameter to balance the positive and negative example,
-            it should be between 0 and 1.  Default value is set to 0.25. 
+            it should be between 0 and 1.  Default value is set to 0.25.
        gamma(int|float, optional): Hyper-parameter to modulate the easy and hard examples.
            Default value is set to 2.0.
        reduction (str, optional): Indicate how to average the loss by batch_size,