From b52427327d9530b128d38caf152faa705471dfcc Mon Sep 17 00:00:00 2001 From: chajchaj <57249073+chajchaj@users.noreply.github.com> Date: Tue, 24 Nov 2020 19:58:05 +0800 Subject: [PATCH] add soft_label and axis for CrossEntropyLoss and improve performance (#29024) * add soft_label and axis for CrossEntropyLoss and improve performance,test=develop * fix conflict in nn/functional/loss.py, test=develop --- .../unittests/test_cross_entropy_loss.py | 580 +++--------------- python/paddle/nn/functional/__init__.py | 2 + python/paddle/nn/functional/loss.py | 214 ++++--- python/paddle/nn/layer/loss.py | 151 ++--- 4 files changed, 299 insertions(+), 648 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index c619059010..cd44d584bb 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -26,7 +26,7 @@ def stable_softmax(x): return exps / np.sum(exps) -def log_softmax(x, axis=1): +def log_softmax(x, axis=-1): softmax_out = np.apply_along_axis(stable_softmax, axis, x) return np.log(softmax_out) @@ -67,8 +67,9 @@ def cross_entropy_loss_2d(input, log_softmax_out = log_softmax(input) input_shape = log_softmax_out.shape N = input_shape[0] - H = input_shape[2] - W = input_shape[3] + H = input_shape[1] + W = input_shape[2] + out = np.zeros_like(label).astype(np.float64) total_weight = 0 for i in range(N): @@ -80,8 +81,8 @@ def cross_entropy_loss_2d(input, continue cur_weight = weight[cur_target] if weight is not None else 1 total_weight += cur_weight - out[i][h][w] = -log_softmax_out[i][cur_target][h][ - w] * cur_weight + out[i][h][w] = -log_softmax_out[i][h][w][ + cur_target] * cur_weight if reduction == 'sum': return np.sum(out), np.array([total_weight]).astype('float64') elif reduction == 'mean': @@ -93,17 +94,20 @@ def cross_entropy_loss_2d(input, class CrossEntropyLoss(unittest.TestCase): def test_cross_entropy_loss_1d_with_weight_mean(self): - input_np = np.random.random([100, 200]).astype(np.float64) - label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) - weight_np = np.random.random([200]).astype(np.float64) + input_np = np.random.random([2, 4]).astype(np.float64) + label_np = np.random.randint(0, 4, size=(2)).astype(np.int64) + weight_np = np.random.random([4]).astype(np.float64) #shape:C + paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') - label = fluid.data(name='label', shape=[100], dtype='int64') - weight = fluid.data(name='weight', shape=[200], dtype='float64') + input = fluid.data(name='input', shape=[2, 4], dtype='float64') + label = fluid.data(name='label', shape=[2], dtype='int64') + weight = fluid.data( + name='weight', shape=[4], + dtype='float64') #weight for each class cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss(weight=weight) ret = cross_entropy_loss(input, label) @@ -116,9 +120,12 @@ class CrossEntropyLoss(unittest.TestCase): }, fetch_list=[ret]) self.assertIsNotNone(static_ret) + expected = cross_entropy_loss_1d( + input_np, label_np, weight=weight_np)[0] + with fluid.dygraph.guard(): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( - weight=fluid.dygraph.to_variable(weight_np)) + weight=fluid.dygraph.to_variable(weight_np), axis=1) 
dy_ret = cross_entropy_loss( fluid.dygraph.to_variable(input_np), fluid.dygraph.to_variable(label_np)) @@ -131,9 +138,10 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_with_weight_sum(self): - input_np = np.random.random([100, 200]).astype(np.float64) - label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) - weight_np = np.random.random([200]).astype(np.float64) + input_np = np.random.random([100, 200]).astype(np.float64) #N,C + label_np = np.random.randint(0, 100, size=(100)).astype(np.int64) #N,1 + weight_np = np.random.random([200]).astype(np.float64) #C + paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( @@ -170,9 +178,10 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_with_weight_none(self): - input_np = np.random.random([100, 200]).astype(np.float64) - label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) - weight_np = np.random.random([200]).astype(np.float64) + input_np = np.random.random([100, 200]).astype(np.float64) #N,C + label_np = np.random.randint(0, 100, size=(100)).astype(np.int64) #N,1 + weight_np = np.random.random([200]).astype(np.float64) #C + paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( @@ -193,6 +202,7 @@ class CrossEntropyLoss(unittest.TestCase): "weight": weight_np }, fetch_list=[ret]) + static_ret = np.squeeze(static_ret) self.assertIsNotNone(static_ret) with fluid.dygraph.guard(): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( @@ -201,6 +211,7 @@ class CrossEntropyLoss(unittest.TestCase): fluid.dygraph.to_variable(input_np), fluid.dygraph.to_variable(label_np)) dy_ret_value = dy_ret.numpy() + dy_ret_value = np.squeeze(dy_ret_value) self.assertIsNotNone(dy_ret_value) expected = cross_entropy_loss_1d( input_np, label_np, weight=weight_np, reduction='none') @@ -209,8 +220,10 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_mean(self): - input_np = np.random.random([100, 200]).astype(np.float64) - label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) + input_np = np.random.random([100, 200]).astype(np.float64) #N,C + label_np = np.random.randint(0, 100, size=(100)).astype(np.int64) #N,1 + weight_np = np.random.random([200]).astype(np.float64) #C + paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( @@ -218,9 +231,9 @@ class CrossEntropyLoss(unittest.TestCase): with fluid.program_guard(prog, startup_prog): input = fluid.data(name='input', shape=[100, 200], dtype='float64') label = fluid.data(name='label', shape=[100], dtype='int64') + weight = fluid.data(name='weight', shape=[100], dtype='float64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss() ret = cross_entropy_loss(input, label) - exe = fluid.Executor(place) static_ret = exe.run(prog, feed={'input': input_np, @@ -240,8 +253,9 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_sum(self): - input_np = np.random.random([100, 200]).astype(np.float64) - label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) + input_np = np.random.random([100, 
200]).astype(np.float64) #N,C + label_np = np.random.randint(0, 100, size=(100)).astype(np.int64) #N,1 + paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( @@ -252,7 +266,6 @@ class CrossEntropyLoss(unittest.TestCase): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( reduction='sum') ret = cross_entropy_loss(input, label) - exe = fluid.Executor(place) static_ret = exe.run(prog, feed={'input': input_np, @@ -273,8 +286,9 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_1d_none(self): - input_np = np.random.random([100, 200]).astype(np.float64) - label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) + input_np = np.random.random([100, 200]).astype(np.float64) #N,C + label_np = np.random.randint(0, 100, size=(100)).astype(np.int64) #N,1 + paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( @@ -285,12 +299,12 @@ class CrossEntropyLoss(unittest.TestCase): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( reduction='none') ret = cross_entropy_loss(input, label) - exe = fluid.Executor(place) static_ret = exe.run(prog, feed={'input': input_np, 'label': label_np}, fetch_list=[ret]) + static_ret = np.squeeze(static_ret) self.assertIsNotNone(static_ret) with fluid.dygraph.guard(): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( @@ -299,6 +313,7 @@ class CrossEntropyLoss(unittest.TestCase): fluid.dygraph.to_variable(input_np), fluid.dygraph.to_variable(label_np)) dy_ret_value = dy_ret.numpy() + dy_ret_value = np.squeeze(dy_ret_value) self.assertIsNotNone(dy_ret_value) expected = cross_entropy_loss_1d(input_np, label_np, reduction='none') self.assertTrue(np.allclose(static_ret, dy_ret_value)) @@ -306,17 +321,20 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_2d_with_weight_none(self): - input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) - label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) - weight_np = np.random.random(size=(3, )).astype(np.float64) + input_np = np.random.random(size=(2, 2, 2, 3)).astype(np.float64) #NHWC + label_np = np.random.randint( + 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW1 + weight_np = np.random.random(size=(3, )).astype(np.float64) #C + + paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[5, 3, 5, 5], dtype='float64') - label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') + name='input', shape=[2, 2, 2, 3], dtype='float64') + label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') weight = fluid.data(name='weight', shape=[3], dtype='float64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction='none') @@ -330,6 +348,7 @@ class CrossEntropyLoss(unittest.TestCase): "weight": weight_np }, fetch_list=[ret]) + static_ret = np.squeeze(static_ret) self.assertIsNotNone(static_ret) with fluid.dygraph.guard(): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( @@ -338,6 +357,7 @@ class CrossEntropyLoss(unittest.TestCase): fluid.dygraph.to_variable(input_np), fluid.dygraph.to_variable(label_np)) dy_ret_value = dy_ret.numpy() + dy_ret_value = 
np.squeeze(dy_ret_value) self.assertIsNotNone(dy_ret_value) expected = cross_entropy_loss_2d( input_np, label_np, weight=weight_np, reduction='none') @@ -346,17 +366,19 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_2d_with_weight_mean(self): - input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) - label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) - weight_np = np.random.random(size=(3, )).astype(np.float64) + input_np = np.random.random(size=(2, 2, 2, 3)).astype(np.float64) #NHWC + label_np = np.random.randint( + 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW + weight_np = np.random.random(size=(3, )).astype(np.float64) #C + paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[5, 3, 5, 5], dtype='float64') - label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') + name='input', shape=[2, 2, 2, 3], dtype='float64') + label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') weight = fluid.data(name='weight', shape=[3], dtype='float64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction='mean') @@ -386,17 +408,20 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_2d_with_weight_sum(self): - input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) - label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) - weight_np = np.random.random(size=(3, )).astype(np.float64) + input_np = np.random.random(size=(2, 2, 2, 3)).astype(np.float64) #NHWC + label_np = np.random.randint( + 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW + weight_np = np.random.random(size=(3, )).astype(np.float64) #C + paddle.enable_static() + prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[5, 3, 5, 5], dtype='float64') - label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') + name='input', shape=[2, 2, 2, 3], dtype='float64') + label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') weight = fluid.data(name='weight', shape=[3], dtype='float64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction='sum') @@ -426,20 +451,21 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_2d_none(self): - input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) - label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) + input_np = np.random.random(size=(2, 2, 2, 3)).astype(np.float64) #NHWC + label_np = np.random.randint( + 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW + paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[5, 3, 5, 5], dtype='float64') - label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') + name='input', shape=[2, 2, 2, 3], dtype='float64') + label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( 
reduction='none') ret = cross_entropy_loss(input, label) - exe = fluid.Executor(place) static_ret = exe.run(prog, feed={ @@ -447,6 +473,7 @@ class CrossEntropyLoss(unittest.TestCase): 'label': label_np, }, fetch_list=[ret]) + static_ret = np.squeeze(static_ret) self.assertIsNotNone(static_ret) with fluid.dygraph.guard(): cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( @@ -455,6 +482,7 @@ class CrossEntropyLoss(unittest.TestCase): fluid.dygraph.to_variable(input_np), fluid.dygraph.to_variable(label_np)) dy_ret_value = dy_ret.numpy() + dy_ret_value = np.squeeze(dy_ret_value) self.assertIsNotNone(dy_ret_value) expected = cross_entropy_loss_2d(input_np, label_np, reduction='none') self.assertTrue(np.allclose(static_ret, dy_ret_value)) @@ -462,16 +490,18 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_2d_mean(self): - input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) - label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) + input_np = np.random.random(size=(2, 2, 2, 3)).astype(np.float64) #NHWC + label_np = np.random.randint( + 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW + paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[5, 3, 5, 5], dtype='float64') - label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') + name='input', shape=[2, 2, 2, 3], dtype='float64') + label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( reduction='mean') ret = cross_entropy_loss(input, label) @@ -499,16 +529,18 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) def test_cross_entropy_loss_2d_sum(self): - input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) - label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) + input_np = np.random.random(size=(2, 2, 2, 3)).astype(np.float64) #NHWC + label_np = np.random.randint( + 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW + paddle.enable_static() prog = fluid.Program() startup_prog = fluid.Program() place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.program_guard(prog, startup_prog): input = fluid.data( - name='input', shape=[5, 3, 5, 5], dtype='float64') - label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') + name='input', shape=[2, 2, 2, 3], dtype='float64') + label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( reduction='sum') ret = cross_entropy_loss(input, label) @@ -535,443 +567,5 @@ class CrossEntropyLoss(unittest.TestCase): self.assertTrue(np.allclose(dy_ret_value, expected)) -class FuncCrossEntropyLoss(unittest.TestCase): - #1 - def test_cross_entropy_loss_1d_with_weight_mean(self): - input_np = np.random.random([100, 200]).astype(np.float64) - label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) - weight_np = np.random.random([200]).astype(np.float64) - prog = fluid.Program() - startup_prog = fluid.Program() - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') - label = fluid.data(name='label', shape=[100], 
dtype='int64') - weight = fluid.data(name='weight', shape=[200], dtype='float64') - ret = paddle.nn.functional.cross_entropy( - input, label, weight=weight) - - exe = fluid.Executor(place) - static_ret = exe.run(prog, - feed={ - 'input': input_np, - 'label': label_np, - "weight": weight_np - }, - fetch_list=[ret]) - self.assertIsNotNone(static_ret) - with fluid.dygraph.guard(): - dy_ret = paddle.nn.functional.cross_entropy( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np), - weight=fluid.dygraph.to_variable(weight_np)) - dy_ret_value = dy_ret.numpy() - self.assertIsNotNone(dy_ret_value) - expected = cross_entropy_loss_1d( - input_np, label_np, weight=weight_np)[0] - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - self.assertTrue(np.allclose(static_ret, expected)) - self.assertTrue(np.allclose(dy_ret_value, expected)) - - #2 - def test_cross_entropy_loss_1d_with_weight_sum(self): - input_np = np.random.random([100, 200]).astype(np.float64) - label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) - weight_np = np.random.random([200]).astype(np.float64) - prog = fluid.Program() - startup_prog = fluid.Program() - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') - label = fluid.data(name='label', shape=[100], dtype='int64') - weight = fluid.data(name='weight', shape=[200], dtype='float64') - ret = paddle.nn.functional.cross_entropy( - input, label, weight=weight, reduction='sum') - - exe = fluid.Executor(place) - static_ret = exe.run(prog, - feed={ - 'input': input_np, - 'label': label_np, - "weight": weight_np - }, - fetch_list=[ret]) - self.assertIsNotNone(static_ret) - with fluid.dygraph.guard(): - dy_ret = paddle.nn.functional.cross_entropy( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np), - weight=fluid.dygraph.to_variable(weight_np), - reduction='sum') - dy_ret_value = dy_ret.numpy() - self.assertIsNotNone(dy_ret_value) - expected = cross_entropy_loss_1d( - input_np, label_np, weight=weight_np, reduction='sum')[0] - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - self.assertTrue(np.allclose(static_ret, expected)) - self.assertTrue(np.allclose(dy_ret_value, expected)) - - #3 - def test_cross_entropy_loss_1d_with_weight_none(self): - input_np = np.random.random([100, 200]).astype(np.float64) - label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) - weight_np = np.random.random([200]).astype(np.float64) - prog = fluid.Program() - startup_prog = fluid.Program() - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') - label = fluid.data(name='label', shape=[100], dtype='int64') - weight = fluid.data(name='weight', shape=[200], dtype='float64') - ret = paddle.nn.functional.cross_entropy( - input, label, weight=weight, reduction='none') - - exe = fluid.Executor(place) - static_ret = exe.run(prog, - feed={ - 'input': input_np, - 'label': label_np, - "weight": weight_np - }, - fetch_list=[ret]) - self.assertIsNotNone(static_ret) - with fluid.dygraph.guard(): - dy_ret = paddle.nn.functional.cross_entropy( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np), - weight=fluid.dygraph.to_variable(weight_np), - reduction='none') - dy_ret_value = dy_ret.numpy() - 
self.assertIsNotNone(dy_ret_value) - expected = cross_entropy_loss_1d( - input_np, label_np, weight=weight_np, reduction='none') - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - self.assertTrue(np.allclose(static_ret, expected)) - self.assertTrue(np.allclose(dy_ret_value, expected)) - - #4 - def test_cross_entropy_loss_1d_mean(self): - input_np = np.random.random([100, 200]).astype(np.float64) - label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) - prog = fluid.Program() - startup_prog = fluid.Program() - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') - label = fluid.data(name='label', shape=[100], dtype='int64') - ret = paddle.nn.functional.cross_entropy(input, label) - - exe = fluid.Executor(place) - static_ret = exe.run(prog, - feed={'input': input_np, - 'label': label_np}, - fetch_list=[ret]) - self.assertIsNotNone(static_ret) - with fluid.dygraph.guard(): - dy_ret = paddle.nn.functional.cross_entropy( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np)) - dy_ret_value = dy_ret.numpy() - self.assertIsNotNone(dy_ret_value) - expected = cross_entropy_loss_1d(input_np, label_np)[0] - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - self.assertTrue(np.allclose(static_ret, expected)) - self.assertTrue(np.allclose(dy_ret_value, expected)) - - #5 - def test_cross_entropy_loss_1d_sum(self): - input_np = np.random.random([100, 200]).astype(np.float64) - label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) - prog = fluid.Program() - startup_prog = fluid.Program() - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') - label = fluid.data(name='label', shape=[100], dtype='int64') - ret = paddle.nn.functional.cross_entropy( - input, label, reduction='sum') - - exe = fluid.Executor(place) - static_ret = exe.run(prog, - feed={'input': input_np, - 'label': label_np}, - fetch_list=[ret]) - self.assertIsNotNone(static_ret) - with fluid.dygraph.guard(): - dy_ret = paddle.nn.functional.cross_entropy( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np), - reduction='sum') - dy_ret_value = dy_ret.numpy() - self.assertIsNotNone(dy_ret_value) - expected = cross_entropy_loss_1d(input_np, label_np, reduction='sum')[0] - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - self.assertTrue(np.allclose(static_ret, expected)) - self.assertTrue(np.allclose(dy_ret_value, expected)) - - #6 - def test_cross_entropy_loss_1d_none(self): - input_np = np.random.random([100, 200]).astype(np.float64) - label_np = np.random.randint(0, 100, size=(100, )).astype(np.int64) - prog = fluid.Program() - startup_prog = fluid.Program() - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data(name='input', shape=[100, 200], dtype='float64') - label = fluid.data(name='label', shape=[100], dtype='int64') - ret = paddle.nn.functional.cross_entropy( - input, label, reduction='none') - - exe = fluid.Executor(place) - static_ret = exe.run(prog, - feed={'input': input_np, - 'label': label_np}, - fetch_list=[ret]) - self.assertIsNotNone(static_ret) - with fluid.dygraph.guard(): - dy_ret = paddle.nn.functional.cross_entropy( - 
fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np), - reduction='none') - dy_ret_value = dy_ret.numpy() - self.assertIsNotNone(dy_ret_value) - expected = cross_entropy_loss_1d(input_np, label_np, reduction='none') - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - self.assertTrue(np.allclose(static_ret, expected)) - self.assertTrue(np.allclose(dy_ret_value, expected)) - - #7 - def test_cross_entropy_loss_2d_with_weight_none(self): - input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) - label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) - weight_np = np.random.random(size=(3, )).astype(np.float64) - prog = fluid.Program() - startup_prog = fluid.Program() - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data( - name='input', shape=[5, 3, 5, 5], dtype='float64') - label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') - weight = fluid.data(name='weight', shape=[3], dtype='float64') - ret = paddle.nn.functional.cross_entropy( - input, label, weight=weight, reduction='none') - - exe = fluid.Executor(place) - static_ret = exe.run(prog, - feed={ - 'input': input_np, - 'label': label_np, - "weight": weight_np - }, - fetch_list=[ret]) - self.assertIsNotNone(static_ret) - with fluid.dygraph.guard(): - dy_ret = paddle.nn.functional.cross_entropy( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np), - weight=fluid.dygraph.to_variable(weight_np), - reduction='none') - dy_ret_value = dy_ret.numpy() - self.assertIsNotNone(dy_ret_value) - expected = cross_entropy_loss_2d( - input_np, label_np, weight=weight_np, reduction='none') - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - self.assertTrue(np.allclose(static_ret, expected)) - self.assertTrue(np.allclose(dy_ret_value, expected)) - - #8 - def test_cross_entropy_loss_2d_with_weight_mean(self): - input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) - label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) - weight_np = np.random.random(size=(3, )).astype(np.float64) - prog = fluid.Program() - startup_prog = fluid.Program() - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data( - name='input', shape=[5, 3, 5, 5], dtype='float64') - label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') - weight = fluid.data(name='weight', shape=[3], dtype='float64') - ret = paddle.nn.functional.cross_entropy( - input, label, weight=weight, reduction='mean') - - exe = fluid.Executor(place) - static_ret = exe.run(prog, - feed={ - 'input': input_np, - 'label': label_np, - "weight": weight_np - }, - fetch_list=[ret]) - self.assertIsNotNone(static_ret) - with fluid.dygraph.guard(): - dy_ret = paddle.nn.functional.cross_entropy( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np), - weight=fluid.dygraph.to_variable(weight_np), - reduction='mean') - dy_ret_value = dy_ret.numpy() - self.assertIsNotNone(dy_ret_value) - expected = cross_entropy_loss_2d( - input_np, label_np, weight=weight_np, reduction='mean')[0] - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - self.assertTrue(np.allclose(static_ret, expected)) - self.assertTrue(np.allclose(dy_ret_value, expected)) - - #9 - def test_cross_entropy_loss_2d_with_weight_sum(self): - input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) - 
label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) - weight_np = np.random.random(size=(3, )).astype(np.float64) - prog = fluid.Program() - startup_prog = fluid.Program() - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data( - name='input', shape=[5, 3, 5, 5], dtype='float64') - label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') - weight = fluid.data(name='weight', shape=[3], dtype='float64') - ret = paddle.nn.functional.cross_entropy( - input, label, weight=weight, reduction='sum') - - exe = fluid.Executor(place) - static_ret = exe.run(prog, - feed={ - 'input': input_np, - 'label': label_np, - "weight": weight_np - }, - fetch_list=[ret]) - self.assertIsNotNone(static_ret) - with fluid.dygraph.guard(): - dy_ret = paddle.nn.functional.cross_entropy( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np), - weight=fluid.dygraph.to_variable(weight_np), - reduction='sum') - dy_ret_value = dy_ret.numpy() - self.assertIsNotNone(dy_ret_value) - expected = cross_entropy_loss_2d( - input_np, label_np, weight=weight_np, reduction='sum')[0] - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - self.assertTrue(np.allclose(static_ret, expected)) - self.assertTrue(np.allclose(dy_ret_value, expected)) - - #10 - def test_cross_entropy_loss_2d_none(self): - input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) - label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) - prog = fluid.Program() - startup_prog = fluid.Program() - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data( - name='input', shape=[5, 3, 5, 5], dtype='float64') - label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') - ret = paddle.nn.functional.cross_entropy( - input, label, reduction='none') - - exe = fluid.Executor(place) - static_ret = exe.run(prog, - feed={ - 'input': input_np, - 'label': label_np, - }, - fetch_list=[ret]) - self.assertIsNotNone(static_ret) - with fluid.dygraph.guard(): - dy_ret = paddle.nn.functional.cross_entropy( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np), - reduction='none') - dy_ret_value = dy_ret.numpy() - self.assertIsNotNone(dy_ret_value) - expected = cross_entropy_loss_2d(input_np, label_np, reduction='none') - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - self.assertTrue(np.allclose(static_ret, expected)) - self.assertTrue(np.allclose(dy_ret_value, expected)) - - #11 - def test_cross_entropy_loss_2d_mean(self): - input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) - label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) - prog = fluid.Program() - startup_prog = fluid.Program() - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data( - name='input', shape=[5, 3, 5, 5], dtype='float64') - label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') - ret = paddle.nn.functional.cross_entropy( - input, label, reduction='mean') - - exe = fluid.Executor(place) - static_ret = exe.run(prog, - feed={ - 'input': input_np, - 'label': label_np, - }, - fetch_list=[ret]) - self.assertIsNotNone(static_ret) - with fluid.dygraph.guard(): - dy_ret = paddle.nn.functional.cross_entropy( - fluid.dygraph.to_variable(input_np), - 
fluid.dygraph.to_variable(label_np), - reduction='mean') - dy_ret_value = dy_ret.numpy() - self.assertIsNotNone(dy_ret_value) - expected = cross_entropy_loss_2d( - input_np, label_np, reduction='mean')[0] - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - self.assertTrue(np.allclose(static_ret, expected)) - self.assertTrue(np.allclose(dy_ret_value, expected)) - - #12 - def test_cross_entropy_loss_2d_sum(self): - input_np = np.random.random(size=(5, 3, 5, 5)).astype(np.float64) - label_np = np.random.randint(0, 3, size=(5, 5, 5)).astype(np.int64) - prog = fluid.Program() - startup_prog = fluid.Program() - place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( - ) else fluid.CPUPlace() - with fluid.program_guard(prog, startup_prog): - input = fluid.data( - name='input', shape=[5, 3, 5, 5], dtype='float64') - label = fluid.data(name='label', shape=[5, 5, 5], dtype='int64') - ret = paddle.nn.functional.cross_entropy( - input, label, reduction='sum') - - exe = fluid.Executor(place) - static_ret = exe.run(prog, - feed={ - 'input': input_np, - 'label': label_np, - }, - fetch_list=[ret]) - self.assertIsNotNone(static_ret) - with fluid.dygraph.guard(): - dy_ret = paddle.nn.functional.cross_entropy( - fluid.dygraph.to_variable(input_np), - fluid.dygraph.to_variable(label_np), - reduction='sum') - dy_ret_value = dy_ret.numpy() - self.assertIsNotNone(dy_ret_value) - expected = cross_entropy_loss_2d(input_np, label_np, reduction='sum')[0] - self.assertTrue(np.allclose(static_ret, dy_ret_value)) - self.assertTrue(np.allclose(static_ret, expected)) - self.assertTrue(np.allclose(dy_ret_value, expected)) - - if __name__ == "__main__": unittest.main() diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 00a4034ead..c2d6fce670 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -128,6 +128,8 @@ from .loss import binary_cross_entropy #DEFINE_ALIAS from .loss import binary_cross_entropy_with_logits #DEFINE_ALIAS # from .loss import bpr_loss #DEFINE_ALIAS # from .loss import center_loss #DEFINE_ALIAS +#from .loss import cross_entropy #DEFINE_ALIAS +from .loss import softmax_cross_entropy #DEFINE_ALIAS from .loss import cross_entropy #DEFINE_ALIAS from .loss import dice_loss #DEFINE_ALIAS from .loss import hsigmoid_loss #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index fb923e0567..7bfe51c2ec 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -42,6 +42,7 @@ __all__ = [ 'binary_cross_entropy', 'binary_cross_entropy_with_logits', 'cross_entropy', + 'softmax_cross_entropy', 'dice_loss', 'hsigmoid_loss', 'kl_div', @@ -1120,39 +1121,73 @@ def cross_entropy(input, label, weight=None, ignore_index=-100, - reduction='mean'): - r""" - This operator implements the cross entropy loss function. This OP combines ``LogSoftmax``, - and ``NLLLoss`` together. + reduction='mean', + soft_label=False, + axis=-1, + name=None): + return softmax_cross_entropy( + input=input, + label=label, + weight=weight, + ignore_index=ignore_index, + reduction=reduction, + soft_label=soft_label, + axis=axis, + name=name) + + +def softmax_cross_entropy(input, + label, + weight=None, + ignore_index=-100, + reduction='mean', + soft_label=False, + axis=-1, + name=None): + """ + This operator implements the cross entropy loss function with softmax. 
This function + combines the calculation of the softmax operation and the cross entropy loss function + to provide a more numerically stable gradient. + Because this operator performs a softmax on logits internally, it expects + unscaled logits. This operator should not be used with the output of + softmax operator since that would produce incorrect results. - It is useful when training a classification problem with ``C`` classes. - If provided, the optional argument ``weight`` should be a 1D Variable assigning - weight to each of the classes. + When the attribute :attr:`soft_label` is set :attr:`False`, this operator + expects mutually exclusive hard labels: each sample in a batch is in exactly + one class with a probability of 1.0. Each sample in the batch will have a + single label. - For predictions label, and target label, the loss is calculated as follows. + The equation is as follows: + + 1) Hard label (one-hot label, so every sample has exactly one class) .. math:: - loss_j = -\\text{input[class]} + - \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{input}_i)\\right), j = 1,..., K + loss_j = -\\text{logits}_{label_j} + + \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logits}_i)\\right), j = 1,..., K - If weight is not ``None``: + 2) Soft label (each sample can have a distribution over all classes) .. math:: - loss_j = \\text{weight[class]}(-\\text{input[class]} + - \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{input}_i)\\right)), j = 1,..., K + loss_j = -\\sum_{i=0}^{K}\\text{label}_i + \\left(\\text{logits}_i - \\log\\left(\\sum_{i=0}^{K} + \\exp(\\text{logits}_i)\\right)\\right), j = 1,...,K + + + It is useful when training a classification problem with ``C`` classes. + Parameters: input (Tensor): Input tensor, the data type is float32, float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this - is (N, C, D1, D2,..., Dk), k >= 1. + is (N, D1, D2,..., Dk, C), k >= 1. label (Tensor): Label tensor, the data type is int64. Shape is (N), where each value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is (N, D1, D2,..., Dk), k >= 1. - weight (Tensor, optional): Weight tensor, a manual rescaling weight given - to each class and the shape is (C). It has the same dimensions as class - number and the data type is float32, float64. Default is ``'None'``. + weight (Tensor, optional): a manual rescaling weight given to each class. + If given, has to be a Tensor of size C and the data type is float32, float64. + Default is ``'None'``. reduction (str, optional): Indicate how to average the loss by batch_size, the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; @@ -1161,88 +1196,103 @@ def cross_entropy(input, Default is ``'mean'``. ignore_index (int64, optional): Specifies a target value that is ignored and does not contribute to the input gradient. Default is ``-100``. + soft_label (bool): Indicate whether the label is soft. Default is False, meaning that + the label is hard (one class per sample). If soft_label=True, the label is soft + (a probability distribution over the classes). + axis (int, optional): The index of dimension to perform softmax calculations. It + should be in range :math:`[-1, rank - 1]`, while :math:`rank` + is the rank of input :attr:`logits`. Default: -1. + Returns: The tensor variable storing the cross_entropy_loss of input and label. - Return type: Tensor. + Return type: Variable. Examples: .. 
code-block:: python - import paddle - paddle.disable_static() - input_data = np.random.random([5, 100]).astype("float64") - label_data = np.random.randint(0, 100, size=(5)).astype(np.int64) - weight_data = np.random.random([100]).astype("float64") - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) - weight = paddle.to_tensor(weight_data) - loss = paddle.nn.functional.cross_entropy(input=input, label=label, weight=weight) - print(loss.numpy()) - + import paddle + import paddle.nn.functional as F + import numpy as np + input_np = np.random.random([2, 4]).astype(np.float64) + label_np = np.random.randint(0, 4, size=(2)).astype(np.int64) + weight_np = np.random.random([4]).astype(np.float64) #shape:C + output = F.softmax_cross_entropy( + paddle.to_tensor(input_np), + paddle.to_tensor(label_np), + weight=paddle.to_tensor(weight_np)) + print(output.numpy()) #[1.30719427] """ - if not in_dygraph_mode(): - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'cross_entropy_loss') - fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'], - 'cross_entropy_loss') if reduction not in ['sum', 'mean', 'none']: raise ValueError( - "The value of 'reduction' in cross_entropy_loss should be 'sum', 'mean' or" - " 'none', but received %s, which is not allowed." % reduction) - - #step 1. log_softmax - log_softmax_out = paddle.nn.functional.log_softmax(input, axis=1) - if weight is not None and not isinstance(weight, Variable): + "The value of 'reduction' in softmax_cross_entropy" + " should be 'sum', 'mean' or 'none', but received %s, which is not allowed." + % reduction) + input_dims = len(list(input.shape)) + label_dims = len(list(label.shape)) + if input_dims - 1 != label_dims and input_dims != label_dims: raise ValueError( - "The weight' is not a Variable, please convert to Variable.") - - #step 2. 
nll_loss - input = log_softmax_out - helper = LayerHelper('nll_loss', **locals()) - dtype = helper.input_dtype(input) + 'Expected input_dims - 1 == label_dims or input_dims == label_dims\ + (got input_dims {}, label_dims {})'.format(input_dims, label_dims)) + if input_dims - 1 == label_dims: + label = paddle.unsqueeze(label, axis=axis) + if in_dygraph_mode(): + out = softmax_with_cross_entropy( + input, + label, + soft_label=soft_label, + ignore_index=ignore_index, + axis=axis) + if weight is not None: + weight_gather = core.ops.gather_nd(weight, label) # gather per-class weight into a per-sample weight + input_shape = list(label.shape) + weight_gather_reshape, _ = core.ops.reshape2(weight_gather, 'shape', + input_shape) + out = core.ops.elementwise_mul(out, weight_gather_reshape) - if not in_dygraph_mode(): - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'nll_loss') - fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'], - 'nll_loss') - - x_shape = list(input.shape) - n = x_shape[0] - c = x_shape[1] - x_dims = len(x_shape) - if x_dims < 2: - raise ValueError('Expected 2 or more dimensions (got {})'.format( - x_dims)) - if x_dims != 2 and x_dims != 4: - input = reshape(input, shape=[n, c, 1, -1]) - label = reshape(label, shape=[n, 1, -1]) - out_shape = [n] + x_shape[2:] + if reduction == "sum": + return core.ops.reduce_sum(out, 'reduce_all', True) + elif reduction == "mean": + if weight is not None: + out_sum = core.ops.reduce_sum(out, 'reduce_all', True) + total_weight = core.ops.reduce_sum(weight_gather_reshape, + 'reduce_all', True) + return out_sum / total_weight + else: + return core.ops.mean(out) + else: + return out - if not in_dygraph_mode(): - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'nll_loss') - fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'], - 'nll_loss') - inputs = {'X': input, 'Label': label} - attrs = {'reduction': reduction, 'ignore_index': ignore_index} + fluid.data_feeder.check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'softmax_cross_entropy') + fluid.data_feeder.check_variable_and_dtype( + label, 'label', ['int32', 'int64'], 'softmax_cross_entropy') + out = softmax_with_cross_entropy( + input, + label, + soft_label=soft_label, + ignore_index=ignore_index, + axis=axis) if weight is not None: - if isinstance(weight, Variable): - inputs['Weight'] = weight - - out = helper.create_variable_for_type_inference(dtype=input.dtype) - total_weight = helper.create_variable_for_type_inference(dtype=input.dtype) - outputs = {'Out': out, 'Total_weight': total_weight} - - helper.append_op( - type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs) - if x_dims != 2 and x_dims != 4 and reduction == 'none': - out = reshape(out, shape=out_shape) + fluid.data_feeder.check_variable_and_dtype( + weight, 'weight', ['float32', 'float64'], 'softmax_cross_entropy') + weight_name = name if reduction == 'none' else None + weight_gather = paddle.gather_nd(weight, label) # gather per-class weight into a per-sample weight + input_shape = list(label.shape) + weight_gather_reshape = reshape(weight_gather, shape=input_shape) + out = paddle.multiply(out, weight_gather_reshape, name=weight_name) - return out + if reduction == "sum": + return paddle.sum(out, name=name) + elif reduction == "mean": + if weight is not None: + out_sum = paddle.sum(out, name=name) + total_weight = paddle.sum(weight_gather_reshape) + return out_sum / total_weight + else: + return paddle.mean(out, name=name) + else: + return out def sigmoid_focal_loss(logit, 
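Both branches above implement the same weighted reduction: gather the per-class weight into a per-sample weight, multiply it into the per-sample loss, and, for the 'mean' reduction, divide by the total gathered weight rather than the sample count. A minimal NumPy sketch of this hard-label path, for reference while reviewing (the function name is illustrative and not part of the patch):

    import numpy as np

    def ref_softmax_cross_entropy(logits, label, weight=None, reduction='mean', axis=-1):
        # Numerically stable log-softmax along `axis`.
        shifted = logits - logits.max(axis=axis, keepdims=True)
        log_probs = shifted - np.log(np.exp(shifted).sum(axis=axis, keepdims=True))
        # Hard-label NLL: pick the log-probability of each sample's target class.
        out = -np.take_along_axis(
            log_probs, np.expand_dims(label, axis), axis=axis).squeeze(axis)
        w = None
        if weight is not None:
            w = weight[label]  # per-class weight gathered to a per-sample weight
            out = out * w
        if reduction == 'sum':
            return out.sum()
        if reduction == 'mean':
            # Weighted mean divides by the total gathered weight, not the sample count.
            return out.sum() / w.sum() if w is not None else out.mean()
        return out

For a (N, C) input this mirrors the cross_entropy_loss_1d reference used by the tests; dividing by the summed per-sample weights is what makes the 'mean' reduction agree with the out_sum / total_weight computation in both the dygraph and static branches.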
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index faf1345c7b..a6d1152adf 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -141,30 +141,40 @@ class BCEWithLogitsLoss(fluid.dygraph.Layer): class CrossEntropyLoss(fluid.dygraph.Layer): - r""" - :alias_main: paddle.nn.CrossEntropyLoss - :alias: paddle.nn.CrossEntropyLoss,paddle.nn.layer.CrossEntropyLoss,paddle.nn.layer.loss.CrossEntropyLoss + """ + This operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function + to provide a more numerically stable gradient. - This operator implements the cross entropy loss function. This OP combines ``LogSoftmax``, - and ``NLLLoss`` together. + Because this operator performs a softmax on logits internally, it expects + unscaled logits. This operator should not be used with the output of + softmax operator since that would produce incorrect results. - It is useful when training a classification problem with ``C`` classes. - If provided, the optional argument ``weight`` should be a 1D Variable assigning - weight to each of the classes. + When the attribute :attr:`soft_label` is set :attr:`False`, this operator + expects mutually exclusive hard labels: each sample in a batch is in exactly + one class with a probability of 1.0. Each sample in the batch will have a + single label. - For predictions label, and target label, the loss is calculated as follows. + The equation is as follows: + + 1) Hard label (one-hot label, so every sample has exactly one class) .. math:: - loss_j = -\\text{input[class]} + - \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{input}_i)\\right), j = 1,..., K + loss_j = -\\text{logits}_{label_j} + + \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{logits}_i)\\right), j = 1,..., K - If weight is not ``None``: + 2) Soft label (each sample can have a distribution over all classes) .. math:: - loss_j = \\text{weight[class]}(-\\text{input[class]} + - \\log\\left(\\sum_{i=0}^{K}\\exp(\\text{input}_i)\\right)), j = 1,..., K + loss_j = -\\sum_{i=0}^{K}\\text{label}_i + \\left(\\text{logits}_i - \\log\\left(\\sum_{i=0}^{K} + \\exp(\\text{logits}_i)\\right)\\right), j = 1,...,K + + + It is useful when training a classification problem with ``C`` classes. + Parameters: input (Variable): Input tensor, the data type is float32, float64. Shape is @@ -173,9 +183,9 @@ class CrossEntropyLoss(fluid.dygraph.Layer): label (Variable): Label tensor, the data type is int64. Shape is (N), where each value is 0 <= label[i] <= C-1, and if shape is more than 2D, this is (N, D1, D2,..., Dk), k >= 1. - weight (Variable, optional): Weight tensor, a manual rescaling weight given - to each class and the shape is (C). It has the same dimensions as class - number and the data type is float32, float64. Default is ``'None'``. + weight (Variable, optional): Weight tensor, a manual rescaling weight for each + sample relative to each class. It has the same shape as label, + and the data type is float32, float64. Default is ``'None'``. reduction (str, optional): Indicate how to average the loss by batch_size, the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; @@ -184,6 +194,12 @@ class CrossEntropyLoss(fluid.dygraph.Layer): Default is ``'mean'``. ignore_index (int64, optional): Specifies a target value that is ignored and does not contribute to the input gradient. Default is ``-100``. 
+ soft_label (bool): Indicate whether the label is soft. Default is False, meaning that + the label is hard. If soft_label=True, the label is soft. + axis (int, optional): The index of dimension to perform softmax calculations. It + should be in range :math:`[-1, rank - 1]`, while :math:`rank` + is the rank of input :attr:`logits`. Default: -1. + Returns: The tensor variable storing the cross_entropy_loss of input and label. @@ -192,64 +208,47 @@ class CrossEntropyLoss(fluid.dygraph.Layer): Examples: .. code-block:: python - - # declarative mode import paddle - import paddle.fluid as fluid import numpy as np - - input = fluid.data(name='input', shape=[5, 100], dtype='float64') - label = fluid.data(name='label', shape=[5], dtype='int64') - weight = fluid.data(name='weight', shape=[100], dtype='float64') - ce_loss = paddle.nn.loss.CrossEntropyLoss(weight=weight, reduction='mean') - output = ce_loss(input, label) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - input_data = np.random.random([5, 100]).astype("float64") - label_data = np.random.randint(0, 100, size=(5)).astype(np.int64) - weight_data = np.random.random([100]).astype("float64") - output = exe.run(fluid.default_main_program(), - feed={"input": input_data, "label": label_data,"weight": weight_data}, - fetch_list=[output], - return_numpy=True) - print(output) - - # imperative mode - import paddle.fluid.dygraph as dg - with dg.guard(place) as g: - input = dg.to_variable(input_data) - label = dg.to_variable(label_data) - weight = dg.to_variable(weight_data) - ce_loss = paddle.nn.loss.CrossEntropyLoss(weight=weight, reduction='mean') - output = ce_loss(input, label) - print(output.numpy()) + input_np = np.random.random([2, 4]).astype(np.float64) + label_np = np.random.randint(0, 4, size=(2, 1)).astype(np.int64) + weight_np = np.random.random([4]).astype(np.float64) #shape:C + weight_ce = weight_np[label_np] #shape:N,1 + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + weight=paddle.to_tensor(weight_ce)) + output = cross_entropy_loss( + paddle.to_tensor(input_np), + paddle.to_tensor(label_np)) + print(output.numpy()) #[1.44375251] """ - def __init__(self, weight=None, ignore_index=-100, reduction='mean'): + def __init__(self, + weight=None, + ignore_index=-100, + reduction='mean', + soft_label=False, + axis=-1, + name=None): super(CrossEntropyLoss, self).__init__() self.weight = weight self.reduction = reduction self.ignore_index = ignore_index + self.soft_label = soft_label + self.axis = axis + self.name = name def forward(self, input, label): - fluid.data_feeder.check_variable_and_dtype( - input, 'input', ['float32', 'float64'], 'cross_entropy_loss') - fluid.data_feeder.check_variable_and_dtype(label, 'label', ['int64'], - 'cross_entropy_loss') - - if self.reduction not in ['sum', 'mean', 'none']: - raise ValueError( - "The value of 'reduction' in cross_entropy_loss should be 'sum', 'mean' or" - " 'none', but received %s, which is not allowed." % - self.reduction) - - return paddle.nn.functional.cross_entropy( + ret = paddle.nn.functional.softmax_cross_entropy( input, label, weight=self.weight, ignore_index=self.ignore_index, - reduction=self.reduction) + reduction=self.reduction, + soft_label=self.soft_label, + axis=self.axis, + name=self.name) + + return ret class HSigmoidLoss(fluid.dygraph.Layer): @@ -491,29 +490,31 @@ class L1Loss(fluid.dygraph.Layer): If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. Examples: - .. 
code-block:: python - import paddle + import numpy as np - input = paddle.to_tensor([[1.5, 0.8], [0.2, 1.3]]) - label = paddle.to_tensor([[1.7, 1.0], [0.4, 0.5]]) + paddle.disable_static() + input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32") + label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32") + input = paddle.to_tensor(input_data) + label = paddle.to_tensor(label_data) l1_loss = paddle.nn.loss.L1Loss() output = l1_loss(input, label) - print(output) + print(output.numpy()) # [0.35] l1_loss = paddle.nn.loss.L1Loss(reduction='sum') output = l1_loss(input, label) - print(output) + print(output.numpy()) # [1.4] l1_loss = paddle.nn.loss.L1Loss(reduction='none') output = l1_loss(input, label) - print(output) + print(output.numpy()) # [[0.20000005 0.19999999] - # [0.2 0.79999995]] + # [0.2 0.79999995]] """ def __init__(self, reduction='mean', name=None): @@ -622,7 +623,9 @@ class BCELoss(fluid.dygraph.Layer): class NLLLoss(fluid.dygraph.Layer): - r""" + """ + :alias_main: paddle.nn.NLLLoss + :alias: paddle.nn.NLLLoss,paddle.nn.layer.NLLLoss,paddle.nn.layer.loss.NLLLoss This class accepts input and target label and returns negative log likelihood cross error. It is useful to train a classification problem with C classes. @@ -689,7 +692,7 @@ class NLLLoss(fluid.dygraph.Layer): import paddle import numpy as np - nll_loss = paddle.nn.NLLLoss() + nll_loss = paddle.nn.layer.NLLLoss() log_softmax = paddle.nn.LogSoftmax(axis=1) input_np = np.array([[0.88103855, 0.9908683 , 0.6226845 ], @@ -699,11 +702,13 @@ class NLLLoss(fluid.dygraph.Layer): [0.05689114, 0.0862954 , 0.6325046 ]]).astype(np.float32) label_np = np.array([0, 2, 1, 1, 0]).astype(np.int64) + place = paddle.CPUPlace() + paddle.disable_static(place) input = paddle.to_tensor(input_np) log_out = log_softmax(input) label = paddle.to_tensor(label_np) result = nll_loss(log_out, label) - print(result) # [1.0720209] + print(result.numpy()) # [1.0720209] """ @@ -999,7 +1004,7 @@ class SmoothL1Loss(fluid.dygraph.Layer): is the same as the shape of input. Returns: - The tensor storing the smooth_l1_loss of input and label. + The tensor variable storing the smooth_l1_loss of input and label. Return type: Tensor. -- GitLab
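The new soft-label path added by this patch can be exercised as follows; this is a sketch assuming the patch is applied as-is, and the inputs are random, so the printed value will vary:

    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    logits = paddle.to_tensor(np.random.random([2, 4]).astype("float64"))
    # Soft labels: each row is a probability distribution over the C=4 classes,
    # so the label has the same shape as the logits and soft_label=True is required.
    soft_np = np.random.random([2, 4]).astype("float64")
    soft_np /= soft_np.sum(axis=1, keepdims=True)
    label = paddle.to_tensor(soft_np)
    loss = F.softmax_cross_entropy(logits, label, soft_label=True, axis=-1)
    print(loss.numpy())  # a single mean soft-label cross entropy value

Because input_dims == label_dims here, no unsqueeze is applied and the label is passed straight to softmax_with_cross_entropy with soft_label=True; the default 'mean' reduction then averages the per-sample losses.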