diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index db87fc7c202e7e34da9f6e28f33da46f715ccb10..32532f7ed008ecd72154a298bbedbc6dcefd7e9a 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1190,10 +1190,6 @@ def chunk_eval(input,
 
 def softmax(input, use_cudnn=False, name=None, axis=-1):
     """
-    :alias_main: paddle.nn.functional.softmax
-    :alias: paddle.nn.functional.softmax,paddle.nn.functional.activation.softmax
-    :old_api: paddle.fluid.layers.softmax
-
     This operator implements the softmax layer. The calculation process is as follows:
 
     1. The dimension :attr:`axis` of the ``input`` will be permuted to the last.
@@ -1307,8 +1303,8 @@ def softmax(input, use_cudnn=False, name=None, axis=-1):
     attrs = {"axis": axis, "use_cudnn": use_cudnn}
 
     helper = LayerHelper('softmax', **locals())
-    check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'],
-                             'softmax')
+    check_variable_and_dtype(input, 'input/x',
+                             ['float16', 'float32', 'float64'], 'softmax')
 
     dtype = helper.input_dtype()
     softmax_out = helper.create_variable_for_type_inference(dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 1df50d63e3f67424ed1f42b94c317030ed69c6e9..1d04a4a271603b5bd6fa7d9ee52798c178607fd6 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -20,6 +20,9 @@ from op_test import OpTest
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler, Program, program_guard
+import paddle
+
+np.random.seed(10)
 
 
 def stable_softmax(x):
@@ -220,5 +223,43 @@ class TestSoftmaxFP16CUDNNOp2(TestSoftmaxFP16CUDNNOp):
         return [2, 3, 4, 5]
 
 
+class TestNnFunctionalSoftmaxApi(unittest.TestCase):
+    def setUp(self):
+        self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else paddle.CPUPlace()
+        self.x_np = np.random.uniform(-1., 1., [2, 3, 4, 5]).astype('float32')
+        self.out_ref = np.apply_along_axis(stable_softmax, -1, self.x_np)
+
+    def test_api_static(self):
+        train_program = Program()
+        startup_program = Program()
+        with program_guard(train_program, startup_program):
+            x = paddle.data('X', self.x_np.shape, 'float32')
+            out = paddle.nn.functional.softmax(x)
+
+        exe = paddle.Executor(self.place)
+        res = exe.run(train_program, feed={'X': self.x_np}, fetch_list=[out])
+
+        assert np.allclose(self.out_ref, res[0])
+
+    def test_api_imperative(self):
+        with paddle.imperative.guard(self.place):
+            x = paddle.imperative.to_variable(self.x_np)
+            out = paddle.nn.functional.softmax(x)
+            assert np.allclose(self.out_ref, out.numpy())
+
+            out = paddle.nn.functional.softmax(x, axis=0)
+            out_ref = np.apply_along_axis(stable_softmax, 0, self.x_np)
+            assert np.allclose(out_ref, out.numpy())
+
+    def test_error(self):
+        with program_guard(Program(), Program()):
+            # x should be a Variable and its dtype should be float32 or float64.
+            self.assertRaises(TypeError, paddle.nn.functional.softmax, [1])
+
+            x = paddle.data(name='x', shape=[2, 3], dtype='int32')
+            self.assertRaises(TypeError, paddle.nn.functional.softmax, x)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py
index dad6b252ed4d54db4a85d1912503ce5401a1ca4c..b75dd22429b87bcd3e02969d86344fec29a9419b 100644
--- a/python/paddle/nn/functional/activation.py
+++ b/python/paddle/nn/functional/activation.py
@@ -26,7 +26,6 @@ from ...fluid.layers import maxout  #DEFINE_ALIAS
 from ...fluid.layers import relu6  #DEFINE_ALIAS
 from ...fluid.layers import selu  #DEFINE_ALIAS
 from ...fluid.layers import soft_relu  #DEFINE_ALIAS
-from ...fluid.layers import softmax  #DEFINE_ALIAS
 from ...fluid.layers import softplus  #DEFINE_ALIAS
 from ...fluid.layers import softshrink  #DEFINE_ALIAS
 from ...fluid.layers import softsign  #DEFINE_ALIAS
@@ -67,6 +66,7 @@ from ...fluid.layer_helper import LayerHelper
 from ...fluid.framework import in_dygraph_mode, convert_np_dtype_to_dtype_
 from ...fluid import core
 from ...fluid.data_feeder import check_variable_and_dtype
+import paddle
 
 
 def hsigmoid(input,
@@ -305,6 +305,122 @@ def sigmoid(input, inplace=False, name=None):
     return outputs
 
 
+def softmax(x, axis=-1, name=None):
+    """
+    This operator implements the softmax layer. The calculation process is as follows:
+
+    1. The dimension :attr:`axis` of ``x`` will be permuted to the last.
+
+    2. Then ``x`` will be logically flattened to a 2-D matrix. The matrix's second
+    dimension (row length) is the same as the dimension :attr:`axis` of ``x``,
+    and the first dimension (column length) is the product of all other dimensions
+    of ``x``. For each row of the matrix, the softmax operator squashes the
+    K-dimensional (K is the width of the matrix, which is also the size of ``x``'s
+    dimension :attr:`axis`) vector of arbitrary real values to a K-dimensional
+    vector of real values in the range [0, 1] that add up to 1.
+
+    3. After the softmax operation is completed, the inverse operations of steps 1 and 2
+    are performed to restore the two-dimensional matrix to the same shape as ``x``.
+
+    For every entry, the operator computes the exponential of that entry and the
+    sum of the exponentials of all entries along the dimension :attr:`axis`. The
+    ratio of the entry's exponential to that sum is the corresponding output of
+    the softmax operator, so the values along :attr:`axis` form a probability
+    distribution.
+
+    For each row :math:`i` and each column :math:`j` in the matrix, we have:
+
+    .. math::
+
+        out[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(\\exp(x[i, j]))}
+
+    Example:
+
+    .. code-block:: text
+
+        Case 1:
+          Input:
+            x.shape = [2, 3, 4]
+            x.data = [[[2.0, 3.0, 4.0, 5.0],
+                       [3.0, 4.0, 5.0, 6.0],
+                       [7.0, 8.0, 8.0, 9.0]],
+                      [[1.0, 2.0, 3.0, 4.0],
+                       [5.0, 6.0, 7.0, 8.0],
+                       [6.0, 7.0, 8.0, 9.0]]]
+
+          Attrs:
+            axis = -1
+
+          Output:
+            out.shape = [2, 3, 4]
+            out.data = [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.07232949, 0.19661193, 0.19661193, 0.53444665]],
+                        [[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]
+
+        Case 2:
+          Input:
+            x.shape = [2, 3, 4]
+            x.data = [[[2.0, 3.0, 4.0, 5.0],
+                       [3.0, 4.0, 5.0, 6.0],
+                       [7.0, 8.0, 8.0, 9.0]],
+                      [[1.0, 2.0, 3.0, 4.0],
+                       [5.0, 6.0, 7.0, 8.0],
+                       [6.0, 7.0, 8.0, 9.0]]]
+          Attrs:
+            axis = 1
+
+          Output:
+            out.shape = [2, 3, 4]
+            out.data = [[[0.00657326, 0.00657326, 0.01714783, 0.01714783],
+                         [0.01786798, 0.01786798, 0.04661262, 0.04661262],
+                         [0.97555875, 0.97555875, 0.93623955, 0.93623955]],
+                        [[0.00490169, 0.00490169, 0.00490169, 0.00490169],
+                         [0.26762315, 0.26762315, 0.26762315, 0.26762315],
+                         [0.72747516, 0.72747516, 0.72747516, 0.72747516]]]
+
+    Args:
+        x (Tensor): The input multi-dimensional Tensor with data type float32 or float64.
+        axis (int, optional): The axis along which to perform softmax calculations.
+            It should be in range [-D, D), where D is the rank of ``x``.
+            When ``axis`` < 0, it works the same way as :math:`axis + D`.
+            Default is -1.
+        name (str, optional): Name for the operation (optional, default is None).
+            For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        A Tensor with the same data type and shape as ``x``.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+            import paddle.nn.functional as F
+            import numpy as np
+
+            paddle.enable_imperative()
+
+            x = np.array([[[2.0, 3.0, 4.0, 5.0],
+                           [3.0, 4.0, 5.0, 6.0],
+                           [7.0, 8.0, 8.0, 9.0]],
+                          [[1.0, 2.0, 3.0, 4.0],
+                           [5.0, 6.0, 7.0, 8.0],
+                           [6.0, 7.0, 8.0, 9.0]]], 'float32')
+            x = paddle.imperative.to_variable(x)
+            out = F.softmax(x)
+            # [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.07232949, 0.19661193, 0.19661193, 0.53444665]],
+            #  [[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
+            #   [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]
+    """
+    return paddle.fluid.layers.softmax(input=x, axis=axis, name=name)
+
+
 def log_softmax(input, axis=None, dtype=None, name=None):
     """
     :alias_main: paddle.nn.functional.log_softmax
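For reference only, not part of the patch: a minimal NumPy sketch of the axis-wise softmax described in the new docstring, which paddle.nn.functional.softmax delegates to paddle.fluid.layers.softmax. The helper name ref_softmax is hypothetical; it simply generalizes the stable_softmax reference used in test_softmax_op.py to an arbitrary axis.

import numpy as np

def ref_softmax(x, axis=-1):
    # Subtract the per-slice maximum for numerical stability; softmax is
    # shift-invariant along `axis`, so this does not change the result.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    exps = np.exp(shifted)
    # Normalize so the entries along `axis` sum to 1.
    return exps / np.sum(exps, axis=axis, keepdims=True)

x = np.array([[[2.0, 3.0, 4.0, 5.0],
               [3.0, 4.0, 5.0, 6.0],
               [7.0, 8.0, 8.0, 9.0]],
              [[1.0, 2.0, 3.0, 4.0],
               [5.0, 6.0, 7.0, 8.0],
               [6.0, 7.0, 8.0, 9.0]]], dtype='float32')
print(ref_softmax(x, axis=-1))  # reproduces the Case 1 output in the docstring
print(ref_softmax(x, axis=1))   # reproduces the Case 2 output in the docstring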