From a3ae080aa1eda28277795f049ba37f2df6b8d68a Mon Sep 17 00:00:00 2001 From: Charles-hit <56987902+Charles-hit@users.noreply.github.com> Date: Fri, 2 Dec 2022 21:50:11 +0800 Subject: [PATCH] remove softmax api from fluid (#48388) * move softmax to paddle2.0 * fix some bugs * resolve conflict * remove some code * modify code style * fix bugs * fix code * fix move code * fix some bugs * fix code * fix some code * modify the header file * fix bugs * fix some examples * fix mish example * fix code --- python/paddle/fluid/layers/detection.py | 2 +- python/paddle/fluid/layers/nn.py | 164 ++---------------- python/paddle/fluid/layers/rnn.py | 4 +- .../fleet/parallel_dygraph_se_resnext.py | 2 +- .../fleet/parallel_dygraph_transformer.py | 4 +- .../fluid/tests/unittests/dist_transformer.py | 6 +- .../seq2seq_dygraph_model.py | 6 +- .../unittests/dygraph_to_static/test_dict.py | 4 +- .../dygraph_to_static/test_ifelse.py | 2 +- .../dygraph_to_static/test_mobile_net.py | 2 +- .../test_reinforcement_learning.py | 2 +- .../dygraph_to_static/test_se_resnet.py | 2 +- .../transformer_dygraph_model.py | 4 +- .../unittests/ipu/test_dy2static_fp16_ipu.py | 2 +- .../tests/unittests/ipu/test_dy2static_ipu.py | 2 +- .../unittests/ipu/test_modelruntime_ipu.py | 2 +- .../tests/unittests/ipu/test_print_op_ipu.py | 2 +- .../unittests/ipu/test_softmax_op_ipu.py | 2 +- .../test_mkldnn_inplace_fuse_pass.py | 2 +- .../ir/inference/test_trt_activation_pass.py | 2 +- .../ir/inference/test_trt_fc_fuse_pass.py | 19 +- .../ir/inference/test_trt_gather_op.py | 4 +- .../unittests/ir/test_ir_fc_fuse_pass.py | 3 +- .../unittests/npu/test_softmax_op_npu.py | 2 +- .../test_imperative_ocr_attention_model.py | 2 +- .../test_imperative_reinforcement.py | 2 +- .../unittests/test_imperative_se_resnext.py | 4 +- ..._imperative_transformer_sorted_gradient.py | 4 +- .../fluid/tests/unittests/test_layers.py | 6 +- .../fluid/tests/unittests/test_mean_op.py | 2 +- .../tests/unittests/test_recurrent_op.py | 2 +- .../tests/unittests/test_rnn_decode_api.py | 4 +- .../tests/unittests/xpu/test_mean_op_xpu.py | 2 +- 33 files changed, 69 insertions(+), 205 deletions(-) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index f021ab8f3d..d490b0457d 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -626,7 +626,7 @@ def detection_output( target_box=loc, code_type='decode_center_size', ) - scores = nn.softmax(input=scores) + scores = paddle.nn.functional.softmax(scores) scores = paddle.transpose(scores, perm=[0, 2, 1]) scores.stop_gradient = True nmsed_outs = helper.create_variable_for_type_inference( diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 717c965727..4dab44ebe5 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -68,7 +68,6 @@ __all__ = [ 'linear_chain_crf', 'crf_decoding', 'conv2d', - 'softmax', 'pool2d', 'batch_norm', 'dropout', @@ -145,7 +144,7 @@ def _get_reduce_dim(dim, input): else: raise TypeError( "The type of dim must be int, list, tuple or range, but received {}".format( - type(axis) + type(dim) ) ) if dim is None: @@ -679,7 +678,7 @@ def _pull_gpups_sparse( size(int|list of int): The embedding size parameter of each input, which indicates the size of each embedding vector respectively. dtype(str): The dtype refers to the data type of output tensor. Only supports - float32 now. + float32 now. 
Returns: Variable|list of Variable: The tensor variable storing the embeddings of the \ @@ -742,7 +741,7 @@ def _pull_box_sparse( size(int): The embedding size parameter, which indicates the size of each embedding vector respectively. dtype(str): The dtype refers to the data type of output tensor. Only supports - float32 now. + float32 now. Returns: Variable|list of Variable: The tensor variable storing the embeddings of the \ @@ -1123,147 +1122,6 @@ def dropout( return out -@deprecated(since="2.0.0", update_to="paddle.nn.functional.softmax") -def softmax(input, use_cudnn=True, name=None, axis=-1): - r""" - This operator implements the softmax layer. The calculation process is as follows: - - 1. The dimension :attr:`axis` of the ``input`` will be permuted to the last. - - 2. Then the input tensor will be logically flattened to a 2-D matrix. The matrix's - second dimension(row length) is the same as the dimension :attr:`axis` of the input - tensor, and the first dimension(column length) is the product of all other - dimensions of the input tensor. For each row of the matrix, the softmax operator - squashes the K-dimensional(K is the width of the matrix, which is also the size - of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a - K-dimensional vector of real values in the range [0, 1] that add up to 1. - - 3. After the softmax operation is completed, the inverse operations of steps 1 and 2 - are performed to restore the two-dimensional matrix to the same dimension as the ``input``. - - It computes the exponential of the given dimension and the sum of exponential - values of all the other dimensions in the K-dimensional vector input. - Then the ratio of the exponential of the given dimension and the sum of - exponential values of all the other dimensions is the output of the softmax - operator. - - For each row :math:`i` and each column :math:`j` in the matrix, we have: - - .. math:: - - Out[i, j] = \\frac{\\exp(X[i, j])}{\\sum_j(exp(X[i, j])} - - Example: - - .. code-block:: text - - Case 1: - Input: - X.shape = [2, 3, 4] - X.data = [[[2.0, 3.0, 4.0, 5.0], - [3.0, 4.0, 5.0, 6.0], - [7.0, 8.0, 8.0, 9.0]], - [[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [6.0, 7.0, 8.0, 9.0]]] - - Attrs: - axis = -1 - - Output: - Out.shape = [2, 3, 4] - Out.data = [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426], - [0.0320586 , 0.08714432, 0.23688282, 0.64391426], - [0.07232949, 0.19661193, 0.19661193, 0.53444665]], - [[0.0320586 , 0.08714432, 0.23688282, 0.64391426], - [0.0320586 , 0.08714432, 0.23688282, 0.64391426], - [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]] - - Case 2: - Input: - X.shape = [2, 3, 4] - X.data = [[[2.0, 3.0, 4.0, 5.0], - [3.0, 4.0, 5.0, 6.0], - [7.0, 8.0, 8.0, 9.0]], - [[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [6.0, 7.0, 8.0, 9.0]]] - Attrs: - axis = 1 - - Output: - Out.shape = [2, 3, 4] - Out.data = [[[0.00657326, 0.00657326, 0.01714783, 0.01714783], - [0.01786798, 0.01786798, 0.04661262, 0.04661262], - [0.97555875, 0.97555875, 0.93623955, 0.93623955]], - [[0.00490169, 0.00490169, 0.00490169, 0.00490169], - [0.26762315, 0.26762315, 0.26762315, 0.26762315], - [0.72747516, 0.72747516, 0.72747516, 0.72747516]]] - - Args: - input (Tensor): The input tensor. A multi-dimension ``Tensor`` with type float32 or float64. - use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn \ - library is installed. To improve performance, set use_cudnn to True by default. - name (str, optional): The default value is None. 
Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Default: None. - will be named automatically. Default: None. - axis (int, optional): The index of dimension to perform softmax calculations, it should - be in range :math:`[-1, rank - 1]`, while :math:`rank` is the rank of - input tensor. Default: -1. -1 means the last dimension. - - Returns: - Tensor: ``Tensor`` indicates the output of softmax. The data type and shape are the same as ``input`` . - - Examples: - - .. code-block:: python - - import paddle - import paddle.nn.functional as F - - x = paddle.to_tensor([[[2.0, 3.0, 4.0, 5.0], - [3.0, 4.0, 5.0, 6.0], - [7.0, 8.0, 8.0, 9.0]], - [[1.0, 2.0, 3.0, 4.0], - [5.0, 6.0, 7.0, 8.0], - [6.0, 7.0, 8.0, 9.0]]], dtype='float32') - y = F.softmax(x, axis=1) - print(y) - # [[[0.00657326, 0.00657326, 0.01714783, 0.01714783], - # [0.01786798, 0.01786798, 0.04661262, 0.04661262], - # [0.97555870, 0.97555870, 0.93623954, 0.93623954]], - # [[0.00490169, 0.00490169, 0.00490169, 0.00490169], - # [0.26762316, 0.26762316, 0.26762316, 0.26762316], - # [0.72747517, 0.72747517, 0.72747517, 0.72747517]]] - - """ - - if in_dygraph_mode(): - return _C_ops.softmax(input, axis) - - if _non_static_mode(): - return _legacy_C_ops.softmax( - input, 'axis', axis, 'use_cudnn', use_cudnn - ) - - inputs = {"X": [input]} - attrs = {"axis": axis, "use_cudnn": use_cudnn} - - helper = LayerHelper('softmax', **locals()) - check_variable_and_dtype( - input, 'input/x', ['float16', 'float32', 'float64'], 'softmax' - ) - - dtype = helper.input_dtype() - softmax_out = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="softmax", - inputs={"X": input}, - outputs={"Out": softmax_out}, - attrs=attrs, - ) - return softmax_out - - def conv2d( input, num_filters, @@ -1788,7 +1646,7 @@ def pool2d( if pool_padding == "VALID": padding_algorithm = "VALID" pool_padding = [0, 0] - if ceil_mode != False: + if ceil_mode is not False: raise ValueError( "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. " "Received ceil_mode: True." @@ -6643,7 +6501,7 @@ def deformable_roi_pooling( ) input_channels = input.shape[1] - if position_sensitive == False: + if position_sensitive is False: output_channels = input_channels else: output_channels = input_channels / pooled_height / pooled_width @@ -6841,11 +6699,11 @@ def mish(x, threshold=20, name=None): .. math:: - out = \\begin{cases} - x \\ast \\tanh(x), \\text{if } x > \\text{threshold} \\\\ - x \\ast \\tanh(e^{x}), \\text{if } x < -\\text{threshold} \\\\ - x \\ast \\tanh(\\ln(1 + e^{x})), \\text{otherwise} - \\end{cases} + out = \\begin{cases} + x \\ast \\tanh(x), \\text{if } x > \\text{threshold} \\\\ + x \\ast \\tanh(e^{x}), \\text{if } x < -\\text{threshold} \\\\ + x \\ast \\tanh(\\ln(1 + e^{x})), \\text{otherwise} + \\end{cases} Args: x (Variable): Input feature, multi-dimensional Tensor. The data type @@ -6867,9 +6725,11 @@ def mish(x, threshold=20, name=None): .. 
code-block:: python + import paddle import paddle.fluid as fluid import numpy as np + paddle.enable_static() DATATYPE='float32' x_data = np.array([i for i in range(1,5)]).reshape([1,1,4]).astype(DATATYPE) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 83c4d6c2cb..60ac537ffc 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -1304,7 +1304,7 @@ class BeamSearchDecoder(Decoder): self.noend_mask_tensor, "float64" ) - step_log_probs = paddle.log(nn.softmax(logits)) + step_log_probs = paddle.log(paddle.nn.functional.softmax(logits)) step_log_probs = self._mask_probs(step_log_probs, beam_state.finished) log_probs = nn.elementwise_add( x=step_log_probs, y=beam_state.log_probs, axis=0 ) @@ -2330,7 +2330,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper): if self.softmax_temperature is not None else outputs ) - probs = nn.softmax(logits) + probs = paddle.nn.functional.softmax(logits) # TODO: remove this stop_gradient. The stop_gradient of sample_ids can # not pass to probs, since sampling_id op does not have corresponding # grad op and thus can not pass. diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py index 13e83741ea..164f1410ed 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_se_resnext.py @@ -354,7 +354,7 @@ class TestSeResNeXt(TestParallelDyGraphRunnerBase): label.stop_gradient = True out = model(img) - softmax_out = fluid.layers.softmax(out, use_cudnn=False) + softmax_out = paddle.nn.functional.softmax(out) loss = fluid.layers.cross_entropy(input=softmax_out, label=label) avg_loss = paddle.mean(x=loss) return avg_loss diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py index 5cfd8a6078..41c8afd629 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/parallel_dygraph_transformer.py @@ -342,7 +342,7 @@ class MultiHeadAttentionLayer(Layer): ) if attn_bias is not None: product += attn_bias - weights = fluid.layers.softmax(product) + weights = paddle.nn.functional.softmax(product) if self._dropout_rate: weights_droped = fluid.layers.dropout( weights, @@ -849,7 +849,7 @@ class WrapDecoderLayer(Layer): if dec_inputs is None: # Return probs for independent decoder program.
- predict_out = fluid.layers.softmax(predict) + predict_out = paddle.nn.functional.softmax(predict) return predict_out return predict diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 7106c426bc..cb60e1c599 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -1177,7 +1177,7 @@ def multi_head_attention( product = layers.matmul(x=scaled_q, y=k, transpose_y=True) if attn_bias: product += attn_bias - weights = layers.softmax(product) + weights = paddle.nn.functional.softmax(product) if dropout_rate: weights = layers.dropout( weights, @@ -1715,7 +1715,7 @@ def wrap_decoder( bias_attr=const_bias_attr, ) if dec_inputs is None: - predict = layers.softmax(predict) + predict = paddle.nn.functional.softmax(predict) return predict @@ -1834,7 +1834,7 @@ def fast_decode( logits = paddle.reshape(logits, (-1, trg_vocab_size)) topk_scores, topk_indices = layers.topk( - input=layers.softmax(logits), k=beam_size + input=paddle.nn.functional.softmax(logits), k=beam_size ) accu_scores = layers.elementwise_add( x=paddle.log(topk_scores), diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py index bf1dfdcad2..d364b8a1a5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/seq2seq_dygraph_model.py @@ -435,7 +435,9 @@ class BaseModel(fluid.dygraph.Layer): cell_outputs = self._split_batch_beams(step_input) cell_outputs = self.fc(cell_outputs) - step_log_probs = paddle.log(fluid.layers.softmax(cell_outputs)) + step_log_probs = paddle.log( + paddle.nn.functional.softmax(cell_outputs) + ) noend_array = [-self.kinf] * self.tar_vocab_size noend_array[self.beam_end_token] = 0 noend_mask_tensor = to_variable( @@ -703,7 +705,7 @@ class AttentionModel(fluid.dygraph.Layer): attn = paddle.transpose(attn, [1, 0, 2]) attn = paddle.add(attn, mask * 1000000000) attn = paddle.transpose(attn, [1, 0, 2]) - weight = fluid.layers.softmax(attn) + weight = paddle.nn.functional.softmax(attn) weight_memory = fluid.layers.matmul(weight, memory) return weight_memory diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py index 57bd7c2936..742e828aa9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_dict.py @@ -67,7 +67,7 @@ class SubNetWithDict(fluid.dygraph.Layer): cache["k"], cache["v"] = k, v weight = fluid.layers.matmul(x=q, y=k, transpose_y=True) - weight = fluid.layers.softmax(weight) + weight = paddle.nn.functional.softmax(weight) out = fluid.layers.matmul(weight, v) return out @@ -113,7 +113,7 @@ class MainNetWithDict(fluid.dygraph.Layer): # Test to call function defined outside of class. 
def update_cache(cache): for k, val in cache.items(): - cache[k] = fluid.layers.softmax(val) + cache[k] = paddle.nn.functional.softmax(val) return cache diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py index c17bfd2508..8cc543a19f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse.py @@ -308,7 +308,7 @@ class NetWithExternalFunc(fluid.dygraph.Layer): # Test to call function behind caller. def softmax(x): - return fluid.layers.softmax(x) + return paddle.nn.functional.softmax(x) class TestNetWithExternalFunc(TestDygraphIfElseNet): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index 4c5e306718..8358c12edc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -535,7 +535,7 @@ def train_mobilenet(args, to_static): out = net(img) t_end = time.time() - softmax_out = fluid.layers.softmax(out, use_cudnn=False) + softmax_out = paddle.nn.functional.softmax(out) loss = fluid.layers.cross_entropy( input=softmax_out, label=label ) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py index b98d9c304d..13aace2003 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py @@ -48,7 +48,7 @@ class Policy(Layer): x = fluid.layers.relu(x) action_scores = self.affine2(x) - log_prob = fluid.layers.softmax(action_scores, axis=1) + log_prob = paddle.nn.functional.softmax(action_scores, axis=1) return log_prob diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index 109fc99754..70ee21713c 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -343,7 +343,7 @@ class SeResNeXt(fluid.dygraph.Layer): y = paddle.reshape(y, shape=[-1, self.pool2d_avg_output]) out = self.out(y) - softmax_out = fluid.layers.softmax(out) + softmax_out = paddle.nn.functional.softmax(out) loss = fluid.layers.cross_entropy(input=softmax_out, label=label) avg_loss = paddle.mean(x=loss) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py index f4c0815884..e6f03170b4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/transformer_dygraph_model.py @@ -153,7 +153,7 @@ class MultiHeadAttention(Layer): ) if attn_bias is not None: product += attn_bias - weights = layers.softmax(product) + weights = paddle.nn.functional.softmax(product) if self.dropout_rate: weights = layers.dropout(weights, dropout_prob=self.dropout_rate) out = layers.matmul(weights, v) @@ -840,7 +840,7 @@ class Transformer(Layer): ) caches = map_structure(split_batch_beams, caches) step_log_probs = split_batch_beams( - 
paddle.log(fluid.layers.softmax(logits)) + paddle.log(paddle.nn.functional.softmax(logits)) ) step_log_probs = mask_probs( diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py index 8a13e5abb5..f685eac6d3 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_fp16_ipu.py @@ -33,7 +33,7 @@ class SimpleLayer(paddle.nn.Layer): x = self.conv(x) x = paddle.flatten(x, 1, -1) if target is not None: - x = paddle.fluid.layers.softmax(x) + x = paddle.nn.functional.softmax(x) loss = paddle.fluid.layers.cross_entropy(x, target) if self.use_ipu: loss = paddle.incubate.identity_loss(loss, 1) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py index dbdfab2882..4ca2599217 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_dy2static_ipu.py @@ -48,7 +48,7 @@ class SimpleLayer(paddle.nn.Layer): x = paddle.flatten(x, 1, -1) if target is not None: if self.use_softmax: - x = paddle.fluid.layers.softmax(x) + x = paddle.nn.functional.softmax(x) if self.loss_op: loss = self.loss_op(x, target) else: diff --git a/python/paddle/fluid/tests/unittests/ipu/test_modelruntime_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_modelruntime_ipu.py index 9fda7f780e..2e13687df1 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_modelruntime_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_modelruntime_ipu.py @@ -32,7 +32,7 @@ class SimpleLayer(paddle.nn.Layer): x = self.conv(x) x = paddle.flatten(x, 1, -1) if target is not None: - x = paddle.fluid.layers.softmax(x) + x = paddle.nn.functional.softmax(x) loss = paddle.fluid.layers.cross_entropy(x, target) return x, loss return x diff --git a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py index ccf0a38bbf..782c195c5d 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_print_op_ipu.py @@ -119,7 +119,7 @@ class SimpleLayer(paddle.nn.Layer): print(x) x = paddle.flatten(x, 1, -1) if target is not None: - x = paddle.fluid.layers.softmax(x) + x = paddle.nn.functional.softmax(x) loss = paddle.fluid.layers.cross_entropy(x, target) loss = paddle.incubate.identity_loss(loss, 1) return x, loss diff --git a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py index 485515d7d7..53c7e1ad92 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_softmax_op_ipu.py @@ -47,7 +47,7 @@ class TestBase(IPUOpTest): x = paddle.static.data( name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32' ) - out = paddle.fluid.layers.softmax(x, **self.attrs) + out = paddle.nn.functional.softmax(x, **self.attrs) self.fetch_list = [out.name] def run_model(self, exec_mode): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_inplace_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_inplace_fuse_pass.py index 47668a42ec..386dcf7b40 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_inplace_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_inplace_fuse_pass.py 
@@ -32,7 +32,7 @@ class MkldnnInplacePassTest(InferencePassTest): conv_out_1 = fluid.layers.conv2d( data, num_filters=3, filter_size=3, bias_attr=False ) - softmax_out = fluid.layers.softmax(conv_out_1) + softmax_out = paddle.nn.functional.softmax(conv_out_1) relu_out = fluid.layers.relu(conv_out_1) eltwise_out = fluid.layers.elementwise_add( softmax_out, relu_out, axis=-1 diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 3597f11c55..29393ff96c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -77,7 +77,7 @@ class TensorRTSubgraphPassRelu6Test(TensorRTSubgraphPassActivationTest): class TensorRTSubgraphPassSoftMaxTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): - return fluid.layers.softmax(x) + return paddle.nn.functional.softmax(x) class TensorRTSubgraphPassSigmoidTest(TensorRTSubgraphPassActivationTest): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py index 3f5daf0d92..a3b297a268 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_fc_fuse_pass.py @@ -17,6 +17,7 @@ import unittest import numpy as np from inference_pass_test import InferencePassTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import AnalysisConfig @@ -31,7 +32,7 @@ class FCFusePassTRTTest(InferencePassTest): fc_out1 = fluid.layers.fc( input=data, size=128, num_flatten_dims=1, act="relu" ) - out = fluid.layers.softmax(input=fc_out1) + out = paddle.nn.functional.softmax(fc_out1) self.feeds = { "data": np.random.random((32, 128, 2, 2)).astype("float32") @@ -61,7 +62,7 @@ class FCFusePassTRTStaticDims4Cols1Test(InferencePassTest): fc_out1 = fluid.layers.fc( input=data, size=64, num_flatten_dims=1, act="relu" ) - out = fluid.layers.softmax(input=fc_out1) + out = paddle.nn.functional.softmax(fc_out1) self.feeds = { "data": np.random.random((32, 128, 32, 8)).astype("float32") @@ -89,7 +90,7 @@ class FCFusePassTRTStaticDims4Cols2Test(InferencePassTest): fc_out1 = fluid.layers.fc( input=data, size=32, num_flatten_dims=2, act="relu" ) - out = fluid.layers.softmax(input=fc_out1) + out = paddle.nn.functional.softmax(fc_out1) self.feeds = { "data": np.random.random((3, 24, 16, 16)).astype("float32") @@ -115,7 +116,7 @@ class FCFusePassTRTDynamicDims2Test(InferencePassTest): fc_out1 = fluid.layers.fc( input=data, size=64, num_flatten_dims=1, act="relu" ) - out = fluid.layers.softmax(input=fc_out1) + out = paddle.nn.functional.softmax(fc_out1) self.feeds = {"data": np.random.random((32, 128)).astype("float32")} self.enable_trt = True @@ -147,7 +148,7 @@ class FCFusePassTRTDynamicDims3Cols1Test(InferencePassTest): fc_out1 = fluid.layers.fc( input=data, size=64, num_flatten_dims=1, act="relu" ) - out = fluid.layers.softmax(input=fc_out1) + out = paddle.nn.functional.softmax(fc_out1) self.feeds = {"data": np.random.random((32, 128, 32)).astype("float32")} self.enable_trt = True @@ -179,7 +180,7 @@ class FCFusePassTRTDynamicDims3Cols2Test(InferencePassTest): fc_out1 = fluid.layers.fc( input=data, size=64, num_flatten_dims=2, act="relu" ) - out = fluid.layers.softmax(input=fc_out1) + out = 
paddle.nn.functional.softmax(fc_out1) self.feeds = {"data": np.random.random((32, 128, 32)).astype("float32")} self.enable_trt = True @@ -213,7 +214,7 @@ class FCFusePassTRTDynamicDims4Cols1Test(InferencePassTest): fc_out1 = fluid.layers.fc( input=data, size=64, num_flatten_dims=1, act="relu" ) - out = fluid.layers.softmax(input=fc_out1) + out = paddle.nn.functional.softmax(fc_out1) self.feeds = { "data": np.random.random((32, 12, 4, 6)).astype("float32") @@ -249,7 +250,7 @@ class FCFusePassTRTDynamicDims4Cols2Test(InferencePassTest): fc_out1 = fluid.layers.fc( input=data, size=64, num_flatten_dims=2, act="relu" ) - out = fluid.layers.softmax(input=fc_out1) + out = paddle.nn.functional.softmax(fc_out1) self.feeds = { "data": np.random.random((32, 128, 32, 32)).astype("float32") @@ -285,7 +286,7 @@ class FCFusePassTRTDynamicDims4Cols3Test(InferencePassTest): fc_out1 = fluid.layers.fc( input=data, size=64, num_flatten_dims=3, act="relu" ) - out = fluid.layers.softmax(input=fc_out1) + out = paddle.nn.functional.softmax(fc_out1) self.feeds = { "data": np.random.random((32, 128, 32, 32)).astype("float32") diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py index c8b01107eb..3b73ae0744 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_gather_op.py @@ -30,7 +30,7 @@ class TRTGatherTest1(InferencePassTest): data = fluid.data(name='data', shape=[-1, 128], dtype='float32') index = fluid.data(name='index', shape=[-1, 1], dtype='int32') scale_out = paddle.gather(data, index=index) - out = fluid.layers.softmax(input=scale_out) + out = paddle.nn.functional.softmax(scale_out) self.feeds = { "data": np.random.random([self.bs, 128]).astype("float32"), @@ -69,7 +69,7 @@ class TRTGatherTest2(InferencePassTest): data = fluid.data(name='data', shape=[16, 64], dtype='float32') index = fluid.data(name='index', shape=[2], dtype='int32') scale_out = paddle.gather(data, index=index) - out = fluid.layers.softmax(input=scale_out) + out = paddle.nn.functional.softmax(scale_out) self.feeds = { "data": np.random.random([self.bs, 64]).astype("float32"), diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_fc_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_fc_fuse_pass.py index 5659ecf3b4..3e958d9d19 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_ir_fc_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_fc_fuse_pass.py @@ -17,6 +17,7 @@ import unittest import numpy as np from pass_test import PassTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -31,7 +32,7 @@ class FCFusePassTest(PassTest): input=data, size=128, num_flatten_dims=1, act="relu" ) tmp_1 = fluid.layers.fc(input=tmp_0, size=32, num_flatten_dims=1) - tmp_2 = fluid.layers.softmax(input=tmp_1) + tmp_2 = paddle.nn.functional.softmax(tmp_1) self.feeds = {"data": np.random.random((32, 128)).astype("float32")} self.fetch_list = [tmp_0, tmp_1, tmp_2] diff --git a/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py index 41ccda3dba..2ad4b930f0 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_softmax_op_npu.py @@ -79,7 +79,7 @@ class TestSoftmaxNet(unittest.TestCase): prediction = fluid.layers.fc(input=fc_1, size=2) # 4 x 2 - prob = 
fluid.layers.softmax(prediction, axis=1) + prob = paddle.nn.functional.softmax(prediction, axis=1) cost = fluid.layers.cross_entropy(input=prob, label=label) loss = paddle.mean(cost) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 657774b729..8c46a64162 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -310,7 +310,7 @@ class SimpleAttention(fluid.dygraph.Layer): shape=[attention_weight.shape[0], attention_weight.shape[1]], ) - weights_reshape = fluid.layers.softmax(weights_reshape) + weights_reshape = paddle.nn.functional.softmax(weights_reshape) scaled = fluid.layers.elementwise_mul( x=encoder_vec, y=weights_reshape, axis=0 ) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py index dfbaae4926..06982a0fc3 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py @@ -41,7 +41,7 @@ class Policy(fluid.dygraph.Layer): x = fluid.layers.dropout(x, self.dropout_ratio) x = fluid.layers.relu(x) action_scores = self.affine2(x) - return fluid.layers.softmax(action_scores, axis=1) + return paddle.nn.functional.softmax(action_scores, axis=1) class TestImperativeMnist(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index 95f912d822..6eb5ab1874 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -376,7 +376,7 @@ class TestImperativeResneXt(unittest.TestCase): label.stop_gradient = True out = se_resnext(img) - softmax_out = fluid.layers.softmax(out, use_cudnn=False) + softmax_out = paddle.nn.functional.softmax(out) loss = fluid.layers.cross_entropy( input=softmax_out, label=label ) @@ -456,7 +456,7 @@ class TestImperativeResneXt(unittest.TestCase): ) label = fluid.layers.data(name='label', shape=[1], dtype='int64') out = se_resnext(img) - softmax_out = fluid.layers.softmax(out, use_cudnn=False) + softmax_out = paddle.nn.functional.softmax(out) loss = fluid.layers.cross_entropy(input=softmax_out, label=label) avg_loss = paddle.mean(x=loss) optimizer.minimize(avg_loss) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index e850905141..a88c31dd3f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -503,7 +503,7 @@ class MultiHeadAttentionLayer(Layer): ) if attn_bias is not None: product += attn_bias - weights = fluid.layers.softmax(product) + weights = paddle.nn.functional.softmax(product) if self._dropout_rate: weights_droped = fluid.layers.dropout( weights, @@ -1013,7 +1013,7 @@ class WrapDecoderLayer(Layer): if dec_inputs is None: # Return probs for independent decoder program.
- predict_out = fluid.layers.softmax(predict) + predict_out = paddle.nn.functional.softmax(predict) return predict_out return predict diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 67cfdfeceb..25b6d0513d 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -2748,7 +2748,7 @@ class TestLayer(LayerTest): data = fluid.data(name="input", shape=[-1, 32, 32], dtype="float32") label = fluid.data(name="label", shape=[-1, 1], dtype="int") fc_out = fluid.layers.fc(input=data, size=10) - predict = fluid.layers.softmax(input=fc_out) + predict = paddle.nn.functional.softmax(fc_out) result = paddle.static.accuracy(input=predict, label=label, k=5) place = fluid.CPUPlace() exe = fluid.Executor(place) @@ -2764,7 +2764,7 @@ class TestLayer(LayerTest): data = base.to_variable(x) label = base.to_variable(y) fc_out = fluid.layers.fc(data, size=10) - predict = fluid.layers.softmax(fc_out) + predict = paddle.nn.functional.softmax(fc_out) dynamic_out = paddle.static.accuracy( input=predict, label=label, k=5 ) @@ -3056,7 +3056,7 @@ class TestBook(LayerTest): ): data = self._get_data(name='data', shape=[10], dtype='float32') hid = layers.fc(input=data, size=20) - return layers.softmax(hid, axis=1) + return paddle.nn.functional.softmax(hid, axis=1) @prog_scope() def make_nce(self): diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index 83f07bf747..33f95b439c 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -89,7 +89,7 @@ class TestMeanOpError(unittest.TestCase): input3 = fluid.layers.data( name='input3', shape=[4], dtype="float16" ) - fluid.layers.softmax(input3) + paddle.nn.functional.softmax(input3) @unittest.skipIf( diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index 6e01ee1d4f..2b06de33f2 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -617,7 +617,7 @@ class RecurrentOpSubBlockTest(RecurrentOpTest1): def dot_attention(query, memory): attn = layers.matmul(query, memory, transpose_y=True) - weight = layers.softmax(attn) + weight = paddle.nn.functional.softmax(attn) weight_memory = layers.matmul(weight, memory) return weight_memory, weight diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index a557fb9df0..3b3539c486 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -76,7 +76,7 @@ class DecoderCell(layers.RNNCell): ) if encoder_padding_mask is not None: attn_scores = paddle.add(attn_scores, encoder_padding_mask) - attn_scores = layers.softmax(attn_scores) + attn_scores = paddle.nn.functional.softmax(attn_scores) attn_out = paddle.squeeze( layers.matmul(attn_scores, encoder_output), [1] ) @@ -295,7 +295,7 @@ class Seq2SeqModel: decoder_output.sample_ids, dec_seq_lengths, ) - probs = layers.softmax(logits) + probs = paddle.nn.functional.softmax(logits) return probs, samples, sample_length diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py index 6021256f69..22f759b46f 100644 --- 
a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py @@ -99,7 +99,7 @@ class TestMeanOpError(unittest.TestCase): input3 = fluid.layers.data( name='input3', shape=[4], dtype="float16" ) - fluid.layers.softmax(input3) + paddle.nn.functional.softmax(input3) support_types = get_xpu_op_support_types('mean') -- GitLab
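Migration sketch (illustrative only; the input tensor below is made up, while the API substitution mirrors the call-site changes in this patch): callers of the removed fluid.layers.softmax switch to paddle.nn.functional.softmax, which takes the tensor as a positional argument (no input= keyword) and accepts only axis, dtype, and name, so use_cudnn is simply dropped.

    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor([[1.0, 2.0, 3.0],
                          [4.0, 5.0, 6.0]], dtype='float32')

    # Before this patch (removed API):
    #   out = fluid.layers.softmax(x, use_cudnn=True, axis=-1)
    # After this patch; use_cudnn has no counterpart in F.softmax:
    out = F.softmax(x, axis=-1)
    print(out)  # rows sum to 1 along the last axis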