Unverified commit a3ae080a, authored by Charles-hit, committed by GitHub

remove softmax api from fluid (#48388)

* move softmax to paddle2.0

* fix some bugs

* resolve conflict

* remove some code

* modify code style

* fix bugs

* fix code

* fix move code

* fix some bugs

* fix code

* fix some code

* modify the header file

* fix bugs

* fix some examples

* fix mish example

* fix code
Parent ea5ca555
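The diff below switches every remaining caller from the removed `fluid.layers.softmax` to the Paddle 2.0 API. A minimal migration sketch (the tensor name is hypothetical; note that `paddle.nn.functional.softmax` takes the tensor positionally and has no `use_cudnn` flag, since backend selection is handled internally):

```python
import paddle
import paddle.nn.functional as F

logits = paddle.rand([2, 3, 4])  # hypothetical input tensor

# Before (fluid 1.x API removed by this PR):
#   probs = fluid.layers.softmax(logits, use_cudnn=False, axis=-1)
# After (Paddle 2.0 API):
probs = F.softmax(logits, axis=-1)
```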
@@ -626,7 +626,7 @@ def detection_output(
         target_box=loc,
         code_type='decode_center_size',
     )
-    scores = nn.softmax(input=scores)
+    scores = paddle.nn.functional.softmax(scores)
    scores = paddle.transpose(scores, perm=[0, 2, 1])
    scores.stop_gradient = True
    nmsed_outs = helper.create_variable_for_type_inference(
......
@@ -68,7 +68,6 @@ __all__ = [
     'linear_chain_crf',
     'crf_decoding',
     'conv2d',
-    'softmax',
     'pool2d',
     'batch_norm',
     'dropout',
@@ -145,7 +144,7 @@ def _get_reduce_dim(dim, input):
     else:
         raise TypeError(
             "The type of dim must be int, list, tuple or range, but received {}".format(
-                type(axis)
+                type(dim)
             )
         )
     if dim is None:
@@ -1123,147 +1122,6 @@ def dropout(
     return out
 
-
-@deprecated(since="2.0.0", update_to="paddle.nn.functional.softmax")
-def softmax(input, use_cudnn=True, name=None, axis=-1):
-    r"""
-    This operator implements the softmax layer. The calculation process is as follows:
-
-    1. The dimension :attr:`axis` of the ``input`` will be permuted to the last.
-
-    2. Then the input tensor will be logically flattened to a 2-D matrix. The matrix's
-    second dimension(row length) is the same as the dimension :attr:`axis` of the input
-    tensor, and the first dimension(column length) is the product of all other
-    dimensions of the input tensor. For each row of the matrix, the softmax operator
-    squashes the K-dimensional(K is the width of the matrix, which is also the size
-    of the input tensor's dimension :attr:`axis`) vector of arbitrary real values to a
-    K-dimensional vector of real values in the range [0, 1] that add up to 1.
-
-    3. After the softmax operation is completed, the inverse operations of steps 1 and 2
-    are performed to restore the two-dimensional matrix to the same dimension as the ``input``.
-
-    It computes the exponential of the given dimension and the sum of exponential
-    values of all the other dimensions in the K-dimensional vector input.
-    Then the ratio of the exponential of the given dimension and the sum of
-    exponential values of all the other dimensions is the output of the softmax
-    operator.
-
-    For each row :math:`i` and each column :math:`j` in the matrix, we have:
-
-    .. math::
-
-        Out[i, j] = \\frac{\\exp(X[i, j])}{\\sum_j(exp(X[i, j])}
-
-    Example:
-
-    .. code-block:: text
-
-        Case 1:
-          Input:
-            X.shape = [2, 3, 4]
-            X.data = [[[2.0, 3.0, 4.0, 5.0],
-                       [3.0, 4.0, 5.0, 6.0],
-                       [7.0, 8.0, 8.0, 9.0]],
-                      [[1.0, 2.0, 3.0, 4.0],
-                       [5.0, 6.0, 7.0, 8.0],
-                       [6.0, 7.0, 8.0, 9.0]]]
-          Attrs:
-            axis = -1
-          Output:
-            Out.shape = [2, 3, 4]
-            Out.data = [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
-                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
-                         [0.07232949, 0.19661193, 0.19661193, 0.53444665]],
-                        [[0.0320586 , 0.08714432, 0.23688282, 0.64391426],
-                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426],
-                         [0.0320586 , 0.08714432, 0.23688282, 0.64391426]]]
-
-        Case 2:
-          Input:
-            X.shape = [2, 3, 4]
-            X.data = [[[2.0, 3.0, 4.0, 5.0],
-                       [3.0, 4.0, 5.0, 6.0],
-                       [7.0, 8.0, 8.0, 9.0]],
-                      [[1.0, 2.0, 3.0, 4.0],
-                       [5.0, 6.0, 7.0, 8.0],
-                       [6.0, 7.0, 8.0, 9.0]]]
-          Attrs:
-            axis = 1
-          Output:
-            Out.shape = [2, 3, 4]
-            Out.data = [[[0.00657326, 0.00657326, 0.01714783, 0.01714783],
-                         [0.01786798, 0.01786798, 0.04661262, 0.04661262],
-                         [0.97555875, 0.97555875, 0.93623955, 0.93623955]],
-                        [[0.00490169, 0.00490169, 0.00490169, 0.00490169],
-                         [0.26762315, 0.26762315, 0.26762315, 0.26762315],
-                         [0.72747516, 0.72747516, 0.72747516, 0.72747516]]]
-
-    Args:
-        input (Tensor): The input tensor. A multi-dimension ``Tensor`` with type float32 or float64.
-        use_cudnn (bool, optional): Use cudnn kernel or not, it is valid only when the cudnn \
-            library is installed. To improve performance, set use_cudnn to True by default.
-        name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Default: None.
-            will be named automatically. Default: None.
-        axis (int, optional): The index of dimension to perform softmax calculations, it should
-            be in range :math:`[-1, rank - 1]`, while :math:`rank` is the rank of
-            input tensor. Default: -1. -1 means the last dimension.
-
-    Returns:
-        Tensor: ``Tensor`` indicates the output of softmax. The data type and shape are the same as ``input`` .
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle
-            import paddle.nn.functional as F
-
-            x = paddle.to_tensor([[[2.0, 3.0, 4.0, 5.0],
-                                   [3.0, 4.0, 5.0, 6.0],
-                                   [7.0, 8.0, 8.0, 9.0]],
-                                  [[1.0, 2.0, 3.0, 4.0],
-                                   [5.0, 6.0, 7.0, 8.0],
-                                   [6.0, 7.0, 8.0, 9.0]]], dtype='float32')
-            y = F.softmax(x, axis=1)
-            print(y)
-            # [[[0.00657326, 0.00657326, 0.01714783, 0.01714783],
-            #   [0.01786798, 0.01786798, 0.04661262, 0.04661262],
-            #   [0.97555870, 0.97555870, 0.93623954, 0.93623954]],
-            #  [[0.00490169, 0.00490169, 0.00490169, 0.00490169],
-            #   [0.26762316, 0.26762316, 0.26762316, 0.26762316],
-            #   [0.72747517, 0.72747517, 0.72747517, 0.72747517]]]
-    """
-    if in_dygraph_mode():
-        return _C_ops.softmax(input, axis)
-
-    if _non_static_mode():
-        return _legacy_C_ops.softmax(
-            input, 'axis', axis, 'use_cudnn', use_cudnn
-        )
-
-    inputs = {"X": [input]}
-    attrs = {"axis": axis, "use_cudnn": use_cudnn}
-
-    helper = LayerHelper('softmax', **locals())
-    check_variable_and_dtype(
-        input, 'input/x', ['float16', 'float32', 'float64'], 'softmax'
-    )
-
-    dtype = helper.input_dtype()
-    softmax_out = helper.create_variable_for_type_inference(dtype)
-    helper.append_op(
-        type="softmax",
-        inputs={"X": input},
-        outputs={"Out": softmax_out},
-        attrs=attrs,
-    )
-
-    return softmax_out
-
-
 def conv2d(
     input,
     num_filters,
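For reference, the docstring removed above describes softmax along `axis` as: permute `axis` to the last dimension, flatten to a 2-D matrix, apply a row-wise softmax, then undo the permutation. A small self-check of that equivalence against the 2.0 API (not part of this commit; assumes Paddle >= 2.0 is installed):

```python
import paddle
import paddle.nn.functional as F

x = paddle.to_tensor(
    [[[2.0, 3.0, 4.0, 5.0],
      [3.0, 4.0, 5.0, 6.0],
      [7.0, 8.0, 8.0, 9.0]]]
)

# Direct softmax over axis=1.
direct = F.softmax(x, axis=1)

# Reference path from the removed docstring: move axis 1 to the end,
# apply softmax on the last axis, then move it back.
moved = paddle.transpose(x, perm=[0, 2, 1])
reference = paddle.transpose(F.softmax(moved, axis=-1), perm=[0, 2, 1])

print(bool(paddle.allclose(direct, reference)))  # True
```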
@@ -1788,7 +1646,7 @@ def pool2d(
         if pool_padding == "VALID":
             padding_algorithm = "VALID"
             pool_padding = [0, 0]
-            if ceil_mode != False:
+            if ceil_mode is not False:
                 raise ValueError(
                     "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. "
                     "Received ceil_mode: True."
@@ -6643,7 +6501,7 @@ def deformable_roi_pooling(
     )
     input_channels = input.shape[1]
-    if position_sensitive == False:
+    if position_sensitive is False:
         output_channels = input_channels
     else:
         output_channels = input_channels / pooled_height / pooled_width
@@ -6867,9 +6725,11 @@ def mish(x, threshold=20, name=None):
         .. code-block:: python
 
+            import paddle
             import paddle.fluid as fluid
             import numpy as np
 
+            paddle.enable_static()
             DATATYPE='float32'
 
             x_data = np.array([i for i in range(1,5)]).reshape([1,1,4]).astype(DATATYPE)
......
@@ -1304,7 +1304,7 @@ class BeamSearchDecoder(Decoder):
             self.noend_mask_tensor, "float64"
         )
-        step_log_probs = paddle.log(nn.softmax(logits))
+        step_log_probs = paddle.log(paddle.nn.functional.softmax(logits))
         step_log_probs = self._mask_probs(step_log_probs, beam_state.finished)
         log_probs = nn.elementwise_add(
             x=step_log_probs, y=beam_state.log_probs, axis=0
@@ -2330,7 +2330,7 @@ class SampleEmbeddingHelper(GreedyEmbeddingHelper):
             if self.softmax_temperature is not None
             else outputs
         )
-        probs = nn.softmax(logits)
+        probs = paddle.nn.functional.softmax(logits)
         # TODO: remove this stop_gradient. The stop_gradient of sample_ids can
         # not pass to probs, since sampling_id op does not have corresponding
         # grad op and thus can not pass.
......
@@ -354,7 +354,7 @@ class TestSeResNeXt(TestParallelDyGraphRunnerBase):
         label.stop_gradient = True
         out = model(img)
-        softmax_out = fluid.layers.softmax(out, use_cudnn=False)
+        softmax_out = paddle.nn.functional.softmax(out)
         loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
         avg_loss = paddle.mean(x=loss)
         return avg_loss
......
@@ -342,7 +342,7 @@ class MultiHeadAttentionLayer(Layer):
             )
         if attn_bias is not None:
             product += attn_bias
-        weights = fluid.layers.softmax(product)
+        weights = paddle.nn.functional.softmax(product)
         if self._dropout_rate:
             weights_droped = fluid.layers.dropout(
                 weights,
@@ -849,7 +849,7 @@ class WrapDecoderLayer(Layer):
         if dec_inputs is None:
             # Return probs for independent decoder program.
-            predict_out = fluid.layers.softmax(predict)
+            predict_out = paddle.nn.functional.softmax(predict)
             return predict_out
         return predict
......
@@ -1177,7 +1177,7 @@ def multi_head_attention(
         product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
         if attn_bias:
             product += attn_bias
-        weights = layers.softmax(product)
+        weights = paddle.nn.functional.softmax(product)
         if dropout_rate:
             weights = layers.dropout(
                 weights,
@@ -1715,7 +1715,7 @@ def wrap_decoder(
         bias_attr=const_bias_attr,
     )
     if dec_inputs is None:
-        predict = layers.softmax(predict)
+        predict = paddle.nn.functional.softmax(predict)
     return predict
@@ -1834,7 +1834,7 @@ def fast_decode(
             logits = paddle.reshape(logits, (-1, trg_vocab_size))
             topk_scores, topk_indices = layers.topk(
-                input=layers.softmax(logits), k=beam_size
+                input=paddle.nn.functional.softmax(logits), k=beam_size
             )
             accu_scores = layers.elementwise_add(
                 x=paddle.log(topk_scores),
......
@@ -435,7 +435,9 @@ class BaseModel(fluid.dygraph.Layer):
             cell_outputs = self._split_batch_beams(step_input)
             cell_outputs = self.fc(cell_outputs)
-            step_log_probs = paddle.log(fluid.layers.softmax(cell_outputs))
+            step_log_probs = paddle.log(
+                paddle.nn.functional.softmax(cell_outputs)
+            )
             noend_array = [-self.kinf] * self.tar_vocab_size
             noend_array[self.beam_end_token] = 0
             noend_mask_tensor = to_variable(
@@ -703,7 +705,7 @@ class AttentionModel(fluid.dygraph.Layer):
         attn = paddle.transpose(attn, [1, 0, 2])
         attn = paddle.add(attn, mask * 1000000000)
         attn = paddle.transpose(attn, [1, 0, 2])
-        weight = fluid.layers.softmax(attn)
+        weight = paddle.nn.functional.softmax(attn)
         weight_memory = fluid.layers.matmul(weight, memory)
         return weight_memory
......
@@ -67,7 +67,7 @@ class SubNetWithDict(fluid.dygraph.Layer):
         cache["k"], cache["v"] = k, v
         weight = fluid.layers.matmul(x=q, y=k, transpose_y=True)
-        weight = fluid.layers.softmax(weight)
+        weight = paddle.nn.functional.softmax(weight)
         out = fluid.layers.matmul(weight, v)
         return out
@@ -113,7 +113,7 @@ class MainNetWithDict(fluid.dygraph.Layer):
 # Test to call function defined outside of class.
 def update_cache(cache):
     for k, val in cache.items():
-        cache[k] = fluid.layers.softmax(val)
+        cache[k] = paddle.nn.functional.softmax(val)
     return cache
......
@@ -308,7 +308,7 @@ class NetWithExternalFunc(fluid.dygraph.Layer):
 # Test to call function behind caller.
 def softmax(x):
-    return fluid.layers.softmax(x)
+    return paddle.nn.functional.softmax(x)
 
 class TestNetWithExternalFunc(TestDygraphIfElseNet):
......
@@ -535,7 +535,7 @@ def train_mobilenet(args, to_static):
             out = net(img)
             t_end = time.time()
-            softmax_out = fluid.layers.softmax(out, use_cudnn=False)
+            softmax_out = paddle.nn.functional.softmax(out)
             loss = fluid.layers.cross_entropy(
                 input=softmax_out, label=label
             )
......
@@ -48,7 +48,7 @@ class Policy(Layer):
         x = fluid.layers.relu(x)
         action_scores = self.affine2(x)
-        log_prob = fluid.layers.softmax(action_scores, axis=1)
+        log_prob = paddle.nn.functional.softmax(action_scores, axis=1)
         return log_prob
......
@@ -343,7 +343,7 @@ class SeResNeXt(fluid.dygraph.Layer):
         y = paddle.reshape(y, shape=[-1, self.pool2d_avg_output])
         out = self.out(y)
-        softmax_out = fluid.layers.softmax(out)
+        softmax_out = paddle.nn.functional.softmax(out)
         loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
         avg_loss = paddle.mean(x=loss)
......
@@ -153,7 +153,7 @@ class MultiHeadAttention(Layer):
             )
         if attn_bias is not None:
             product += attn_bias
-        weights = layers.softmax(product)
+        weights = paddle.nn.functional.softmax(product)
         if self.dropout_rate:
             weights = layers.dropout(weights, dropout_prob=self.dropout_rate)
         out = layers.matmul(weights, v)
@@ -840,7 +840,7 @@ class Transformer(Layer):
             )
             caches = map_structure(split_batch_beams, caches)
             step_log_probs = split_batch_beams(
-                paddle.log(fluid.layers.softmax(logits))
+                paddle.log(paddle.nn.functional.softmax(logits))
             )
             step_log_probs = mask_probs(
......
@@ -33,7 +33,7 @@ class SimpleLayer(paddle.nn.Layer):
         x = self.conv(x)
         x = paddle.flatten(x, 1, -1)
         if target is not None:
-            x = paddle.fluid.layers.softmax(x)
+            x = paddle.nn.functional.softmax(x)
             loss = paddle.fluid.layers.cross_entropy(x, target)
             if self.use_ipu:
                 loss = paddle.incubate.identity_loss(loss, 1)
......
@@ -48,7 +48,7 @@ class SimpleLayer(paddle.nn.Layer):
         x = paddle.flatten(x, 1, -1)
         if target is not None:
             if self.use_softmax:
-                x = paddle.fluid.layers.softmax(x)
+                x = paddle.nn.functional.softmax(x)
             if self.loss_op:
                 loss = self.loss_op(x, target)
             else:
......
@@ -32,7 +32,7 @@ class SimpleLayer(paddle.nn.Layer):
         x = self.conv(x)
         x = paddle.flatten(x, 1, -1)
         if target is not None:
-            x = paddle.fluid.layers.softmax(x)
+            x = paddle.nn.functional.softmax(x)
             loss = paddle.fluid.layers.cross_entropy(x, target)
             return x, loss
         return x
......
@@ -119,7 +119,7 @@ class SimpleLayer(paddle.nn.Layer):
         print(x)
         x = paddle.flatten(x, 1, -1)
         if target is not None:
-            x = paddle.fluid.layers.softmax(x)
+            x = paddle.nn.functional.softmax(x)
             loss = paddle.fluid.layers.cross_entropy(x, target)
             loss = paddle.incubate.identity_loss(loss, 1)
             return x, loss
......
@@ -47,7 +47,7 @@ class TestBase(IPUOpTest):
         x = paddle.static.data(
             name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32'
         )
-        out = paddle.fluid.layers.softmax(x, **self.attrs)
+        out = paddle.nn.functional.softmax(x, **self.attrs)
         self.fetch_list = [out.name]
 
     def run_model(self, exec_mode):
......
@@ -32,7 +32,7 @@ class MkldnnInplacePassTest(InferencePassTest):
             conv_out_1 = fluid.layers.conv2d(
                 data, num_filters=3, filter_size=3, bias_attr=False
             )
-            softmax_out = fluid.layers.softmax(conv_out_1)
+            softmax_out = paddle.nn.functional.softmax(conv_out_1)
             relu_out = fluid.layers.relu(conv_out_1)
             eltwise_out = fluid.layers.elementwise_add(
                 softmax_out, relu_out, axis=-1
......
@@ -77,7 +77,7 @@ class TensorRTSubgraphPassRelu6Test(TensorRTSubgraphPassActivationTest):
 class TensorRTSubgraphPassSoftMaxTest(TensorRTSubgraphPassActivationTest):
     def append_act(self, x):
-        return fluid.layers.softmax(x)
+        return paddle.nn.functional.softmax(x)
 
 class TensorRTSubgraphPassSigmoidTest(TensorRTSubgraphPassActivationTest):
......
@@ -17,6 +17,7 @@ import unittest
 import numpy as np
 from inference_pass_test import InferencePassTest
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.core import AnalysisConfig
@@ -31,7 +32,7 @@ class FCFusePassTRTTest(InferencePassTest):
             fc_out1 = fluid.layers.fc(
                 input=data, size=128, num_flatten_dims=1, act="relu"
             )
-            out = fluid.layers.softmax(input=fc_out1)
+            out = paddle.nn.functional.softmax(fc_out1)
         self.feeds = {
             "data": np.random.random((32, 128, 2, 2)).astype("float32")
@@ -61,7 +62,7 @@ class FCFusePassTRTStaticDims4Cols1Test(InferencePassTest):
             fc_out1 = fluid.layers.fc(
                 input=data, size=64, num_flatten_dims=1, act="relu"
             )
-            out = fluid.layers.softmax(input=fc_out1)
+            out = paddle.nn.functional.softmax(fc_out1)
         self.feeds = {
             "data": np.random.random((32, 128, 32, 8)).astype("float32")
@@ -89,7 +90,7 @@ class FCFusePassTRTStaticDims4Cols2Test(InferencePassTest):
             fc_out1 = fluid.layers.fc(
                 input=data, size=32, num_flatten_dims=2, act="relu"
             )
-            out = fluid.layers.softmax(input=fc_out1)
+            out = paddle.nn.functional.softmax(fc_out1)
         self.feeds = {
             "data": np.random.random((3, 24, 16, 16)).astype("float32")
@@ -115,7 +116,7 @@ class FCFusePassTRTDynamicDims2Test(InferencePassTest):
             fc_out1 = fluid.layers.fc(
                 input=data, size=64, num_flatten_dims=1, act="relu"
             )
-            out = fluid.layers.softmax(input=fc_out1)
+            out = paddle.nn.functional.softmax(fc_out1)
         self.feeds = {"data": np.random.random((32, 128)).astype("float32")}
         self.enable_trt = True
@@ -147,7 +148,7 @@ class FCFusePassTRTDynamicDims3Cols1Test(InferencePassTest):
             fc_out1 = fluid.layers.fc(
                 input=data, size=64, num_flatten_dims=1, act="relu"
             )
-            out = fluid.layers.softmax(input=fc_out1)
+            out = paddle.nn.functional.softmax(fc_out1)
         self.feeds = {"data": np.random.random((32, 128, 32)).astype("float32")}
         self.enable_trt = True
@@ -179,7 +180,7 @@ class FCFusePassTRTDynamicDims3Cols2Test(InferencePassTest):
             fc_out1 = fluid.layers.fc(
                 input=data, size=64, num_flatten_dims=2, act="relu"
             )
-            out = fluid.layers.softmax(input=fc_out1)
+            out = paddle.nn.functional.softmax(fc_out1)
         self.feeds = {"data": np.random.random((32, 128, 32)).astype("float32")}
         self.enable_trt = True
@@ -213,7 +214,7 @@ class FCFusePassTRTDynamicDims4Cols1Test(InferencePassTest):
             fc_out1 = fluid.layers.fc(
                 input=data, size=64, num_flatten_dims=1, act="relu"
            )
-            out = fluid.layers.softmax(input=fc_out1)
+            out = paddle.nn.functional.softmax(fc_out1)
         self.feeds = {
             "data": np.random.random((32, 12, 4, 6)).astype("float32")
@@ -249,7 +250,7 @@ class FCFusePassTRTDynamicDims4Cols2Test(InferencePassTest):
             fc_out1 = fluid.layers.fc(
                 input=data, size=64, num_flatten_dims=2, act="relu"
             )
-            out = fluid.layers.softmax(input=fc_out1)
+            out = paddle.nn.functional.softmax(fc_out1)
         self.feeds = {
             "data": np.random.random((32, 128, 32, 32)).astype("float32")
@@ -285,7 +286,7 @@ class FCFusePassTRTDynamicDims4Cols3Test(InferencePassTest):
             fc_out1 = fluid.layers.fc(
                 input=data, size=64, num_flatten_dims=3, act="relu"
             )
-            out = fluid.layers.softmax(input=fc_out1)
+            out = paddle.nn.functional.softmax(fc_out1)
         self.feeds = {
             "data": np.random.random((32, 128, 32, 32)).astype("float32")
......
@@ -30,7 +30,7 @@ class TRTGatherTest1(InferencePassTest):
             data = fluid.data(name='data', shape=[-1, 128], dtype='float32')
             index = fluid.data(name='index', shape=[-1, 1], dtype='int32')
             scale_out = paddle.gather(data, index=index)
-            out = fluid.layers.softmax(input=scale_out)
+            out = paddle.nn.functional.softmax(scale_out)
         self.feeds = {
             "data": np.random.random([self.bs, 128]).astype("float32"),
@@ -69,7 +69,7 @@ class TRTGatherTest2(InferencePassTest):
             data = fluid.data(name='data', shape=[16, 64], dtype='float32')
             index = fluid.data(name='index', shape=[2], dtype='int32')
             scale_out = paddle.gather(data, index=index)
-            out = fluid.layers.softmax(input=scale_out)
+            out = paddle.nn.functional.softmax(scale_out)
         self.feeds = {
             "data": np.random.random([self.bs, 64]).astype("float32"),
......
@@ -17,6 +17,7 @@ import unittest
 import numpy as np
 from pass_test import PassTest
 
+import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -31,7 +32,7 @@ class FCFusePassTest(PassTest):
                 input=data, size=128, num_flatten_dims=1, act="relu"
             )
             tmp_1 = fluid.layers.fc(input=tmp_0, size=32, num_flatten_dims=1)
-            tmp_2 = fluid.layers.softmax(input=tmp_1)
+            tmp_2 = paddle.nn.functional.softmax(tmp_1)
         self.feeds = {"data": np.random.random((32, 128)).astype("float32")}
         self.fetch_list = [tmp_0, tmp_1, tmp_2]
......
@@ -79,7 +79,7 @@ class TestSoftmaxNet(unittest.TestCase):
         prediction = fluid.layers.fc(input=fc_1, size=2)
 
         # 4 x 2
-        prob = fluid.layers.softmax(prediction, axis=1)
+        prob = paddle.nn.functional.softmax(prediction, axis=1)
         cost = fluid.layers.cross_entropy(input=prob, label=label)
         loss = paddle.mean(cost)
......
@@ -310,7 +310,7 @@ class SimpleAttention(fluid.dygraph.Layer):
             shape=[attention_weight.shape[0], attention_weight.shape[1]],
         )
-        weights_reshape = fluid.layers.softmax(weights_reshape)
+        weights_reshape = paddle.nn.functional.softmax(weights_reshape)
         scaled = fluid.layers.elementwise_mul(
             x=encoder_vec, y=weights_reshape, axis=0
         )
......
@@ -41,7 +41,7 @@ class Policy(fluid.dygraph.Layer):
         x = fluid.layers.dropout(x, self.dropout_ratio)
         x = fluid.layers.relu(x)
         action_scores = self.affine2(x)
-        return fluid.layers.softmax(action_scores, axis=1)
+        return paddle.nn.functional.softmax(action_scores, axis=1)
 
 class TestImperativeMnist(unittest.TestCase):
......
@@ -376,7 +376,7 @@ class TestImperativeResneXt(unittest.TestCase):
                 label.stop_gradient = True
                 out = se_resnext(img)
-                softmax_out = fluid.layers.softmax(out, use_cudnn=False)
+                softmax_out = paddle.nn.functional.softmax(out)
                 loss = fluid.layers.cross_entropy(
                     input=softmax_out, label=label
                 )
@@ -456,7 +456,7 @@ class TestImperativeResneXt(unittest.TestCase):
            )
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            out = se_resnext(img)
-           softmax_out = fluid.layers.softmax(out, use_cudnn=False)
+           softmax_out = paddle.nn.functional.softmax(out)
            loss = fluid.layers.cross_entropy(input=softmax_out, label=label)
            avg_loss = paddle.mean(x=loss)
            optimizer.minimize(avg_loss)
......
@@ -503,7 +503,7 @@ class MultiHeadAttentionLayer(Layer):
             )
         if attn_bias is not None:
             product += attn_bias
-        weights = fluid.layers.softmax(product)
+        weights = paddle.nn.functional.softmax(product)
         if self._dropout_rate:
             weights_droped = fluid.layers.dropout(
                 weights,
@@ -1013,7 +1013,7 @@ class WrapDecoderLayer(Layer):
         if dec_inputs is None:
             # Return probs for independent decoder program.
-            predict_out = fluid.layers.softmax(predict)
+            predict_out = paddle.nn.functional.softmax(predict)
             return predict_out
         return predict
......
@@ -2748,7 +2748,7 @@ class TestLayer(LayerTest):
             data = fluid.data(name="input", shape=[-1, 32, 32], dtype="float32")
             label = fluid.data(name="label", shape=[-1, 1], dtype="int")
             fc_out = fluid.layers.fc(input=data, size=10)
-            predict = fluid.layers.softmax(input=fc_out)
+            predict = paddle.nn.functional.softmax(fc_out)
             result = paddle.static.accuracy(input=predict, label=label, k=5)
             place = fluid.CPUPlace()
             exe = fluid.Executor(place)
@@ -2764,7 +2764,7 @@ class TestLayer(LayerTest):
             data = base.to_variable(x)
             label = base.to_variable(y)
             fc_out = fluid.layers.fc(data, size=10)
-            predict = fluid.layers.softmax(fc_out)
+            predict = paddle.nn.functional.softmax(fc_out)
             dynamic_out = paddle.static.accuracy(
                 input=predict, label=label, k=5
             )
@@ -3056,7 +3056,7 @@ class TestBook(LayerTest):
     ):
         data = self._get_data(name='data', shape=[10], dtype='float32')
         hid = layers.fc(input=data, size=20)
-        return layers.softmax(hid, axis=1)
+        return paddle.nn.functional.softmax(hid, axis=1)
 
     @prog_scope()
     def make_nce(self):
......
@@ -89,7 +89,7 @@ class TestMeanOpError(unittest.TestCase):
             input3 = fluid.layers.data(
                 name='input3', shape=[4], dtype="float16"
             )
-            fluid.layers.softmax(input3)
+            paddle.nn.functional.softmax(input3)
 
 @unittest.skipIf(
......
@@ -617,7 +617,7 @@ class RecurrentOpSubBlockTest(RecurrentOpTest1):
         def dot_attention(query, memory):
             attn = layers.matmul(query, memory, transpose_y=True)
-            weight = layers.softmax(attn)
+            weight = paddle.nn.functional.softmax(attn)
             weight_memory = layers.matmul(weight, memory)
             return weight_memory, weight
......
@@ -76,7 +76,7 @@ class DecoderCell(layers.RNNCell):
         )
         if encoder_padding_mask is not None:
             attn_scores = paddle.add(attn_scores, encoder_padding_mask)
-        attn_scores = layers.softmax(attn_scores)
+        attn_scores = paddle.nn.functional.softmax(attn_scores)
         attn_out = paddle.squeeze(
             layers.matmul(attn_scores, encoder_output), [1]
         )
@@ -295,7 +295,7 @@ class Seq2SeqModel:
             decoder_output.sample_ids,
             dec_seq_lengths,
         )
-        probs = layers.softmax(logits)
+        probs = paddle.nn.functional.softmax(logits)
         return probs, samples, sample_length
......
@@ -99,7 +99,7 @@ class TestMeanOpError(unittest.TestCase):
             input3 = fluid.layers.data(
                 name='input3', shape=[4], dtype="float16"
             )
-            fluid.layers.softmax(input3)
+            paddle.nn.functional.softmax(input3)
 
 support_types = get_xpu_op_support_types('mean')
......