Unverified commit 208f625b authored by J JYChen, committed by GitHub

[Fluid Clean] remove apis in fluid.layers.ops (#47867)

* remove apis in fluid.ops

* fix test_activation_nn_grad

* fix circular import error

* fix ops

* fix cos

* fix divide not inplace

* remove lazy-import part
Parent 70589379
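
A minimal sketch of the substitution pattern this commit applies throughout (not part of the diff itself; it assumes Paddle 2.x, where the paddle.* replacements used in the added lines are available):

import paddle

# One-to-one replacements performed in the hunks below:
#   fluid.layers.sqrt(x)      ->  paddle.sqrt(x)
#   fluid.layers.exp(x)       ->  paddle.exp(x)
#   fluid.layers.tanh(x)      ->  paddle.tanh(x)
#   fluid.layers.sigmoid(x)   ->  paddle.nn.functional.sigmoid(x)
#   fluid.layers.softsign(x)  ->  paddle.nn.functional.softsign(x)
#   fluid.layers.gelu(x)      ->  paddle.nn.functional.gelu(x)
# The in-place divide used by ModelAverage is rewritten as an out-of-place
# divide followed by an assign:
#   ops._elementwise_div(x=s, y=t, out=p)
#       ->  paddle.assign(paddle.divide(s, t), output=p)

x = paddle.to_tensor([0.25, 1.0, 4.0])
print(paddle.sqrt(x))                   # [0.5, 1.0, 2.0]
print(paddle.nn.functional.sigmoid(x))  # elementwise sigmoid of x
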
......@@ -449,8 +449,8 @@ class AdaptiveLocalSGDOptimizer(MetaOptimizerBase):
communicate()
self._generate_avg_loss(main_block, loss, avg_loss)
next_local_steps = layers.cast(
layers.ceil(
layers.sqrt(
paddle.ceil(
paddle.sqrt(
lr_0
* avg_loss
/ (global_lr * loss_0)
......
......@@ -68,7 +68,7 @@ class GroupShardedClipGrad:
merge_grad = layers.get_tensor_from_selected_rows(
layers.merge_selected_rows(g)
)
square = layers.square(merge_grad)
square = paddle.square(merge_grad)
sum_square = layers.reduce_sum(square)
if p.dtype == paddle.float16:
......@@ -133,7 +133,7 @@ class GroupShardedClipGrad:
with device_guard(dev_id, "gpu"):
paddle.distributed.all_reduce(global_norm_var, group=self._group)
global_norm_var = layers.sqrt(global_norm_var)
global_norm_var = paddle.sqrt(global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
)
......
......@@ -69,7 +69,7 @@ class ShardingClipGrad:
merge_grad = layers.get_tensor_from_selected_rows(
layers.merge_selected_rows(g)
)
square = layers.square(merge_grad)
square = paddle.square(merge_grad)
sum_square = layers.reduce_sum(square)
if p.dtype == paddle.float16:
......@@ -131,7 +131,7 @@ class ShardingClipGrad:
with device_guard(dev_id, "gpu"):
paddle.distributed.all_reduce(global_norm_var, group=self._group)
global_norm_var = layers.sqrt(global_norm_var)
global_norm_var = paddle.sqrt(global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
)
......
......@@ -17,7 +17,7 @@ import paddle
from paddle.distribution import distribution
from paddle.fluid.data_feeder import check_type, convert_dtype
from paddle.fluid.framework import _non_static_mode
from paddle.fluid.layers import ops, tensor
from paddle.fluid.layers import tensor
from paddle.tensor import multinomial
......@@ -214,8 +214,8 @@ class Categorical(distribution.Distribution):
other_logits = other.logits - paddle.max(
other.logits, axis=-1, keepdim=True
)
e_logits = ops.exp(logits)
other_e_logits = ops.exp(other_logits)
e_logits = paddle.exp(logits)
other_e_logits = paddle.exp(other_logits)
z = paddle.sum(e_logits, axis=-1, keepdim=True)
other_z = paddle.sum(other_e_logits, axis=-1, keepdim=True)
prob = e_logits / z
......@@ -255,7 +255,7 @@ class Categorical(distribution.Distribution):
"""
name = self.name + '_entropy'
logits = self.logits - paddle.max(self.logits, axis=-1, keepdim=True)
e_logits = ops.exp(logits)
e_logits = paddle.exp(logits)
z = paddle.sum(e_logits, axis=-1, keepdim=True)
prob = e_logits / z
......
......@@ -23,7 +23,6 @@ from paddle.fluid.layers import (
elementwise_div,
elementwise_sub,
nn,
ops,
tensor,
)
......@@ -288,7 +287,7 @@ class Normal(distribution.Distribution):
var = self.scale * self.scale
return elementwise_div(
ops.exp(
paddle.exp(
-1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var)
),
(math.sqrt(2 * math.pi) * self.scale),
......
......@@ -72,7 +72,7 @@ def _squared_l2_norm(x):
or x.dtype == core.VarDesc.VarType.FP16
or x.dtype == core.VarDesc.VarType.BF16
):
square = layers.square(x)
square = paddle.square(x)
sum_square = layers.reduce_sum(square)
return sum_square
......@@ -540,7 +540,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
global_norm_var_fp64 = paddle.add_n(sum_square_list)
global_norm_var.append(global_norm_var_fp64)
global_norm_var = paddle.add_n(global_norm_var)
global_norm_var = layers.sqrt(global_norm_var)
global_norm_var = paddle.sqrt(global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
)
......@@ -648,7 +648,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
if len(global_norm_var) > 1
else global_norm_var[0]
)
global_norm_var = layers.sqrt(x=global_norm_var)
global_norm_var = paddle.sqrt(x=global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
)
......@@ -727,7 +727,7 @@ class ClipGradByGlobalNorm(ClipGradBase):
group_scale_name = self.group_name + "_scale"
if group_scale_name not in self.context:
group_norm_var = layers.sums(input=self.context[self.group_name])
group_norm_var = layers.sqrt(x=group_norm_var)
group_norm_var = paddle.sqrt(x=group_norm_var)
clip_var = self.context[self.group_name + "_clip"]
group_scale_var = layers.elementwise_div(
x=clip_var,
......
......@@ -14,6 +14,7 @@
import copy
import paddle
from paddle.fluid import layers, unique_name
from paddle.fluid.dygraph import Layer
from paddle.fluid.dygraph.layer_object_helper import LayerObjectHelper
......@@ -95,8 +96,8 @@ class BasicGRUUnit(Layer):
self._hiden_size = hidden_size
self._param_attr = param_attr
self._bias_attr = bias_attr
self._gate_activation = gate_activation or layers.sigmoid
self._activation = activation or layers.tanh
self._gate_activation = gate_activation or paddle.nn.functional.sigmoid
self._activation = activation or paddle.tanh
self._dtype = dtype
def _build_once(self, input, pre_hidden):
......@@ -845,8 +846,8 @@ class BasicLSTMUnit(Layer):
self._hiden_size = hidden_size
self._param_attr = param_attr
self._bias_attr = bias_attr
self._gate_activation = gate_activation or layers.sigmoid
self._activation = activation or layers.tanh
self._gate_activation = gate_activation or paddle.nn.functional.sigmoid
self._activation = activation or paddle.tanh
self._forget_bias = layers.fill_constant(
[1], dtype=dtype, value=forget_bias
)
......@@ -879,10 +880,14 @@ class BasicLSTMUnit(Layer):
new_cell = layers.elementwise_add(
layers.elementwise_mul(
pre_cell,
layers.sigmoid(layers.elementwise_add(f, self._forget_bias)),
paddle.nn.functional.sigmoid(
layers.elementwise_add(f, self._forget_bias)
),
),
layers.elementwise_mul(
paddle.nn.functional.sigmoid(i), paddle.tanh(j)
),
layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)),
)
new_hidden = layers.tanh(new_cell) * layers.sigmoid(o)
new_hidden = paddle.tanh(new_cell) * paddle.nn.functional.sigmoid(o)
return new_hidden, new_cell
......@@ -17,6 +17,7 @@ import time
import sys
import logging
import paddle
import paddle.fluid as fluid
from ....log_helper import get_logger
......@@ -41,7 +42,9 @@ ZETA = 1.1
def compute_soft_rounding(alpha_v):
return fluid.layers.clip(
fluid.layers.sigmoid(alpha_v) * (ZETA - GAMMA) + GAMMA, min=0, max=1
paddle.nn.functional.sigmoid(alpha_v) * (ZETA - GAMMA) + GAMMA,
min=0,
max=1,
)
......@@ -73,8 +76,7 @@ class AdaRoundLoss:
# calculate regularization term - which ensures parameter to converge to exactly zeros and ones
# at the end of optimization
reg_term = fluid.layers.reduce_sum(
-fluid.layers.pow(fluid.layers.abs(2 * h_v - 1), factor=beta)
+ 1
-fluid.layers.pow(paddle.abs(2 * h_v - 1), factor=beta) + 1
)
# calculate the rounding loss
......
......@@ -82,7 +82,7 @@ def bow_net(
input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]
)
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bow_tanh = fluid.layers.tanh(bow)
bow_tanh = paddle.tanh(bow)
fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
......
......@@ -270,12 +270,10 @@ class NaturalExpDecay(LearningRateDecay):
self.staircase = staircase
def step(self):
from .. import layers
div_res = self.create_lr_var(self.step_num / self.decay_steps)
if self.staircase:
div_res = layers.floor(div_res)
decayed_lr = self.learning_rate * layers.exp(
div_res = paddle.floor(div_res)
decayed_lr = self.learning_rate * paddle.exp(
-1 * self.decay_rate * div_res
)
......@@ -356,11 +354,9 @@ class ExponentialDecay(LearningRateDecay):
self.staircase = staircase
def step(self):
from .. import layers
div_res = self.create_lr_var(self.step_num / self.decay_steps)
if self.staircase:
div_res = layers.floor(div_res)
div_res = paddle.floor(div_res)
decayed_lr = self.learning_rate * (self.decay_rate**div_res)
......@@ -437,11 +433,9 @@ class InverseTimeDecay(LearningRateDecay):
self.staircase = staircase
def step(self):
from .. import layers
div_res = self.create_lr_var(self.step_num / self.decay_steps)
if self.staircase:
div_res = layers.floor(div_res)
div_res = paddle.floor(div_res)
decayed_lr = self.learning_rate / (1 + self.decay_rate * div_res)
......@@ -524,12 +518,10 @@ class PolynomialDecay(LearningRateDecay):
self.cycle = cycle
def step(self):
from .. import layers
tmp_step_num = self.step_num
tmp_decay_steps = self.decay_steps
if self.cycle:
div_res = layers.ceil(
div_res = paddle.ceil(
self.create_lr_var(tmp_step_num / float(self.decay_steps))
)
......@@ -601,15 +593,13 @@ class CosineDecay(LearningRateDecay):
self.epochs = epochs
def step(self):
from .. import layers
cur_epoch = layers.floor(
cur_epoch = paddle.floor(
self.create_lr_var(self.step_num / self.step_each_epoch)
)
decayed_lr = (
self.learning_rate
* 0.5
* (layers.cos(cur_epoch * math.pi / self.epochs) + 1)
* (paddle.cos(cur_epoch * math.pi / self.epochs) + 1)
)
return decayed_lr
......
......@@ -12,10 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from . import Layer
from ..layers import (
sigmoid,
tanh,
concat,
fill_constant,
matmul,
......@@ -139,8 +138,8 @@ class LSTMCell(Layer):
self._param_attr = param_attr
self._bias_attr = bias_attr
self._dtype = dtype
self._gate_activation = gate_activation or sigmoid
self._activation = activation or tanh
self._gate_activation = gate_activation or paddle.nn.functional.sigmoid
self._activation = activation or paddle.tanh
self._use_cudnn_impl = use_cudnn_impl
if self._use_cudnn_impl:
......@@ -254,7 +253,9 @@ class LSTMCell(Layer):
elementwise_add(f, self._forget_bias)
),
),
elementwise_mul(sigmoid(i), tanh(j)),
elementwise_mul(
paddle.nn.functional.sigmoid(i), paddle.tanh(j)
),
)
new_hidden = self._activation(new_cell) * self._gate_activation(o)
......@@ -357,8 +358,8 @@ class GRUCell(Layer):
self._param_attr = param_attr
self._bias_attr = bias_attr
self._dtype = dtype
self._gate_activation = gate_activation or sigmoid
self._activation = activation or tanh
self._gate_activation = gate_activation or paddle.nn.functional.sigmoid
self._activation = activation or paddle.tanh
self._use_cudnn_impl = use_cudnn_impl
if self._use_cudnn_impl:
......
......@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from . import ops
from .ops import *
from . import nn
from .nn import *
from . import io
......@@ -43,7 +41,6 @@ __all__ += nn.__all__
__all__ += io.__all__
__all__ += tensor.__all__
__all__ += control_flow.__all__
__all__ += ops.__all__
__all__ += device.__all__
__all__ += detection.__all__
__all__ += metric_op.__all__
......
......@@ -14,7 +14,7 @@
from ..wrapped_decorator import signature_safe_contextmanager
from .layer_function_generator import autodoc, templatedoc
from .layer_function_generator import templatedoc
from .tensor import assign, cast, fill_constant
from .. import core
from ..framework import (
......
......@@ -17,15 +17,13 @@ All layers just related to the detection neural network.
import paddle
from .layer_function_generator import generate_layer_fn
from .layer_function_generator import autodoc, templatedoc
from .layer_function_generator import templatedoc
from ..layer_helper import LayerHelper
from ..framework import Variable, _non_static_mode, static_only, in_dygraph_mode
from .. import core
from .loss import softmax_with_cross_entropy
from . import tensor
from . import nn
from . import ops
from ..data_feeder import check_variable_and_dtype, check_type, check_dtype
import math
import numpy as np
......
......@@ -14,7 +14,6 @@
from . import control_flow
from . import tensor
from . import ops
from . import nn
import math
import numpy as np
......@@ -535,8 +534,8 @@ class Categorical(Distribution):
other_logits = other.logits - nn.reduce_max(
other.logits, dim=-1, keep_dim=True
)
e_logits = ops.exp(logits)
other_e_logits = ops.exp(other_logits)
e_logits = paddle.exp(logits)
other_e_logits = paddle.exp(other_logits)
z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True)
other_z = nn.reduce_sum(other_e_logits, dim=-1, keep_dim=True)
prob = e_logits / z
......@@ -556,7 +555,7 @@ class Categorical(Distribution):
"""
logits = self.logits - nn.reduce_max(self.logits, dim=-1, keep_dim=True)
e_logits = ops.exp(logits)
e_logits = paddle.exp(logits)
z = nn.reduce_sum(e_logits, dim=-1, keep_dim=True)
prob = e_logits / z
entropy = -1.0 * nn.reduce_sum(
......
......@@ -19,7 +19,6 @@ import threading
from ..data_feeder import DataFeeder
from .control_flow import BlockGuard
from .layer_function_generator import templatedoc
from .. import core
from ..executor import global_scope
from ..framework import (
......
......@@ -45,14 +45,11 @@ __all__ = [
def _convert_(name):
"""
Formatting.
Args:
name: The name/alias
This function takes in a name and converts it to a standard format of
group1_group2. Where as per the regular expression, group1 can have
alphabets and numbers and group2 has capital alphabets.
"""
s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
......@@ -80,10 +77,8 @@ def _generate_doc_string_(
):
"""
Generate docstring by OpProto
Args:
op_proto (framework_pb2.OpProto): a protobuf message typed OpProto
Returns:
str: the document string
"""
......@@ -148,13 +143,10 @@ def _generate_doc_string_(
def generate_layer_fn(op_type):
"""Register the Python layer for an Operator.
Args:
op_type: The name of the operator to be created.
This function takes in the operator type (sigmoid, mean , average etc) and
creates the operator functionality.
"""
op_proto = OpProtoHolder.instance().get_op_proto(op_type)
not_intermediate_outputs = [
......@@ -271,13 +263,10 @@ def generate_layer_fn(op_type):
def generate_activation_fn(op_type):
"""Register the Python layer for an Operator without Attribute.
Args:
op_type: The name of the operator to be created.
This function takes in the operator type (sigmoid, exp , tanh etc) and
creates the operator functionality.
"""
op_proto = OpProtoHolder.instance().get_op_proto(op_type)
......@@ -330,10 +319,8 @@ def generate_activation_fn(op_type):
def generate_inplace_fn(inplace_op_type):
"""Register the Python layer for an Inplace Operator without Attribute.
Args:
inplace_op_type: The name of the inplace operator to be created.
This function takes in the inplace operator type (exp_ , ceil_ etc) and
creates the operator functionality.
"""
......@@ -378,12 +365,10 @@ def templatedoc(op_type=None):
"""
Decorator of layer function. It will use the docstring from the layer
function as the template. The template arguments are:
* ${comment}: The operator comment written in CPP.
* ${{name}_comment}: The comment of ${name} written with AddAttr, AddOutput,
and AddInput. The ${name} is Python snake style. i.e., xxx_xxx.
* ${{name}_type}: The type of ${name}.
Returns:
Decorated function.
"""
......@@ -438,7 +423,6 @@ def templatedoc(op_type=None):
def add_sample_code(func, sample_code):
"""
Append sample code for dynamically generated functions.
Args:
func: The function of the function to be append sample code to.
sample_code: sample code session in rst format.
......
......@@ -26,7 +26,6 @@ import numbers
import paddle
from . import control_flow
from . import nn
from . import ops
from . import tensor
from ..framework import default_main_program, Parameter, unique_name, name_scope
from ..framework import Variable
......@@ -171,7 +170,7 @@ def exponential_decay(learning_rate, decay_steps, decay_rate, staircase=False):
div_res = global_step / decay_steps
if staircase:
div_res = ops.floor(div_res)
div_res = paddle.floor(div_res)
decayed_lr = learning_rate * (decay_rate**div_res)
return decayed_lr
......@@ -233,8 +232,8 @@ def natural_exp_decay(learning_rate, decay_steps, decay_rate, staircase=False):
div_res = global_step / decay_steps
if staircase:
div_res = ops.floor(div_res)
decayed_lr = learning_rate * ops.exp(-1 * decay_rate * div_res)
div_res = paddle.floor(div_res)
decayed_lr = learning_rate * paddle.exp(-1 * decay_rate * div_res)
return decayed_lr
......@@ -293,7 +292,7 @@ def inverse_time_decay(learning_rate, decay_steps, decay_rate, staircase=False):
div_res = global_step / decay_steps
if staircase:
div_res = ops.floor(div_res)
div_res = paddle.floor(div_res)
decayed_lr = learning_rate / (1 + decay_rate * div_res)
......@@ -347,7 +346,7 @@ def polynomial_decay(
global_step = _decay_step_counter()
if cycle:
div_res = ops.ceil(global_step / decay_steps)
div_res = paddle.ceil(global_step / decay_steps)
zero_var = tensor.fill_constant(
shape=[1], dtype='float32', value=0.0
)
......@@ -497,11 +496,11 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
else:
global_step = _decay_step_counter()
cur_epoch = ops.floor(global_step / step_each_epoch)
cur_epoch = paddle.floor(global_step / step_each_epoch)
decayed_lr = (
learning_rate
* 0.5
* (ops.cos(cur_epoch * math.pi / epochs) + 1)
* (paddle.cos(cur_epoch * math.pi / epochs) + 1)
)
return decayed_lr
......
......@@ -1737,7 +1737,6 @@ def kldiv_loss(x, target, reduction='mean', name=None):
return loss
from .ops import square
from .control_flow import equal
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from .layer_function_generator import (
generate_layer_fn,
generate_activation_fn,
generate_inplace_fn,
add_sample_code,
)
from .. import core
from ..framework import convert_np_dtype_to_dtype_, Variable, in_dygraph_mode
from ..data_feeder import (
convert_dtype,
check_variable_and_dtype,
check_type,
check_dtype,
)
from paddle.utils import deprecated
from paddle import _C_ops, _legacy_C_ops
import paddle
__deprecated_func_name__ = {
'tanh_shrink': 'tanhshrink',
'logsigmoid': 'log_sigmoid',
}
__activations_noattr__ = [
'sigmoid',
'silu',
'logsigmoid',
'tanh_shrink',
'softsign',
'tanh',
]
__unary_func__ = [
'exp',
'expm1',
'atan',
'sqrt',
'rsqrt',
'abs',
'ceil',
'floor',
'cos',
'tan',
'acos',
'sin',
'sinh',
'asin',
'cosh',
'round',
'reciprocal',
'square',
'acosh',
'asinh',
'atanh',
'lgamma',
]
__inplace_unary_func__ = [
'exp_',
'sqrt_',
'rsqrt_',
'ceil_',
'floor_',
'round_',
'reciprocal_',
]
__all__ = [
'softplus',
'softshrink',
'hard_shrink',
'cumsum',
'thresholded_relu',
'gelu',
'erf',
]
for _OP in set(__all__):
globals()[_OP] = generate_layer_fn(_OP)
# It is a hot fix in some unittest using:
# fluid.layers.scale(x=x, scale=10.0, out=out_var)
# e.g.: test_program_code.py, test_dist_train.py
globals()['_scale'] = generate_layer_fn('scale')
globals()['_elementwise_div'] = generate_layer_fn('elementwise_div')
__all__ += __activations_noattr__
__all__ += __unary_func__
__all__ += __inplace_unary_func__
for _OP in set(__activations_noattr__):
_new_OP = _OP
if _OP in __deprecated_func_name__:
_new_OP = __deprecated_func_name__[_OP]
_func = generate_activation_fn(_OP)
_func = deprecated(
since="2.0.0", update_to="paddle.nn.functional.%s" % (_new_OP)
)(_func)
globals()[_OP] = _func
for _OP in set(__unary_func__):
_new_OP = _OP
if _OP in __deprecated_func_name__:
_new_OP = __deprecated_func_name__[_OP]
_func = generate_activation_fn(_OP)
_func = deprecated(since="2.0.0", update_to="paddle.%s" % (_new_OP))(_func)
globals()[_OP] = _func
for _OP in set(__inplace_unary_func__):
_new_OP = _OP
if _OP in __deprecated_func_name__:
_new_OP = __deprecated_func_name__[_OP]
_func = generate_inplace_fn(_OP)
_func = deprecated(since="2.0.0", update_to="paddle.%s" % (_new_OP))(_func)
globals()[_OP] = _func
add_sample_code(
globals()["sigmoid"],
r"""
Examples:
.. code-block:: python
import paddle
import paddle.nn.functional as F
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = F.sigmoid(x)
print(out)
# [0.40131234 0.450166 0.52497919 0.57444252]
""",
)
add_sample_code(
globals()["silu"],
r"""
Examples:
.. code-block:: python
import paddle
import paddle.nn.functional as F
x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0])
out = F.silu(x)
print(out)
# [ 0.7310586 1.7615942 2.8577224, 3.9280552 ]
""",
)
add_sample_code(
globals()["logsigmoid"],
r"""
Examples:
.. code-block:: python
import paddle
import paddle.nn.functional as F
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = F.log_sigmoid(x)
print(out)
# [-0.91301525 -0.79813887 -0.64439666 -0.55435524]
""",
)
add_sample_code(
globals()["exp"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.exp(x)
print(out)
# [0.67032005 0.81873075 1.10517092 1.34985881]
""",
)
add_sample_code(
globals()["expm1"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.expm1(x)
print(out)
# [-0.32967997, -0.18126924, 0.10517092, 0.34985882]
""",
)
add_sample_code(
globals()["tanh"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.tanh(x)
print(out)
# [-0.37994896 -0.19737532 0.09966799 0.29131261]
""",
)
add_sample_code(
globals()["atan"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.atan(x)
print(out)
# [-0.38050638 -0.19739556 0.09966865 0.29145679]
""",
)
add_sample_code(
globals()["tanh_shrink"],
r"""
Examples:
.. code-block:: python
import paddle
import paddle.nn.functional as F
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = F.tanhshrink(x)
print(out)
# [-0.020051, -0.00262468, 0.000332005, 0.00868739]
""",
)
add_sample_code(
globals()["sqrt"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4])
out = paddle.sqrt(x)
print(out)
# [0.31622777 0.4472136 0.54772256 0.63245553]
""",
)
add_sample_code(
globals()["rsqrt"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([0.1, 0.2, 0.3, 0.4])
out = paddle.rsqrt(x)
print(out)
# [3.16227766 2.23606798 1.82574186 1.58113883]
""",
)
add_sample_code(
globals()["abs"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.abs(x)
print(out)
# [0.4 0.2 0.1 0.3]
""",
)
add_sample_code(
globals()["ceil"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.ceil(x)
print(out)
# [-0. -0. 1. 1.]
""",
)
add_sample_code(
globals()["floor"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.floor(x)
print(out)
# [-1. -1. 0. 0.]
""",
)
add_sample_code(
globals()["cos"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.cos(x)
print(out)
# [0.92106099 0.98006658 0.99500417 0.95533649]
""",
)
add_sample_code(
globals()["tan"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.tan(x)
print(out)
# [-0.42279324, -0.20271005, 0.10033467, 0.30933627]
""",
)
add_sample_code(
globals()["acos"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.acos(x)
print(out)
# [1.98231317 1.77215425 1.47062891 1.26610367]
""",
)
add_sample_code(
globals()["sin"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.sin(x)
print(out)
# [-0.38941834 -0.19866933 0.09983342 0.29552021]
""",
)
add_sample_code(
globals()["asin"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.asin(x)
print(out)
# [-0.41151685 -0.20135792 0.10016742 0.30469265]
""",
)
add_sample_code(
globals()["cosh"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.cosh(x)
print(out)
# [1.08107237 1.02006676 1.00500417 1.04533851]
""",
)
add_sample_code(
globals()["sinh"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.sinh(x)
print(out)
# [-0.41075233 -0.201336 0.10016675 0.30452029]
""",
)
add_sample_code(
globals()["asinh"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.asinh(x)
print(out)
# [-0.39003533, -0.19869010, 0.09983408, 0.29567307]
""",
)
add_sample_code(
globals()["acosh"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([1., 3., 4., 5.])
out = paddle.acosh(x)
print(out)
# [0. , 1.76274729, 2.06343699, 2.29243159]
""",
)
add_sample_code(
globals()["atanh"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.atanh(x)
print(out)
# [-0.42364895, -0.20273256, 0.10033535, 0.30951962]
""",
)
add_sample_code(
globals()["round"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.5, -0.2, 0.6, 1.5])
out = paddle.round(x)
print(out)
# [-1. -0. 1. 2.]
""",
)
add_sample_code(
globals()["reciprocal"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.reciprocal(x)
print(out)
# [-2.5 -5. 10. 3.33333333]
""",
)
add_sample_code(
globals()["square"],
r"""
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.square(x)
print(out)
# [0.16 0.04 0.01 0.09]
""",
)
_softplus_ = generate_layer_fn('softplus')
def softplus(x, beta: float = 1.0, threshold: float = 20.0, name=None):
check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'softplus')
locals_val = locals().copy()
kwargs = dict()
for name, val in locals_val.items():
if val is not None:
kwargs[name] = val
return _softplus_(**kwargs)
softplus.__doc__ = r"""
:alias_main: paddle.nn.functional.softplus
:alias: paddle.nn.functional.softplus, paddle.nn.functional.activation.softplus
:old_api: paddle.fluid.layers.softplus
:strong:`Softplus Activation Operator`
Equation:
.. math::
out = \\frac{1}{beta} * log(1 + e^{beta * x})
For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.
Args:
x(Tensor): Input of Softplus op, Tensor, dtype: float32 or float64
beta(float, optional): The value of beta for softplus. Default is 1
threshold (float, optional): The value of threshold for softplus. Default is 20
name(str, optional): Name for the operation (optional, default is None)
Returns:
Variable: The output of Softplus op, Tensor, dtype: float32 or float64
Examples:
.. code-block:: python
import paddle
import paddle.nn.functional as F
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = F.softplus(x)
print(out)
# [0.513015, 0.598139, 0.744397, 0.854355]
"""
add_sample_code(
globals()["softsign"],
r"""
Examples:
.. code-block:: python
import paddle
import paddle.nn.functional as F
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = F.softsign(x)
print(out)
# [-0.285714, -0.166667, 0.0909091, 0.230769]
""",
)
_softshrink_ = generate_layer_fn('softshrink')
def softshrink(x, alpha=None):
check_variable_and_dtype(
x, 'x', ['float16', 'float32', 'float64'], 'softshrink'
)
locals_var = locals().copy()
kwargs = dict()
for name, val in locals_var.items():
if val is not None:
if name == 'alpha':
kwargs['lambda'] = val
else:
kwargs[name] = val
return _softshrink_(**kwargs)
softshrink.__doc__ = r"""
:alias_main: paddle.nn.functional.softshrink
:alias: paddle.nn.functional.softshrink,paddle.nn.functional.activation.softshrink
:old_api: paddle.fluid.layers.softshrink
:strong:`Softshrink Activation Operator`
.. math::
out = \\begin{cases}
x - \\alpha, \\text{if } x > \\alpha \\\\
x + \\alpha, \\text{if } x < -\\alpha \\\\
0, \\text{otherwise}
\\end{cases}
Args:
x: Input of Softshrink operator, an N-D Tensor, with data type float32, float64 or float16.
alpha (float): non-negative offset
Returns:
Output of Softshrink operator with the same type of input.
Examples:
.. code-block:: python
import paddle.fluid as fluid
data = fluid.data(name="input", shape=[None, 784])
result = fluid.layers.softshrink(x=data, alpha=0.3)
"""
_hard_shrink_ = generate_layer_fn('hard_shrink')
@deprecated(since="2.0.0", update_to="paddle.nn.functional.hardshrink")
def hard_shrink(x, threshold=None):
check_variable_and_dtype(
x, 'x', ['float16', 'float32', 'float64'], 'hard_shrink'
)
locals_var = locals().copy()
kwargs = dict()
for name, val in locals_var.items():
if val is not None:
kwargs[name] = val
return _hard_shrink_(**kwargs)
hard_shrink.__doc__ = (
_hard_shrink_.__doc__
+ """
Examples:
>>> import paddle.fluid as fluid
>>> data = fluid.layers.data(name="input", shape=[784])
>>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
"""
)
_cum_sum_ = generate_layer_fn('cumsum')
@deprecated(
since="2.0.0",
update_to="paddle.cumsum",
reason="New APIs for Paddle 2.0 are coming.",
)
def cumsum(x, axis=None, exclusive=None, reverse=None):
check_type(x, 'x', (Variable), 'cumsum')
locals_var = locals().copy()
kwargs = dict()
for name, val in locals_var.items():
if val is not None:
kwargs[name] = val
return _cum_sum_(**kwargs)
cumsum.__doc__ = """
:alias_main: paddle.cumsum
:alias: paddle.cumsum,paddle.tensor.cumsum,paddle.tensor.math.cumsum
:old_api: paddle.fluid.layers.cumsum
The cumulative sum of the elements along a given axis. By default, the first element of the result is the same as the first element of the input. If exclusive is true, the first element of the result is 0.
Args:
x (Variable): Input of cumsum operator, the Tensor/LoDTensor needed to be cumsumed.
axis (int, optional): The dimension to accumulate along. -1 means the last dimension. Default is -1.
exclusive (bool, optional): Whether to perform exclusive cumsum. Default is False.
reverse (bool, optional): If true, the cumsum is performed in the reversed direction. Default is False.
Returns:
Variable(Tensor/LoDTensor): The result of cumsum operator, output of cumsum operator.
Examples:
.. code-block:: python
import paddle.fluid as fluid
data = fluid.layers.data(name="input", shape=[32, 784])
result = fluid.layers.cumsum(data, axis=0)
"""
_thresholded_relu_ = generate_layer_fn('thresholded_relu')
def thresholded_relu(x, threshold=None):
check_variable_and_dtype(
x, 'x', ['float16', 'float32', 'float64'], 'thresholded_relu'
)
locals_var = locals().copy()
kwargs = dict()
for name, val in locals_var.items():
if val is not None:
kwargs[name] = val
return _thresholded_relu_(**kwargs)
thresholded_relu.__doc__ = r"""
:alias_main: paddle.nn.functional.thresholded_relu
:alias: paddle.nn.functional.thresholded_relu,paddle.nn.functional.activation.thresholded_relu
:old_api: paddle.fluid.layers.thresholded_relu
:strong:`Thresholded ReLU Activation Operator`
Equation:
.. math::
out = \\begin{cases}
x, &if x > threshold \\\\
0, &otherwise
\\end{cases}
Args:
x(Variable): The input of Thresholded ReLU op, Tensor or LoDTensor, dtype: float32 or float64.
threshold(float, optional): The threshold value. Note that if the arg `threshold` is not set, the threshold in the equation is 1.0.
Returns:
Variable: The output of Thresholded ReLU op, Tensor or LoDTensor, dtype: float32 or float64, the same as the input, shape: the same as the input.
Examples:
.. code-block:: python
# declarative mode
import numpy as np
from paddle import fluid
x = fluid.data(name="x", shape=(-1, 3), dtype="float32")
y = fluid.layers.thresholded_relu(x, threshold=0.1)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
start = fluid.default_startup_program()
main = fluid.default_main_program()
data = np.random.randn(2, 3).astype("float32")
exe.run(start)
y_np, = exe.run(main, feed={"x": data}, fetch_list=[y])
data
# array([[ 0.21134382, -1.1805999 , 0.32876605],
# [-1.2210793 , -0.7365624 , 1.0013918 ]], dtype=float32)
y_np
# array([[ 0.21134382, -0. , 0.32876605],
# [-0. , -0. , 1.0013918 ]], dtype=float32)
.. code-block:: python
# imperative mode
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
data = np.random.randn(2, 3).astype("float32")
place = fluid.CPUPlace()
with dg.guard(place) as g:
x = dg.to_variable(data)
y = fluid.layers.thresholded_relu(x, threshold=0.1)
y_np = y.numpy()
data
# array([[ 0.21134382, -1.1805999 , 0.32876605],
# [-1.2210793 , -0.7365624 , 1.0013918 ]], dtype=float32)
y_np
# array([[ 0.21134382, -0. , 0.32876605],
# [-0. , -0. , 1.0013918 ]], dtype=float32)
"""
_gelu_ = generate_layer_fn('gelu')
@deprecated(since="2.0.0", update_to="paddle.nn.functional.gelu")
def gelu(x, approximate=False):
locals_var = locals().copy()
kwargs = dict()
for name, val in locals_var.items():
if val is not None:
kwargs[name] = val
return _gelu_(**kwargs)
gelu.__doc__ = r"""
:strong:`GeLU Activation Operator`
For more details, see [Gaussian Error Linear Units](https://arxiv.org/abs/1606.08415).
Equation:
if approximate is True
.. math::
out = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3})))
else
.. math::
out = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}}))
Args:
x(Variable): The input of GeLU op, Tensor or LoDTensor, dtype: float32 or float64.
Returns:
Variable: The output of GeLU op, Tensor or LoDTensor, dtype: float32 or float64, the same as the input, shape: the same as the input.
Examples:
.. code-block:: python
# declarative mode
import numpy as np
from paddle import fluid
x = fluid.data(name="x", shape=(-1, 3), dtype="float32")
y = fluid.layers.gelu(x)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
start = fluid.default_startup_program()
main = fluid.default_main_program()
data = np.random.randn(2, 3).astype("float32")
exe.run(start)
y_np, = exe.run(main, feed={"x": data}, fetch_list=[y])
data
# array([[ 0.87165993, -1.0541513 , -0.37214822],
# [ 0.15647964, 0.32496083, 0.33045998]], dtype=float32)
y_np
# array([[ 0.70456535, -0.15380788, -0.13207214],
# [ 0.08796856, 0.20387867, 0.2080159 ]], dtype=float32)
.. code-block:: python
# imperative mode
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
data = np.random.randn(2, 3).astype("float32")
place = fluid.CPUPlace()
with dg.guard(place) as g:
x = dg.to_variable(data)
y = fluid.layers.gelu(x)
y_np = y.numpy()
data
# array([[ 0.87165993, -1.0541513 , -0.37214822],
# [ 0.15647964, 0.32496083, 0.33045998]], dtype=float32)
y_np
# array([[ 0.70456535, -0.15380788, -0.13207214],
# [ 0.08796856, 0.20387867, 0.2080159 ]], dtype=float32)
"""
_erf_ = generate_layer_fn('erf')
def erf(x, name=None):
if in_dygraph_mode():
return _C_ops.erf(x)
locals_var = locals().copy()
kwargs = dict()
for name, val in locals_var.items():
if val is not None:
kwargs[name] = val
return _erf_(**kwargs)
erf.__doc__ = r"""
:strong:`Erf Operator`
For more details, see [Error function](https://en.wikipedia.org/wiki/Error_function).
Equation:
.. math::
out = \\frac{2}{\\sqrt{\\pi}} \\int_{0}^{x}e^{- \\eta^{2}}d\\eta
Args:
x (Tensor): The input tensor, it's data type should be float32, float64.
Returns:
Tensor: The output of Erf op, dtype: float32 or float64, the same as the input, shape: the same as the input.
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.erf(x)
print(out)
# [-0.42839236 -0.22270259 0.11246292 0.32862676]
"""
def lgamma(x, name=None):
r"""
Calculates the lgamma of the given input tensor, element-wise.
This operator performs elementwise lgamma for input $X$.
:math:`out = log\Gamma(x)`
Args:
x (Tensor): Input Tensor. Must be one of the following types: float32, float64.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor, the lgamma of the input Tensor, the shape and data type is the same with input.
Examples:
.. code-block:: python
import paddle
x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
out = paddle.lgamma(x)
print(out)
# [1.31452441, 1.76149750, 2.25271273, 1.09579802]
"""
return paddle.Tensor.lgamma(x)
......@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from . import layers
from .data_feeder import check_variable_and_dtype, convert_dtype
from ..utils import deprecated
......@@ -387,7 +388,7 @@ def glu(input, dim=-1):
input, 'input', ['float16', 'float32', 'float64'], "glu"
)
a, b = layers.split(input, num_or_sections=2, dim=dim)
act_b = layers.sigmoid(x=b)
act_b = paddle.nn.functional.sigmoid(x=b)
out = layers.elementwise_mul(x=a, y=act_b)
return out
......
......@@ -48,7 +48,6 @@ from .clip import (
from .framework import program_guard
from .initializer import Constant
from .layer_helper import LayerHelper
from .layers import ops
from .dygraph import base as imperative_base
from .dygraph import no_grad
from .dygraph.learning_rate_scheduler import (
......@@ -4457,7 +4456,7 @@ class ModelAverage(Optimizer):
sum = layers.cast(
x=sum, dtype='float32' if self._dtype is None else self._dtype
)
ops._elementwise_div(x=sum, y=tmp, out=param)
paddle.assign(paddle.divide(sum, tmp), output=param)
def _add_average_restore_op(self, block, param_grad):
param = block._clone_variable(param_grad[0])
......
......@@ -70,21 +70,21 @@ def dyn_rnn_lstm(
gate1 = fluid.layers.fc(input=hidden, size=size, bias_attr=False)
return gate0 + gate1
forget_gate = fluid.layers.sigmoid(
forget_gate = paddle.nn.functional.sigmoid(
x=gate_common(word, prev_hidden, lstm_size)
)
input_gate = fluid.layers.sigmoid(
input_gate = paddle.nn.functional.sigmoid(
x=gate_common(word, prev_hidden, lstm_size)
)
output_gate = fluid.layers.sigmoid(
output_gate = paddle.nn.functional.sigmoid(
x=gate_common(word, prev_hidden, lstm_size)
)
cell_gate = fluid.layers.sigmoid(
cell_gate = paddle.nn.functional.sigmoid(
x=gate_common(word, prev_hidden, lstm_size)
)
cell = forget_gate * prev_cell + input_gate * cell_gate
hidden = output_gate * fluid.layers.tanh(x=cell)
hidden = output_gate * paddle.tanh(x=cell)
rnn.update_memory(prev_cell, cell)
rnn.update_memory(prev_hidden, hidden)
rnn.output(hidden)
......
......@@ -70,10 +70,10 @@ def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
def linear(inputs):
return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
forget_gate = paddle.nn.functional.sigmoid(x=linear([hidden_t_prev, x_t]))
input_gate = paddle.nn.functional.sigmoid(x=linear([hidden_t_prev, x_t]))
output_gate = paddle.nn.functional.sigmoid(x=linear([hidden_t_prev, x_t]))
cell_tilde = paddle.tanh(x=linear([hidden_t_prev, x_t]))
cell_t = fluid.layers.sums(
input=[
......@@ -83,7 +83,7 @@ def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
)
hidden_t = fluid.layers.elementwise_mul(
x=output_gate, y=fluid.layers.tanh(x=cell_t)
x=output_gate, y=paddle.tanh(x=cell_t)
)
return hidden_t, cell_t
......
......@@ -175,12 +175,12 @@ class TestIfElse(unittest.TestCase):
ie = layers.IfElse(ifcond)
with ie.true_block():
true_target = ie.input(src)
true_target = fluid.layers.exp(true_target)
true_target = paddle.exp(true_target)
ie.output(true_target)
with ie.false_block():
false_target = ie.input(src)
false_target = fluid.layers.tanh(false_target)
false_target = paddle.tanh(false_target)
ie.output(false_target)
if_out = ie()
out = layers.reduce_sum(if_out[0])
......@@ -244,7 +244,7 @@ class TestIfElseError(unittest.TestCase):
ie = layers.IfElse(ifcond)
with ie.true_block():
true_target = ie.input(src)
true_target = fluid.layers.exp(true_target)
true_target = paddle.exp(true_target)
ie.output([])
......
......@@ -130,7 +130,7 @@ def train_network(
q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
# vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
q_ss = paddle.nn.functional.softsign(q_sum)
# fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
......@@ -157,7 +157,7 @@ def train_network(
pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
# vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
pt_ss = paddle.nn.functional.softsign(pt_sum)
# fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
......@@ -181,7 +181,7 @@ def train_network(
nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
# vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
nt_ss = paddle.nn.functional.softsign(nt_sum)
# fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
......
......@@ -61,7 +61,7 @@ def dyfunc_with_if_else2(x, col=100):
y = fluid.layers.relu(x)
else:
x_pow = fluid.layers.pow(x, 2)
y = fluid.layers.tanh(x_pow)
y = paddle.tanh(x_pow)
return y
......@@ -161,7 +161,7 @@ def nested_if_else(x_v):
tmp = y * w
y = fluid.layers.relu(tmp)
if paddle.mean(y).numpy()[0] < batch_size:
y = fluid.layers.abs(y)
y = paddle.abs(y)
else:
tmp = fluid.layers.fill_constant(
y.shape, dtype='float32', value=-1
......@@ -276,7 +276,7 @@ class NetWithControlFlowIf(fluid.dygraph.Layer):
self.constant_vars['w'] = fluid.layers.fill_constant(
[hidden_dim], dtype='float32', value=9
)
y = fluid.layers.abs(y)
y = paddle.abs(y)
else:
tmp = fluid.layers.fill_constant(
y.shape, dtype='float32', value=-1
......
......@@ -49,8 +49,8 @@ class BasicLSTMUnit(Layer):
self._hiden_size = hidden_size
self._param_attr = param_attr
self._bias_attr = bias_attr
self._gate_activation = gate_activation or layers.sigmoid
self._activation = activation or layers.tanh
self._gate_activation = gate_activation or paddle.nn.functional.sigmoid
self._activation = activation or paddle.tanh
self._forget_bias = forget_bias
self._dtype = dtype
self._input_size = input_size
......@@ -76,12 +76,14 @@ class BasicLSTMUnit(Layer):
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
new_cell = layers.elementwise_add(
layers.elementwise_mul(
pre_cell, layers.sigmoid(f + self._forget_bias)
pre_cell, paddle.nn.functional.sigmoid(f + self._forget_bias)
),
layers.elementwise_mul(
paddle.nn.functional.sigmoid(i), paddle.tanh(j)
),
layers.elementwise_mul(layers.sigmoid(i), layers.tanh(j)),
)
new_hidden = layers.tanh(new_cell) * layers.sigmoid(o)
new_hidden = paddle.tanh(new_cell) * paddle.nn.functional.sigmoid(o)
return new_hidden, new_cell
......
......@@ -13,7 +13,6 @@
# limitations under the License.
import paddle
import paddle.fluid as fluid
import paddle.fluid.param_attr as attr
......@@ -232,7 +231,7 @@ class SoftsignLayer:
"""
operation
"""
softsign = fluid.layers.softsign(input)
softsign = paddle.nn.functional.softsign(input)
return softsign
......
......@@ -89,28 +89,22 @@ class Cycle_Gan(fluid.dygraph.Layer):
cyc_A = self.build_generator_resnet_9blocks_b(fake_B)
cyc_B = self.build_generator_resnet_9blocks_a(fake_A)
diff_A = fluid.layers.abs(
fluid.layers.elementwise_sub(x=input_A, y=cyc_A)
)
diff_B = fluid.layers.abs(
fluid.layers.elementwise_sub(x=input_B, y=cyc_B)
)
diff_A = paddle.abs(fluid.layers.elementwise_sub(x=input_A, y=cyc_A))
diff_B = paddle.abs(fluid.layers.elementwise_sub(x=input_B, y=cyc_B))
cyc_A_loss = fluid.layers.reduce_mean(diff_A) * lambda_A
cyc_B_loss = fluid.layers.reduce_mean(diff_B) * lambda_B
cyc_loss = cyc_A_loss + cyc_B_loss
fake_rec_A = self.build_gen_discriminator_a(fake_B)
g_A_loss = fluid.layers.reduce_mean(fluid.layers.square(fake_rec_A - 1))
g_A_loss = paddle.mean(paddle.square(fake_rec_A - 1))
fake_rec_B = self.build_gen_discriminator_b(fake_A)
g_B_loss = fluid.layers.reduce_mean(fluid.layers.square(fake_rec_B - 1))
g_B_loss = paddle.mean(paddle.square(fake_rec_B - 1))
G = g_A_loss + g_B_loss
idt_A = self.build_generator_resnet_9blocks_a(input_B)
idt_loss_A = (
fluid.layers.reduce_mean(
fluid.layers.abs(
fluid.layers.elementwise_sub(x=input_B, y=idt_A)
)
paddle.abs(fluid.layers.elementwise_sub(x=input_B, y=idt_A))
)
* lambda_B
* lambda_identity
......@@ -119,9 +113,7 @@ class Cycle_Gan(fluid.dygraph.Layer):
idt_B = self.build_generator_resnet_9blocks_b(input_A)
idt_loss_B = (
fluid.layers.reduce_mean(
fluid.layers.abs(
fluid.layers.elementwise_sub(x=input_A, y=idt_B)
)
paddle.abs(fluid.layers.elementwise_sub(x=input_A, y=idt_B))
)
* lambda_A
* lambda_identity
......@@ -271,7 +263,7 @@ class build_generator_resnet_9blocks(fluid.dygraph.Layer):
y = self.deconv1(y)
y = fluid.layers.pad2d(y, [3, 3, 3, 3], mode="reflect")
y = self.conv3(y)
y = fluid.layers.tanh(y)
y = paddle.tanh(y)
return y
......@@ -647,8 +639,7 @@ def train(args, to_static):
data_B, fake_pool_B
)
d_loss_A = (
fluid.layers.square(fake_pool_rec_B)
+ fluid.layers.square(rec_B - 1)
paddle.square(fake_pool_rec_B) + paddle.square(rec_B - 1)
) / 2.0
d_loss_A = fluid.layers.reduce_mean(d_loss_A)
......@@ -661,8 +652,7 @@ def train(args, to_static):
data_A, fake_pool_A
)
d_loss_B = (
fluid.layers.square(fake_pool_rec_A)
+ fluid.layers.square(rec_A - 1)
paddle.square(fake_pool_rec_A) + paddle.square(rec_A - 1)
) / 2.0
d_loss_B = fluid.layers.reduce_mean(d_loss_B)
......
......@@ -99,10 +99,10 @@ class SimpleLSTMRNN(fluid.Layer):
i, j, f, o = fluid.layers.split(
gate_input, num_or_sections=4, dim=-1
)
c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
i
) * fluid.layers.tanh(j)
m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
c = pre_cell * paddle.nn.functional.sigmoid(
f
) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)
hidden_array[k] = m
cell_array[k] = c
step_input = m
......
......@@ -145,7 +145,7 @@ class BOW(fluid.dygraph.Layer):
emb = emb * mask_emb
emb = fluid.layers.reshape(emb, shape=[-1, self.seq_len, self.hid_dim])
bow_1 = fluid.layers.reduce_sum(emb, dim=1)
bow_1 = fluid.layers.tanh(bow_1)
bow_1 = paddle.tanh(bow_1)
fc_1 = self._fc1(bow_1)
fc_2 = self._fc2(fc_1)
prediction = self._fc_prediction(fc_2)
......@@ -197,7 +197,7 @@ class GRU(fluid.dygraph.Layer):
fc_1 = self._fc1(emb)
gru_hidden = self._gru(fc_1)
gru_hidden = fluid.layers.reduce_max(gru_hidden, dim=1)
tanh_1 = fluid.layers.tanh(gru_hidden)
tanh_1 = paddle.tanh(gru_hidden)
fc_2 = self._fc2(tanh_1)
prediction = self._fc_prediction(fc_2)
......@@ -253,8 +253,8 @@ class BiGRU(fluid.dygraph.Layer):
fc_1 = self._fc1(emb)
gru_forward = self._gru_forward(fc_1)
gru_backward = self._gru_backward(fc_1)
gru_forward_tanh = fluid.layers.tanh(gru_forward)
gru_backward_tanh = fluid.layers.tanh(gru_backward)
gru_forward_tanh = paddle.tanh(gru_forward)
gru_backward_tanh = paddle.tanh(gru_backward)
encoded_vector = fluid.layers.concat(
input=[gru_forward_tanh, gru_backward_tanh], axis=2
)
......
......@@ -18,6 +18,7 @@ import numpy as np
import paddle.fluid as fluid
import unittest
import paddle
from paddle.fluid.dygraph.nn import Embedding
from paddle.fluid.dygraph import ProgramTranslator
from paddle.fluid.dygraph import declarative
......@@ -260,7 +261,7 @@ class SkipGram(fluid.dygraph.Layer):
)
word_sim = fluid.layers.reduce_sum(word_sim, dim=-1)
pred = fluid.layers.sigmoid(word_sim)
pred = paddle.nn.functional.sigmoid(word_sim)
loss = fluid.layers.sigmoid_cross_entropy_with_logits(word_sim, label)
loss = fluid.layers.reduce_mean(loss)
......
......@@ -51,7 +51,7 @@ class TestBase(IPUOpTest):
x = paddle.static.data(
name=self.feed_list[0], shape=self.feed_shape[0], dtype="float32"
)
out = paddle.fluid.layers.cumsum(x, **self.attrs)
out = paddle.cumsum(x, **self.attrs)
self.fetch_list = [out.name]
def run_model(self, exec_mode):
......@@ -90,7 +90,7 @@ class TestCase4(TestBase):
x = paddle.static.data(
name=self.feed_list[0], shape=self.feed_shape[0], dtype="int32"
)
out = paddle.fluid.layers.cumsum(x, **self.attrs)
out = paddle.cumsum(x, **self.attrs)
self.fetch_list = [out.name]
......@@ -104,7 +104,7 @@ class TestCase5(TestBase):
x = paddle.static.data(
name=self.feed_list[0], shape=self.feed_shape[0], dtype="int64"
)
out = paddle.fluid.layers.cumsum(x, **self.attrs)
out = paddle.cumsum(x, **self.attrs)
self.fetch_list = [out.name]
......
......@@ -45,7 +45,7 @@ class TestBase(IPUOpTest):
x = paddle.static.data(
name=self.feed_list[0], shape=self.feed_shape[0], dtype='float32'
)
out = paddle.fluid.layers.gelu(x, **self.attrs)
out = paddle.nn.functional.gelu(x, **self.attrs)
self.fetch_list = [out.name]
def run_model(self, exec_mode):
......
......@@ -29,7 +29,7 @@ class TestBase(IPUOpTest):
self.set_feed_attr()
def set_test_op(self):
self.op = paddle.fluid.layers.abs
self.op = paddle.abs
self.op_attrs = {}
def set_data_feed(self):
......@@ -70,55 +70,55 @@ class TestAcos(TestBase):
self.atol = 1e-6
def set_test_op(self):
self.op = paddle.fluid.layers.acos
self.op = paddle.acos
self.op_attrs = {}
class TestAsin(TestAcos):
def set_test_op(self):
self.op = paddle.fluid.layers.asin
self.op = paddle.asin
self.op_attrs = {}
class TestSinh(TestAcos):
def set_test_op(self):
self.op = paddle.fluid.layers.sinh
self.op = paddle.sinh
self.op_attrs = {}
class TestAtan(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.atan
self.op = paddle.atan
self.op_attrs = {}
class TestCeil(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.ceil
self.op = paddle.ceil
self.op_attrs = {}
class TestCos(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.cos
self.op = paddle.cos
self.op_attrs = {}
class TestCosh(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.cosh
self.op = paddle.cosh
self.op_attrs = {}
class TestErf(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.erf
self.op = paddle.erf
self.op_attrs = {}
class TestExp(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.exp
self.op = paddle.exp
self.op_attrs = {}
......@@ -128,19 +128,19 @@ class TestFloor(TestBase):
return False
def set_test_op(self):
self.op = paddle.fluid.layers.floor
self.op = paddle.floor
self.op_attrs = {}
class TestLog(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.log
self.op = paddle.log
self.op_attrs = {}
class TestReciprocal(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.reciprocal
self.op = paddle.reciprocal
self.op_attrs = {}
......@@ -152,55 +152,55 @@ class TestRelu(TestBase):
class TestRound(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.round
self.op = paddle.round
self.op_attrs = {}
class TestSigmoid(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.sigmoid
self.op = paddle.nn.functional.sigmoid
self.op_attrs = {}
class TestSign(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.sign
self.op = paddle.sign
self.op_attrs = {}
class TestSin(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.sin
self.op = paddle.sin
self.op_attrs = {}
class TestSoftplus(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.softplus
self.op = paddle.nn.functional.softplus
self.op_attrs = {}
class TestSoftsign(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.softsign
self.op = paddle.nn.functional.softsign
self.op_attrs = {}
class TestSqrt(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.sqrt
self.op = paddle.sqrt
self.op_attrs = {}
class TestTan(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.tan
self.op = paddle.tan
self.op_attrs = {}
class TestTanh(TestBase):
def set_test_op(self):
self.op = paddle.fluid.layers.tanh
self.op = paddle.tanh
self.op_attrs = {}
......
......@@ -75,7 +75,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Tanh(
):
def set_params(self):
self.operand = fluid.layers.elementwise_add
self.act = fluid.layers.tanh
self.act = paddle.tanh
class ElementwiseActivationMkldnnFusePassTest_Add_LeakyRelu(
......@@ -108,7 +108,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_SQRT(
):
def set_params(self):
self.operand = fluid.layers.elementwise_add
self.act = fluid.layers.sqrt
self.act = paddle.sqrt
class ElementwiseActivationMkldnnFusePassTest_Add_ABS(
......@@ -116,7 +116,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_ABS(
):
def set_params(self):
self.operand = fluid.layers.elementwise_add
self.act = fluid.layers.abs
self.act = paddle.abs
class ElementwiseActivationMkldnnFusePassTest_Add_Clip(
......@@ -134,7 +134,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Gelu(
):
def set_params(self):
self.operand = fluid.layers.elementwise_add
self.act = fluid.layers.gelu
self.act = paddle.nn.functional.gelu
class ElementwiseActivationMkldnnFusePassTest_Add_Gelu_Tanh(
......@@ -142,7 +142,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Gelu_Tanh(
):
def set_params(self):
self.operand = fluid.layers.elementwise_add
self.act = fluid.layers.gelu
self.act = paddle.nn.functional.gelu
self.act_alpha = True
......@@ -159,7 +159,7 @@ class ElementwiseActivationMkldnnFusePassTest_Add_Sigmoid(
):
def set_params(self):
self.operand = fluid.layers.elementwise_add
self.act = fluid.layers.sigmoid
self.act = paddle.nn.functional.sigmoid
class ElementwiseActivationMkldnnFusePassTest_Sub_Relu(
......@@ -175,7 +175,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Tanh(
):
def set_params(self):
self.operand = fluid.layers.elementwise_sub
self.act = fluid.layers.tanh
self.act = paddle.tanh
class ElementwiseActivationMkldnnFusePassTest_Sub_LeakyRelu(
......@@ -208,7 +208,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_ABS(
):
def set_params(self):
self.operand = fluid.layers.elementwise_sub
self.act = fluid.layers.abs
self.act = paddle.abs
class ElementwiseActivationMkldnnFusePassTest_Sub_Clip(
......@@ -226,7 +226,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu(
):
def set_params(self):
self.operand = fluid.layers.elementwise_sub
self.act = fluid.layers.gelu
self.act = paddle.nn.functional.gelu
class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu_Tanh(
......@@ -234,7 +234,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu_Tanh(
):
def set_params(self):
self.operand = fluid.layers.elementwise_sub
self.act = fluid.layers.gelu
self.act = paddle.nn.functional.gelu
self.act_alpha = True
......@@ -251,7 +251,7 @@ class ElementwiseActivationMkldnnFusePassTest_Sub_Sigmoid(
):
def set_params(self):
self.operand = fluid.layers.elementwise_sub
self.act = fluid.layers.sigmoid
self.act = paddle.nn.functional.sigmoid
class ElementwiseActivationMkldnnFusePassTest_Mul_Relu(
......@@ -267,7 +267,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Tanh(
):
def set_params(self):
self.operand = fluid.layers.elementwise_mul
self.act = fluid.layers.tanh
self.act = paddle.tanh
class ElementwiseActivationMkldnnFusePassTest_Mul_LeakyRelu(
......@@ -300,7 +300,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_SQRT(
):
def set_params(self):
self.operand = fluid.layers.elementwise_mul
self.act = fluid.layers.sqrt
self.act = paddle.sqrt
class ElementwiseActivationMkldnnFusePassTest_Mul_ABS(
......@@ -308,7 +308,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_ABS(
):
def set_params(self):
self.operand = fluid.layers.elementwise_mul
self.act = fluid.layers.abs
self.act = paddle.abs
class ElementwiseActivationMkldnnFusePassTest_Mul_Clip(
......@@ -326,7 +326,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu(
):
def set_params(self):
self.operand = fluid.layers.elementwise_mul
self.act = fluid.layers.gelu
self.act = paddle.nn.functional.gelu
class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu_Tanh(
......@@ -334,7 +334,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu_Tanh(
):
def set_params(self):
self.operand = fluid.layers.elementwise_mul
self.act = fluid.layers.gelu
self.act = paddle.nn.functional.gelu
self.act_alpha = True
......@@ -351,7 +351,7 @@ class ElementwiseActivationMkldnnFusePassTest_Mul_Sigmoid(
):
def set_params(self):
self.operand = fluid.layers.elementwise_mul
self.act = fluid.layers.sigmoid
self.act = paddle.nn.functional.sigmoid
if __name__ == "__main__":
......
......@@ -17,6 +17,7 @@ import shutil
import unittest
import numpy as np
from inference_pass_test import InferencePassTest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.core import PassVersionChecker
......@@ -81,7 +82,7 @@ class TensorRTSubgraphPassSoftMaxTest(TensorRTSubgraphPassActivationTest):
class TensorRTSubgraphPassSigmoidTest(TensorRTSubgraphPassActivationTest):
def append_act(self, x):
return fluid.layers.sigmoid(x)
return paddle.nn.functional.sigmoid(x)
class TensorRTSubgraphPassHardSwishTest(TensorRTSubgraphPassActivationTest):
......@@ -108,7 +109,7 @@ class TensorRTSubgraphPassClipTest(TensorRTSubgraphPassActivationTest):
class TensorRTSubgraphPassTanhTest(TensorRTSubgraphPassActivationTest):
def append_act(self, x):
return fluid.layers.tanh(x)
return paddle.tanh(x)
class TensorRTSubgraphPassSwishTest(TensorRTSubgraphPassActivationTest):
......@@ -303,7 +304,7 @@ class TensorRTSubgraphPassPreluFp16DynamicSerializeTest(
class TensorRTSubgraphPassGeluTest(TensorRTSubgraphPassActivationTest):
def append_act(self, x):
return fluid.layers.gelu(x)
return paddle.nn.functional.gelu(x)
class TensorRTSubgraphPassGeluDynamicTest(TensorRTSubgraphPassActivationTest):
......@@ -322,7 +323,7 @@ class TensorRTSubgraphPassGeluDynamicTest(TensorRTSubgraphPassActivationTest):
)
def append_act(self, x):
return fluid.layers.gelu(x)
return paddle.nn.functional.gelu(x)
class TensorRTSubgraphPassGeluFp16Test(TensorRTSubgraphPassActivationTest):
......@@ -333,7 +334,7 @@ class TensorRTSubgraphPassGeluFp16Test(TensorRTSubgraphPassActivationTest):
)
def append_act(self, x):
return fluid.layers.gelu(x)
return paddle.nn.functional.gelu(x)
class TensorRTSubgraphPassGeluFp16SerializeTest(
......@@ -346,7 +347,7 @@ class TensorRTSubgraphPassGeluFp16SerializeTest(
)
def append_act(self, x):
return fluid.layers.gelu(x)
return paddle.nn.functional.gelu(x)
class TensorRTSubgraphPassGeluFp16DynamicTest(
......@@ -367,7 +368,7 @@ class TensorRTSubgraphPassGeluFp16DynamicTest(
)
def append_act(self, x):
return fluid.layers.gelu(x)
return paddle.nn.functional.gelu(x)
class TensorRTSubgraphPassGeluFp16DynamicSerializeTest(
......@@ -388,7 +389,7 @@ class TensorRTSubgraphPassGeluFp16DynamicSerializeTest(
)
def append_act(self, x):
return fluid.layers.gelu(x)
return paddle.nn.functional.gelu(x)
if __name__ == "__main__":
......
......@@ -15,6 +15,7 @@
import unittest
import numpy as np
import paddle
from pass_test import PassTest
import paddle.fluid as fluid
import paddle.fluid.layers as layers
......@@ -85,10 +86,14 @@ class FusionGroupPassComplicatedTest(FusionGroupPassTest):
one = layers.fill_constant(shape=[1], dtype=dtype, value=1.0)
tmp_0 = one * self.feed_vars[0]
# subgraph with 9 op nodes
tmp_1 = tmp_0 * layers.sigmoid(self.feed_vars[1]) + layers.sigmoid(
self.feed_vars[2]
) * layers.tanh(self.feed_vars[3])
tmp_2 = layers.tanh(tmp_1) + layers.sigmoid(self.feed_vars[4])
tmp_1 = tmp_0 * paddle.nn.functional.sigmoid(
self.feed_vars[1]
) + paddle.nn.functional.sigmoid(self.feed_vars[2]) * paddle.tanh(
self.feed_vars[3]
)
tmp_2 = paddle.tanh(tmp_1) + paddle.nn.functional.sigmoid(
self.feed_vars[4]
)
self.append_gradients(tmp_2)
......@@ -162,10 +167,10 @@ class FusionGroupPassSumTest(FusionGroupPassTest):
tmp_0 = layers.sum(
[self.feed_vars[0], self.feed_vars[1], self.feed_vars[2]]
)
tmp_1 = layers.sqrt(tmp_0)
tmp_1 = paddle.sqrt(tmp_0)
tmp_2 = layers.mul(tmp_0, self.feed_vars[3])
# subgraph with 2 op nodes
tmp_3 = layers.square(layers.sum([tmp_1, tmp_2]))
tmp_3 = paddle.square(layers.sum([tmp_1, tmp_2]))
self.append_gradients(tmp_3)
......
......@@ -97,7 +97,7 @@ class TestSyncBatchNormOpTraining(TestSyncBatchNormRunnerBase):
)
if self.bn_dtype == np.float16:
bn = fluid.layers.cast(bn, 'float32')
sigmoid = fluid.layers.sigmoid(bn)
sigmoid = paddle.nn.functional.sigmoid(bn)
out = fluid.layers.reduce_sum(sigmoid)
# if not sync_bn:
# out = out / core.get_mlu_device_count()
......
......@@ -109,7 +109,7 @@ class TestGeluNet(unittest.TestCase):
c = paddle.multiply(a, b)
fc_1 = fluid.layers.fc(input=c, size=128)
fc_1_gelu = fluid.layers.gelu(fc_1)
fc_1_gelu = paddle.nn.functional.gelu(fc_1)
prediction = fluid.layers.fc(input=fc_1_gelu, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
......
......@@ -99,7 +99,7 @@ class TestSyncBatchNormOpTraining(TestSyncBatchNormRunnerBase):
)
# if self.dtype == np.float16:
# bn = fluid.layers.cast(bn, 'float32')
sigmoid = fluid.layers.sigmoid(bn)
sigmoid = paddle.nn.functional.sigmoid(bn)
out = fluid.layers.reduce_sum(sigmoid)
# if not sync_bn:
# out = out / core.get_npu_device_count()
......
......@@ -109,7 +109,7 @@ class TestGeluNet(unittest.TestCase):
c = paddle.multiply(a, b)
fc_1 = fluid.layers.fc(input=c, size=128)
fc_1_gelu = fluid.layers.gelu(fc_1)
fc_1_gelu = paddle.nn.functional.gelu(fc_1)
prediction = fluid.layers.fc(input=fc_1_gelu, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
......
......@@ -88,7 +88,7 @@ def bow_net(
input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]
)
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bow_tanh = fluid.layers.tanh(bow)
bow_tanh = paddle.tanh(bow)
fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
......
......@@ -33,7 +33,7 @@ class TestSigmoidTripleGradCheck(unittest.TestCase):
dtype = np.float64
x = layers.data('x', shape, False, dtype=dtype)
x.persistable = True
y = layers.sigmoid(x)
y = F.sigmoid(x)
x_arr = np.random.random(shape).astype(dtype)
x_arr[np.abs(x_arr) < 0.005] = 0.002
gradient_checker.triple_grad_check(
......@@ -51,7 +51,7 @@ class TestSigmoidTripleGradCheck(unittest.TestCase):
class TestSigmoidDoubleGradCheck(unittest.TestCase):
def sigmoid_wrapper(self, x):
return fluid.layers.sigmoid(x[0])
return F.sigmoid(x[0])
@prog_scope()
def func(self, place):
......@@ -60,7 +60,7 @@ class TestSigmoidDoubleGradCheck(unittest.TestCase):
dtype = np.float64
x = layers.data('x', shape, False, dtype=dtype)
x.persistable = True
y = layers.sigmoid(x)
y = F.sigmoid(x)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
x_arr[np.abs(x_arr) < 0.005] = 0.002
gradient_checker.double_grad_check(
......@@ -92,7 +92,7 @@ class TestTanhTripleGradCheck(unittest.TestCase):
dtype = np.float64
x = layers.data('x', shape, False, dtype=dtype)
x.persistable = True
y = layers.tanh(x)
y = paddle.tanh(x)
x_arr = np.random.random(shape).astype(dtype)
x_arr[np.abs(x_arr) < 0.005] = 0.002
gradient_checker.triple_grad_check(
......@@ -322,7 +322,7 @@ class TestSqrtDoubleGradCheck(unittest.TestCase):
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.sqrt(x)
y = paddle.sqrt(x)
x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)
gradient_checker.double_grad_check(
......@@ -354,7 +354,7 @@ class TestRsqrtDoubleGradCheck(unittest.TestCase):
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.rsqrt(x)
y = paddle.rsqrt(x)
x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)
gradient_checker.double_grad_check(
......@@ -386,7 +386,7 @@ class TestSquareDoubleGradCheck(unittest.TestCase):
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.square(x)
y = paddle.square(x)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
gradient_checker.double_grad_check(
......@@ -417,7 +417,7 @@ class TestAbsDoubleGradCheck(unittest.TestCase):
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.abs(x)
y = paddle.abs(x)
x_arr = np.random.uniform(-1, 1, shape).astype(dtype)
# Because we set delta = 0.005 in calculating numeric gradient,
# if x is too small, the numeric gradient is inaccurate.
......@@ -449,7 +449,7 @@ class TestLogDoubleGradCheck(unittest.TestCase):
x = layers.data('x', shape, False, dtype)
x.persistable = True
y = layers.log(x)
y = paddle.log(x)
x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)
......@@ -608,7 +608,7 @@ class TestSinTripleGradCheck(unittest.TestCase):
dtype = np.float64
x = layers.data('x', shape, False, dtype=dtype)
x.persistable = True
y = layers.sin(x)
y = paddle.sin(x)
x_arr = np.random.random(shape).astype(dtype)
x_arr[np.abs(x_arr) < 0.005] = 0.002
gradient_checker.triple_grad_check(
......@@ -733,7 +733,7 @@ class TestCosTripleGradCheck(unittest.TestCase):
dtype = np.float64
x = layers.data('x', shape, False, dtype=dtype)
x.persistable = True
y = layers.cos(x)
y = paddle.cos(x)
x_arr = np.random.random(shape).astype(dtype)
x_arr[np.abs(x_arr) < 0.005] = 0.002
gradient_checker.triple_grad_check(
......
......@@ -33,17 +33,17 @@ class TestSqrtOpError(unittest.TestCase):
with program_guard(Program(), Program()):
# The input type of sqrt op must be Variable or numpy.ndarray.
in1 = 1
self.assertRaises(TypeError, fluid.layers.sqrt, in1)
self.assertRaises(TypeError, paddle.sqrt, in1)
# The input dtype of sqrt op must be float16, float32, float64.
in2 = fluid.layers.data(
name='input2', shape=[12, 10], dtype="int32"
)
self.assertRaises(TypeError, fluid.layers.sqrt, in2)
self.assertRaises(TypeError, paddle.sqrt, in2)
in3 = fluid.layers.data(
name='input3', shape=[12, 10], dtype="float16"
)
fluid.layers.sqrt(x=in3)
paddle.sqrt(x=in3)
class TestActivation(OpTest):
......@@ -390,16 +390,6 @@ class TestLogSigmoidAPI(unittest.TestCase):
np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05)
paddle.enable_static()
def test_fluid_api(self):
paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program()):
x = paddle.fluid.data('X', [11, 17])
out = paddle.fluid.layers.logsigmoid(x)
exe = paddle.static.Executor(self.place)
res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
out_ref = np.log(1 / (1 + np.exp(-self.x_np)))
np.testing.assert_allclose(out_ref, res[0], rtol=1e-05)
def test_errors(self):
paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program()):
......@@ -488,16 +478,6 @@ class TestTanhAPI(unittest.TestCase):
np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05)
paddle.enable_static()
def test_fluid_api(self):
paddle.enable_static()
with fluid.program_guard(fluid.Program()):
x = fluid.data('X', [10, 12], self.dtype)
out = fluid.layers.tanh(x)
exe = fluid.Executor(self.place)
res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
out_ref = np.tanh(self.x_np)
np.testing.assert_allclose(out_ref, res[0], rtol=1e-05)
def test_errors(self):
paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program()):
......@@ -593,7 +573,7 @@ class TestSinhAPI(unittest.TestCase):
with fluid.dygraph.guard():
np_x = np.array([0.1])
x = fluid.dygraph.to_variable(np_x)
z = fluid.layers.sinh(x).numpy()
z = paddle.sinh(x).numpy()
z_expected = np.sinh(np_x)
np.testing.assert_allclose(z, z_expected, rtol=1e-05)
......@@ -610,7 +590,7 @@ class TestSinhAPI(unittest.TestCase):
dtype="float32",
)
pd_sinh_out = fluid.layers.sinh(data_x)
pd_sinh_out = paddle.sinh(data_x)
exe = fluid.Executor(place=fluid.CPUPlace())
exe.run(fluid.default_startup_program())
(np_sinh_res,) = exe.run(
......@@ -630,7 +610,7 @@ class TestSinhAPI(unittest.TestCase):
)
var = fluid.dygraph.to_variable(input_x)
var.stop_gradient = False
loss = fluid.layers.sinh(var)
loss = paddle.sinh(var)
loss.backward()
grad_var = var.gradient()
self.assertEqual(grad_var.shape, input_x.shape)
......@@ -640,13 +620,13 @@ class TestSinhOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program()):
# The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.sinh, 1)
self.assertRaises(TypeError, paddle.sinh, 1)
# The input dtype must be float16, float32, float64.
x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
self.assertRaises(TypeError, fluid.layers.sinh, x_int32)
self.assertRaises(TypeError, paddle.sinh, x_int32)
# support the input dtype is float16
x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16')
fluid.layers.sinh(x_fp16)
paddle.sinh(x_fp16)
class TestCosh(TestActivation):
......@@ -678,7 +658,7 @@ class TestCoshAPI(unittest.TestCase):
with fluid.dygraph.guard():
np_x = np.array([0.1])
x = fluid.dygraph.to_variable(np_x)
z = fluid.layers.cosh(x).numpy()
z = paddle.cosh(x).numpy()
z_expected = np.cosh(np_x)
np.testing.assert_allclose(z, z_expected, rtol=1e-05)
......@@ -715,7 +695,7 @@ class TestCoshAPI(unittest.TestCase):
)
var = fluid.dygraph.to_variable(input_x)
var.stop_gradient = False
loss = fluid.layers.cosh(var)
loss = paddle.cosh(var)
loss.backward()
grad_var = var.gradient()
self.assertEqual(grad_var.shape, input_x.shape)
......@@ -725,13 +705,13 @@ class TestCoshOpError(unittest.TestCase):
def test_errors(self):
with program_guard(Program()):
# The input type must be Variable.
self.assertRaises(TypeError, fluid.layers.cosh, 1)
self.assertRaises(TypeError, paddle.cosh, 1)
# The input dtype must be float16, float32, float64.
x_int32 = fluid.data(name='x_int32', shape=[12, 10], dtype='int32')
self.assertRaises(TypeError, fluid.layers.cosh, x_int32)
self.assertRaises(TypeError, paddle.cosh, x_int32)
# support the input dtype is float16
x_fp16 = fluid.data(name='x_fp16', shape=[12, 10], dtype='float16')
fluid.layers.cosh(x_fp16)
paddle.cosh(x_fp16)
def ref_tanhshrink(x):
......@@ -798,16 +778,6 @@ class TestTanhshrinkAPI(unittest.TestCase):
np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05)
paddle.enable_static()
def test_fluid_api(self):
paddle.enable_static()
with fluid.program_guard(fluid.Program()):
x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
out = fluid.layers.tanh_shrink(x)
exe = fluid.Executor(self.place)
res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
out_ref = ref_tanhshrink(self.x_np)
np.testing.assert_allclose(out_ref, res[0], rtol=1e-05)
def test_errors(self):
paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program()):
......@@ -914,16 +884,6 @@ class TestHardShrinkAPI(unittest.TestCase):
np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05)
paddle.enable_static()
def test_fluid_api(self):
paddle.enable_static()
with fluid.program_guard(fluid.Program()):
x = fluid.data('X', [10, 12])
out = fluid.layers.hard_shrink(x)
exe = fluid.Executor(self.place)
res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
out_ref = ref_hardshrink(self.x_np, 0.5)
np.testing.assert_allclose(out_ref, res[0], rtol=1e-05)
def test_errors(self):
paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program()):
......@@ -1080,16 +1040,6 @@ class TestSoftshrinkAPI(unittest.TestCase):
np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05)
paddle.enable_static()
def test_fluid_api(self):
paddle.enable_static()
with fluid.program_guard(fluid.Program()):
x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
out = fluid.layers.softshrink(x, self.threshold)
exe = fluid.Executor(self.place)
res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
out_ref = ref_softshrink(self.x_np, self.threshold)
np.testing.assert_allclose(out_ref, res[0], rtol=1e-05)
def test_errors(self):
paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program()):
......@@ -1780,16 +1730,6 @@ class TestLeakyReluAPI(unittest.TestCase):
np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05)
paddle.enable_static()
def test_fluid_api(self):
paddle.enable_static()
with fluid.program_guard(fluid.Program()):
x = fluid.data('X', [10, 12])
out = fluid.layers.leaky_relu(x, 0.01)
exe = fluid.Executor(self.place)
res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
out_ref = ref_leaky_relu(self.x_np)
np.testing.assert_allclose(out_ref, res[0], rtol=1e-05)
def test_errors(self):
paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program()):
......@@ -3120,16 +3060,6 @@ class TestSoftplusAPI(unittest.TestCase):
np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05)
paddle.enable_static()
def test_fluid_api(self):
paddle.enable_static()
with fluid.program_guard(fluid.Program()):
x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
out = fluid.layers.softplus(x)
exe = fluid.Executor(self.place)
res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
out_ref = ref_softplus(self.x_np)
np.testing.assert_allclose(out_ref, res[0], rtol=1e-05)
def test_errors(self):
paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program()):
......@@ -3215,16 +3145,6 @@ class TestSoftsignAPI(unittest.TestCase):
np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05)
paddle.enable_static()
def test_fluid_api(self):
paddle.enable_static()
with fluid.program_guard(fluid.Program()):
x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
out = fluid.layers.softsign(x)
exe = fluid.Executor(self.place)
res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
out_ref = ref_softsign(self.x_np)
np.testing.assert_allclose(out_ref, res[0], rtol=1e-05)
def test_errors(self):
paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program()):
......@@ -3314,16 +3234,6 @@ class TestThresholdedReluAPI(unittest.TestCase):
np.testing.assert_allclose(out_ref, r.numpy(), rtol=1e-05)
paddle.enable_static()
def test_fluid_api(self):
paddle.enable_static()
with fluid.program_guard(fluid.Program()):
x = fluid.data('X', self.x_np.shape, self.x_np.dtype)
out = fluid.layers.thresholded_relu(x, self.threshold)
exe = fluid.Executor(self.place)
res = exe.run(feed={'X': self.x_np}, fetch_list=[out])
out_ref = ref_thresholded_relu(self.x_np, self.threshold)
np.testing.assert_allclose(out_ref, res[0], rtol=1e-05)
def test_errors(self):
paddle.enable_static()
with paddle.static.program_guard(paddle.static.Program()):
......@@ -3660,45 +3570,6 @@ class TestMishAPI(unittest.TestCase):
F.mish(x_fp16)
# ------------------ Test Error Activation----------------------
def create_test_error_class(op_type):
class TestOpErrors(unittest.TestCase):
def test_errors(self):
with program_guard(Program(), Program()):
op = getattr(fluid.layers, op_type)
# The input dtype of op_type must be float32, float64.
in1 = fluid.layers.data(
name='input2', shape=[12, 10], dtype="int32"
)
in2 = fluid.layers.data(
name='input3', shape=[12, 10], dtype="int64"
)
self.assertRaises(TypeError, op, in1)
self.assertRaises(TypeError, op, in2)
cls_name = "{0}_{1}".format(op_type, "test_errors")
TestOpErrors.__name__ = cls_name
globals()[cls_name] = TestOpErrors
create_test_error_class('acos')
create_test_error_class('asin')
create_test_error_class('atan')
create_test_error_class('ceil')
create_test_error_class('cos')
create_test_error_class('floor')
create_test_error_class('reciprocal')
create_test_error_class('round')
create_test_error_class('rsqrt')
create_test_error_class('sin')
create_test_error_class('sqrt')
create_test_error_class('tanh')
create_test_error_class('tan')
create_test_error_class('acosh')
create_test_error_class('asinh')
create_test_error_class('atanh')
# ------------------ Test Cudnn Activation----------------------
def create_test_act_cudnn_class(parent, atol=1e-3, grad_atol=1e-3):
@unittest.skipIf(
......
......@@ -371,7 +371,7 @@ class BadInputTest(unittest.TestCase):
def test_bad_x():
data = [1, 2, 4]
result = fluid.layers.cumsum(data, axis=0)
result = paddle.cumsum(data, axis=0)
self.assertRaises(TypeError, test_bad_x)
......
......@@ -87,7 +87,7 @@ class TestPSPassWithBow(unittest.TestCase):
q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
# vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
q_ss = paddle.nn.functional.softsign(q_sum)
# fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
......@@ -119,7 +119,7 @@ class TestPSPassWithBow(unittest.TestCase):
pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
# vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
pt_ss = paddle.nn.functional.softsign(pt_sum)
# fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
......@@ -150,7 +150,7 @@ class TestPSPassWithBow(unittest.TestCase):
nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
# vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
nt_ss = paddle.nn.functional.softsign(nt_sum)
# fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
......
......@@ -83,7 +83,7 @@ class TestPSPassWithBow(unittest.TestCase):
q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
# vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
q_ss = paddle.nn.functional.softsign(q_sum)
# fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
......@@ -111,7 +111,7 @@ class TestPSPassWithBow(unittest.TestCase):
pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
# vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
pt_ss = paddle.nn.functional.softsign(pt_sum)
# fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
......@@ -138,7 +138,7 @@ class TestPSPassWithBow(unittest.TestCase):
nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
# vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
nt_ss = paddle.nn.functional.softsign(nt_sum)
# fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
......
......@@ -86,7 +86,7 @@ class TestPSPassWithBow(unittest.TestCase):
q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
# vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
q_ss = paddle.nn.functional.softsign(q_sum)
# fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
......@@ -114,7 +114,7 @@ class TestPSPassWithBow(unittest.TestCase):
pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
# vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
pt_ss = paddle.nn.functional.softsign(pt_sum)
# fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
......@@ -141,7 +141,7 @@ class TestPSPassWithBow(unittest.TestCase):
nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
# vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
nt_ss = paddle.nn.functional.softsign(nt_sum)
# fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
......
......@@ -89,7 +89,7 @@ class TestPSPassWithBow(unittest.TestCase):
q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
# vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
q_ss = paddle.nn.functional.softsign(q_sum)
# fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
......@@ -119,7 +119,7 @@ class TestPSPassWithBow(unittest.TestCase):
pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
# vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
pt_ss = paddle.nn.functional.softsign(pt_sum)
# fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
......@@ -148,7 +148,7 @@ class TestPSPassWithBow(unittest.TestCase):
nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
# vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
nt_ss = paddle.nn.functional.softsign(nt_sum)
# fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
......
......@@ -88,7 +88,7 @@ class TestPSPassWithBow(unittest.TestCase):
q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
# vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
q_ss = paddle.nn.functional.softsign(q_sum)
q_ss = fluid.layers.data_norm(input=q_ss)
# fc layer after conv
q_fc = fluid.layers.fc(
......@@ -119,7 +119,7 @@ class TestPSPassWithBow(unittest.TestCase):
pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
# vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
pt_ss = paddle.nn.functional.softsign(pt_sum)
# fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
......@@ -148,7 +148,7 @@ class TestPSPassWithBow(unittest.TestCase):
nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
# vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
nt_ss = paddle.nn.functional.softsign(nt_sum)
# fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
......
......@@ -87,7 +87,7 @@ class TestPSPassWithBow(unittest.TestCase):
q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
# vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
q_ss = paddle.nn.functional.softsign(q_sum)
# fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
......@@ -119,7 +119,7 @@ class TestPSPassWithBow(unittest.TestCase):
pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
# vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
pt_ss = paddle.nn.functional.softsign(pt_sum)
# fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
......@@ -150,7 +150,7 @@ class TestPSPassWithBow(unittest.TestCase):
nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
# vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
nt_ss = paddle.nn.functional.softsign(nt_sum)
# fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
......
......@@ -85,7 +85,7 @@ class TestPSPassWithBow(unittest.TestCase):
q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
# vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
q_ss = paddle.nn.functional.softsign(q_sum)
# fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
......@@ -115,7 +115,7 @@ class TestPSPassWithBow(unittest.TestCase):
pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
# vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
pt_ss = paddle.nn.functional.softsign(pt_sum)
# fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
......@@ -144,7 +144,7 @@ class TestPSPassWithBow(unittest.TestCase):
nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
# vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
nt_ss = paddle.nn.functional.softsign(nt_sum)
# fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
......
......@@ -87,7 +87,7 @@ class TestPSPassWithBow(unittest.TestCase):
q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
# vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
q_ss = paddle.nn.functional.softsign(q_sum)
# fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
......@@ -119,7 +119,7 @@ class TestPSPassWithBow(unittest.TestCase):
pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
# vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
pt_ss = paddle.nn.functional.softsign(pt_sum)
# fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
......@@ -150,7 +150,7 @@ class TestPSPassWithBow(unittest.TestCase):
nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
# vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
nt_ss = paddle.nn.functional.softsign(nt_sum)
# fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
......
......@@ -85,7 +85,7 @@ class TestPSPassWithBow(unittest.TestCase):
q_emb = fluid.layers.reshape(q_emb, [-1, emb_dim])
# vsum
q_sum = fluid.layers.sequence_pool(input=q_emb, pool_type='sum')
q_ss = fluid.layers.softsign(q_sum)
q_ss = paddle.nn.functional.softsign(q_sum)
# fc layer after conv
q_fc = fluid.layers.fc(
input=q_ss,
......@@ -115,7 +115,7 @@ class TestPSPassWithBow(unittest.TestCase):
pt_emb = fluid.layers.reshape(pt_emb, [-1, emb_dim])
# vsum
pt_sum = fluid.layers.sequence_pool(input=pt_emb, pool_type='sum')
pt_ss = fluid.layers.softsign(pt_sum)
pt_ss = paddle.nn.functional.softsign(pt_sum)
# fc layer
pt_fc = fluid.layers.fc(
input=pt_ss,
......@@ -144,7 +144,7 @@ class TestPSPassWithBow(unittest.TestCase):
nt_emb = fluid.layers.reshape(nt_emb, [-1, emb_dim])
# vsum
nt_sum = fluid.layers.sequence_pool(input=nt_emb, pool_type='sum')
nt_ss = fluid.layers.softsign(nt_sum)
nt_ss = paddle.nn.functional.softsign(nt_sum)
# fc layer
nt_fc = fluid.layers.fc(
input=nt_ss,
......
......@@ -38,7 +38,7 @@ def gru_net(
fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3)
gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False)
gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max')
gru_max_tanh = fluid.layers.tanh(gru_max)
gru_max_tanh = paddle.tanh(gru_max)
fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh')
prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
......
......@@ -40,7 +40,7 @@ def lstm_net(
input=fc0, size=hid_dim * 4, is_reverse=False
)
lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
lstm_max_tanh = fluid.layers.tanh(lstm_max)
lstm_max_tanh = paddle.tanh(lstm_max)
fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
......
......@@ -191,10 +191,10 @@ def lm_model(
ends=[hidden_size * 4],
)
c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
i
) * layers.tanh(j)
m = layers.tanh(c) * layers.sigmoid(o)
c = pre_cell * paddle.nn.functional.sigmoid(
f
) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)
rnn.update_memory(pre_hidden, m)
rnn.update_memory(pre_cell, c)
......@@ -299,10 +299,10 @@ def lm_model(
gate_input = layers.elementwise_add(gate_input, bias)
i, j, f, o = layers.split(gate_input, num_or_sections=4, dim=-1)
c = pre_cell * layers.sigmoid(f) + layers.sigmoid(
i
) * layers.tanh(j)
m = layers.tanh(c) * layers.sigmoid(o)
c = pre_cell * paddle.nn.functional.sigmoid(
f
) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)
hidden_array[k] = m
cell_array[k] = c
......
......@@ -327,7 +327,9 @@ class EagerDeletionRecurrentOpTest2(EagerDeletionRecurrentOpTest1):
bias_attr=False,
)
h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r))
h = paddle.nn.functional.sigmoid(
x=layers.elementwise_add(x=temp_l, y=temp_r)
)
rnn.update_memory(h_pre, h)
rnn.output(h)
......
......@@ -48,7 +48,7 @@ class TestErfLayer(unittest.TestCase):
y_ref = erf(x)
with dg.guard(place) as g:
x_var = dg.to_variable(x)
y_var = fluid.layers.erf(x_var)
y_var = paddle.erf(x_var)
y_test = y_var.numpy()
np.testing.assert_allclose(y_ref, y_test, rtol=1e-05)
......
......@@ -45,7 +45,7 @@ class TestGeluOp(unittest.TestCase):
place = fluid.CPUPlace()
with dg.guard(place) as g:
x_var = dg.to_variable(x)
y_var = fluid.layers.gelu(x_var, approximate)
y_var = F.gelu(x_var, approximate)
y_test = y_var.numpy()
np.testing.assert_allclose(y_ref, y_test, rtol=1e-05, atol=1e-08)
......@@ -56,7 +56,7 @@ class TestGeluOp(unittest.TestCase):
place = fluid.CUDAPlace(0)
with dg.guard(place) as g:
x_var = dg.to_variable(x)
y_var = fluid.layers.gelu(x_var, approximate)
y_var = F.gelu(x_var, approximate)
y_test = y_var.numpy()
np.testing.assert_allclose(y_ref, y_test, rtol=1e-05, atol=1e-08)
......
......@@ -35,7 +35,7 @@ def bow_net(
input=data, is_sparse=True, size=[dict_dim, emb_dim]
)
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bow_tanh = fluid.layers.tanh(bow)
bow_tanh = paddle.tanh(bow)
fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
......
......@@ -914,7 +914,7 @@ class TestDygraphUtils(unittest.TestCase):
with fluid.dygraph.guard():
a = paddle.to_tensor(a_np)
res1 = func(a, act="sigmoid", use_mkldnn=True, use_cudnn=True)
res2 = fluid.layers.sigmoid(a)
res2 = paddle.nn.functional.sigmoid(a)
np.testing.assert_allclose(res1.numpy(), res2.numpy(), rtol=1e-05)
def test_append_activation_in_dygraph2(self):
......@@ -929,7 +929,7 @@ class TestDygraphUtils(unittest.TestCase):
with fluid.dygraph.guard():
a = paddle.to_tensor(a_np)
res1 = func(a, act="sigmoid", use_cudnn=True)
res2 = fluid.layers.sigmoid(a)
res2 = paddle.nn.functional.sigmoid(a)
np.testing.assert_array_equal(res1.numpy(), res2.numpy())
def test_append_activation_in_dygraph3(self):
......
......@@ -317,7 +317,7 @@ class SimpleAttention(fluid.dygraph.Layer):
concated = fluid.layers.elementwise_add(
encoder_proj, decoder_state_expand
)
concated = fluid.layers.tanh(x=concated)
concated = paddle.tanh(x=concated)
attention_weight = self.fc_2(concated)
weights_reshape = fluid.layers.reshape(
......
......@@ -115,10 +115,10 @@ class SimpleLSTMRNN(fluid.Layer):
i, j, f, o = fluid.layers.split(
gate_input, num_or_sections=4, dim=-1
)
c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
i
) * fluid.layers.tanh(j)
m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
c = pre_cell * paddle.nn.functional.sigmoid(
f
) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)
self.hidden_array[k] = m
self.cell_array[k] = c
self._input = m
......
......@@ -110,10 +110,10 @@ class SimpleLSTMRNN(fluid.Layer):
i, j, f, o = fluid.layers.split(
gate_input, num_or_sections=4, dim=-1
)
c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
i
) * fluid.layers.tanh(j)
m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
c = pre_cell * paddle.nn.functional.sigmoid(
f
) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)
self.hidden_array[k] = m
self.cell_array[k] = c
self._input = m
......
......@@ -112,10 +112,10 @@ class SimpleLSTMRNN(fluid.Layer):
i, j, f, o = fluid.layers.split(
gate_input, num_or_sections=4, dim=-1
)
c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
i
) * fluid.layers.tanh(j)
m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
c = pre_cell * paddle.nn.functional.sigmoid(
f
) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)
self.hidden_array[k] = m
self.cell_array[k] = c
self._input = m
......
......@@ -322,7 +322,7 @@ class Generator(fluid.dygraph.Layer):
res_block = self._res_block(conv0)
deconv = self._deconv(res_block)
conv1 = self._conv1(deconv)
out = fluid.layers.tanh(conv1)
out = paddle.tanh(conv1)
return out
......@@ -437,11 +437,9 @@ def gradient_penalty(f, real, fake, no_grad_set, cfg):
)
epsilon = 1e-16
norm = fluid.layers.sqrt(
fluid.layers.reduce_sum(fluid.layers.square(gradient), dim=1) + epsilon
)
norm = paddle.sqrt(paddle.sum(paddle.square(gradient), axis=1) + epsilon)
gp = fluid.layers.reduce_mean(fluid.layers.square(norm - 1.0))
gp = paddle.mean(paddle.square(norm - 1.0))
return gp
......@@ -451,7 +449,7 @@ def get_generator_loss(
fake_img = generator(image_real, label_trg)
rec_img = generator(fake_img, label_org)
g_loss_rec = fluid.layers.reduce_mean(
fluid.layers.abs(fluid.layers.elementwise_sub(image_real, rec_img))
paddle.abs(paddle.subtract(image_real, rec_img))
)
pred_fake, cls_fake = discriminator(fake_img)
......
......@@ -182,7 +182,7 @@ class TestDygraphTripleGrad(TestCase):
numel = z_np.size
z.stop_gradient = False
out = fluid.layers.sigmoid(paddle.matmul(x, y) + z)
out = paddle.nn.functional.sigmoid(paddle.matmul(x, y) + z)
out_np = out.numpy()
(dx_actual,) = self.grad([out], [x], create_graph=True)
......@@ -278,7 +278,7 @@ class TestDygraphTripleGradBradcastCase(TestCase):
numel = z_np.size
z.stop_gradient = False
out = fluid.layers.sigmoid(paddle.matmul(x, y) + z)
out = paddle.nn.functional.sigmoid(paddle.matmul(x, y) + z)
out_np = out.numpy()
(dx_actual,) = self.grad([out], [x], create_graph=True)
......
......@@ -87,7 +87,7 @@ class TestInplaceANBOpTraining(unittest.TestCase):
# a new Variable for fetch
bn = bn * 1.0
sigmoid = fluid.layers.sigmoid(bn)
sigmoid = paddle.nn.functional.sigmoid(bn)
out = fluid.layers.reduce_sum(sigmoid)
if not only_forward:
sgd_opt = fluid.optimizer.SGD(learning_rate=0.0)
......
......@@ -41,7 +41,7 @@ def lstm_net(
input=fc0, size=hid_dim * 4, is_reverse=False
)
lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
lstm_max_tanh = fluid.layers.tanh(lstm_max)
lstm_max_tanh = paddle.tanh(lstm_max)
fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
......
......@@ -3680,126 +3680,6 @@ class TestBook(LayerTest):
)
return out
def make_sigmoid(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.sigmoid(input, name='sigmoid')
return out
def make_exp(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.exp(input, name='exp')
return out
def make_tanh(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.tanh(input, name='tanh')
return out
def make_tanh_shrink(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.tanh_shrink(input, name='tanh_shrink')
return out
def make_sqrt(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.sqrt(input, name='sqrt')
return out
def make_abs(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.abs(input, name='abs')
return out
def make_ceil(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.ceil(input, name='ceil')
return out
def make_floor(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.floor(input, name='floor')
return out
def make_cos(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.cos(input, name='cos')
return out
def make_sin(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.sin(input, name='sin')
return out
def make_round(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.round(input, name='round')
return out
def make_reciprocal(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.reciprocal(input, name='reciprocal')
return out
def make_square(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.square(input, name='square')
return out
def make_softplus(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.softplus(input, name='softplus')
return out
def make_softsign(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.softsign(input, name='softsign')
return out
def make_mish(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
......@@ -3920,14 +3800,6 @@ class TestBook(LayerTest):
out = layers.scale(input, scale=scale_var)
return out
def make_softshrink(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
):
input = self._get_data(name="input", shape=[16], dtype="float32")
out = layers.softshrink(input, alpha=0.3)
return out
def make_iou_similarity(self):
with program_guard(
fluid.default_main_program(), fluid.default_startup_program()
......
......@@ -63,7 +63,7 @@ class TestLgammaOpApi(unittest.TestCase):
shape = (1, 4)
data = np.random.random(shape).astype(self.dtype) + 1
data_ = paddle.to_tensor(data)
out = paddle.fluid.layers.lgamma(data_)
out = paddle.lgamma(data_)
result = special.gammaln(data)
np.testing.assert_allclose(result, out.numpy(), rtol=1e-05)
paddle.enable_static()
......
......@@ -50,7 +50,7 @@ def lstm_net(use_feed):
input=fc0, size=hid_dim * 4, is_reverse=False
)
lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max')
lstm_max_tanh = fluid.layers.tanh(lstm_max)
lstm_max_tanh = paddle.tanh(lstm_max)
fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh')
prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=label)
......
......@@ -81,7 +81,7 @@ def simple_fc_net(img, label, use_py_func_op):
),
)
if not use_py_func_op:
hidden = fluid.layers.tanh(hidden)
hidden = paddle.tanh(hidden)
else:
new_hidden = (
fluid.default_main_program()
......
......@@ -316,7 +316,9 @@ class RecurrentOpTest2(RecurrentOpTest1):
bias_attr=False,
)
h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r))
h = paddle.nn.functional.sigmoid(
x=layers.elementwise_add(x=temp_l, y=temp_r)
)
rnn.update_memory(h_pre, h)
rnn.output(h)
......@@ -710,7 +712,9 @@ class RecurrentOpStopGradientTest(RecurrentOpTest1):
bias_attr=False,
)
h = layers.sigmoid(x=layers.elementwise_add(temp_l, temp_r))
h = paddle.nn.functional.sigmoid(
x=layers.elementwise_add(temp_l, temp_r)
)
rnn.update_memory(h_pre, h)
rnn.output(h)
......
......@@ -135,7 +135,7 @@ def bow_net(
input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]
)
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bow_tanh = fluid.layers.tanh(bow)
bow_tanh = paddle.tanh(bow)
fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
......@@ -225,7 +225,7 @@ class TestRegularizer(unittest.TestCase):
param_list = fluid.default_main_program().block(0).all_parameters()
para_sum = []
for para in param_list:
para_mul = fluid.layers.square(x=para)
para_mul = paddle.square(x=para)
para_sum.append(fluid.layers.reduce_sum(input=para_mul))
avg_cost_l2 += fluid.layers.sums(para_sum) * 0.5
......
......@@ -41,7 +41,7 @@ def bow_net(
input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]
)
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bow_tanh = fluid.layers.tanh(bow)
bow_tanh = paddle.tanh(bow)
fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
......@@ -133,7 +133,7 @@ class TestRegularizer(unittest.TestCase):
param_list = fluid.default_main_program().block(0).all_parameters()
para_sum = []
for para in param_list:
para_mul = fluid.layers.square(x=para)
para_mul = paddle.square(x=para)
para_sum.append(fluid.layers.reduce_sum(input=para_mul))
avg_cost_l2 += fluid.layers.sums(para_sum) * 0.5
......
......@@ -30,7 +30,7 @@ class Generator(fluid.dygraph.Layer):
def forward(self, x):
x = self.conv1(x)
x = fluid.layers.tanh(x)
x = paddle.tanh(x)
return x
......
......@@ -122,10 +122,10 @@ class SimpleLSTMRNN(fluid.Layer):
i, j, f, o = fluid.layers.split(
gate_input, num_or_sections=4, dim=-1
)
c = pre_cell * fluid.layers.sigmoid(f) + fluid.layers.sigmoid(
i
) * fluid.layers.tanh(j)
m = fluid.layers.tanh(c) * fluid.layers.sigmoid(o)
c = pre_cell * paddle.nn.functional.sigmoid(
f
) + paddle.nn.functional.sigmoid(i) * paddle.tanh(j)
m = paddle.tanh(c) * paddle.nn.functional.sigmoid(o)
self.hidden_array[k] = m
self.cell_array[k] = c
self._input = m
......
......@@ -94,7 +94,7 @@ class TestSyncBatchNormOpTraining(unittest.TestCase):
bn = fluid.layers.cast(bn, 'float32')
else:
bn = fluid.layers.cast(bn, 'float64')
sigmoid = fluid.layers.sigmoid(bn)
sigmoid = paddle.nn.functional.sigmoid(bn)
out = fluid.layers.reduce_sum(sigmoid)
if not sync_bn:
out = out / core.get_cuda_device_count()
......
......@@ -59,7 +59,7 @@ def bow_net(
input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim]
)
bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
bow_tanh = fluid.layers.tanh(bow)
bow_tanh = paddle.tanh(bow)
fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
......
......@@ -15,6 +15,7 @@
from functools import partial
import numpy as np
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
......@@ -156,7 +157,7 @@ def multi_head_attention(
# So, here define the softmax for temporary solution.
def __softmax(x, eps=1e-9):
exp_out = layers.exp(x=x)
exp_out = paddle.exp(x=x)
sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False)
return layers.elementwise_div(x=exp_out, y=sum_out, axis=0)
......
......@@ -209,7 +209,7 @@ class ClipGradForMOEByGlobalNorm(ClipGradBase):
global_norm_var = global_norm_var_normal + global_norm_var_moe
params_and_grads = []
global_norm_var = layers.sqrt(global_norm_var)
global_norm_var = paddle.sqrt(global_norm_var)
max_global_norm = layers.fill_constant(
shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm
)
......
......@@ -557,7 +557,7 @@ class ModelAverage(Optimizer):
sum = layers.cast(
x=sum, dtype='float32' if self._dtype is None else self._dtype
)
layers.ops._elementwise_div(x=sum, y=tmp, out=param)
paddle.tensor.ops._elementwise_div(x=sum, y=tmp, out=param)
def _add_average_restore_op(self, block, param):
param = block._clone_variable(param)
......
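A minimal usage sketch of the replacement pattern applied throughout these test files, assuming Paddle 2.x where the paddle.* and paddle.nn.functional.* entry points below are available (the tensor values are placeholders, not taken from the diff): element-wise math ops move from fluid.layers to paddle, and activations move to paddle.nn.functional.

import numpy as np
import paddle

# placeholder input, positive values so sqrt is well defined
x = paddle.to_tensor(np.random.uniform(0.1, 1.0, [4]).astype('float32'))

y_sqrt = paddle.sqrt(x)                  # replaces fluid.layers.sqrt(x)
y_tanh = paddle.tanh(x)                  # replaces fluid.layers.tanh(x)
y_sig = paddle.nn.functional.sigmoid(x)  # replaces fluid.layers.sigmoid(x)
y_gelu = paddle.nn.functional.gelu(x)    # replaces fluid.layers.gelu(x)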