cherry pick move api, test=release/1.4

494d8ca6 · lujun · 463f88a7 · 494d8ca6 · 494d8ca6 · 494d8ca6
25 changed file
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -13,6 +13,7 @@ paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, d
 paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7d9a51fc9cf3c5245b5227080a8064c3'))
 paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '4c0cd83f0b401fc2ff84c70974e5d210'))
 paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912'))
+paddle.fluid.in_dygraph_mode (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f06314a1cb30c96b5808dde2219c2dae'))
 paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
 paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '9c7decb955b9c4f718114179c8985581'))

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -66,6 +66,8 @@ from . import compiler
 from .compiler import *
 from paddle.fluid.layers.math_op_patch import monkey_patch_variable
 from . import install_check
+from .dygraph.nn import *
+from .dygraph.layers import *
 Tensor = LoDTensor

--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -22,7 +22,7 @@ __all__ = ['enabled', 'guard', 'to_variable']
 def enabled():
-    return framework._in_dygraph_mode()
+    return framework.in_dygraph_mode()
 @signature_safe_contextmanager

--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
@@ -97,20 +97,12 @@ def load_persistables(vardict, dirname, filename=None):
    Examples:
        .. code-block:: python
-            my_layer = layer(fluid.dygraph.Layer)
+            my_layer = layer(fluid.Layer)
            param_path = "./my_paddle_model"
            param_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path)
            param_1 = param_dict['PtbModel_0.w_1']
-            or:
-            my_layer = layer(fluid.dygraph.Layer)
-            param_path = "./my_paddle_model"
-            filename = "model.file"
-            param_dict = fluid.dygraph.load_persistables(my_layer.state_dict(), param_path,
-                                                                       filename=filename)
-            param_1 = param_dict['PtbModel_0.w_1']
        """
    if isinstance(vardict, collections.OrderedDict):
        return _load_var_from_file(vardict, dirname, filename)

--- a/python/paddle/fluid/dygraph/layer_object_helper.py
+++ b/python/paddle/fluid/dygraph/layer_object_helper.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 import copy
 import six
-from ..framework import Parameter, _in_dygraph_mode
+from ..framework import Parameter, in_dygraph_mode
 from ..param_attr import ParamAttr
 from .. import core
 from six.moves import zip

--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -139,14 +139,14 @@ class Layer(core.Layer):
    def clear_gradients(self):
        for p in self.parameters():
-            p._clear_gradient()
+            p.clear_gradient()
-    def _build_once(self, *args):
+    def build_once(self, *args):
        pass
    def __call__(self, *inputs):
        if not self._built:
-            self._build_once(*inputs)
+            self.build_once(*inputs)
        outputs = self.forward(*inputs)
        self._built = True

--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -19,7 +19,7 @@ from six.moves import reduce
 from .. import core
 from ..layers import utils
 from . import layers
-from ..framework import Variable, _in_dygraph_mode, OpProtoHolder, Parameter
+from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter
 from ..param_attr import ParamAttr
 from ..initializer import Normal, Constant, NumpyArrayInitializer
 import numpy as np
@@ -33,6 +33,109 @@ __all__ = [
 class Conv2D(layers.Layer):
+    """
+    The convolution2D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input and
+    Output are in NCHW format, where N is batch size, C is the number of
+    channels, H is the height of the feature, and W is the width of the feature.
+    Filter is in MCHW format, where M is the number of output image channels,
+    C is the number of input image channels, H is the height of the filter,
+    and W is the width of the filter. If the groups is greater than 1,
+    C will equal the number of input image channels divided by the groups.
+    Please refer to UFLDL's `convolution
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
+    for more details.
+    If bias attribution and activation type are provided, bias is added to the
+    output of the convolution, and the corresponding activation function is
+    applied to the final result.
+    For each input :math:`X`, the equation is:
+    .. math::
+        Out = \\sigma (W \\ast X + b)
+    Where:
+    * :math:`X`: Input value, a tensor with NCHW format.
+    * :math:`W`: Filter value, a tensor with MCHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+    Example:
+        - Input:
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
+        - Output:
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+        Where
+        .. math::
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+    Args:
+        input (Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filters. It is the same as the number
+            of output image channels.
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride (int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        padding (int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups (int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1.
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
+            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically. Default: None
+    Returns:
+        Variable: The tensor variable storing the convolution and \
+                  non-linearity activation result.
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+    Examples:
+        .. code-block:: python
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
+    """
    def __init__(self,
                 name_scope,
                 num_channels,
@@ -265,7 +368,7 @@ class Conv3D(layers.Layer):
        self._param_attr = param_attr
        self._bias_attr = bias_attr
-    def _build_once(self, input):
+    def build_once(self, input):
        num_channels = input.shape[1]
        self._dtype = self._helper.input_dtype(input)
@@ -332,6 +435,116 @@ class Conv3D(layers.Layer):
 class Conv3DTranspose(layers.Layer):
+    """
+    **Convolution3D transpose layer**
+    The convolution3D transpose layer calculates the output based on the input,
+    filter, and dilations, strides, paddings. Input(Input) and output(Output)
+    are in NCDHW format. Where N is batch size, C is the number of channels,
+    D is the depth of the feature, H is the height of the feature, and W
+    is the width of the feature. Parameters(dilations, strides, paddings) are
+    three elements. These three elements represent depth, height and width, respectively.
+    The details of convolution transpose layer, please refer to the following
+    explanation and references `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
+    For each input :math:`X`, the equation is:
+    .. math::
+        Out = \\sigma (W \\ast X + b)
+    In the above equation:
+    * :math:`X`: Input value, a tensor with NCDHW format.
+    * :math:`W`: Filter value, a tensor with MCDHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+    Example:
+        - Input:
+          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+          Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
+        - Output:
+          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+        Where
+        .. math::
+           D_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
+           H_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
+           W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1
+    Args:
+        input(Variable): The input image with [N, C, D, H, W] format.
+        num_filters(int): The number of the filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain three integers, (image_D, image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
+            Otherwise, the filter will be a cube. None if output_size is used to
+            calculate filter_size.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
+            padding_D = padding_H = padding_W = padding. Default: padding = 0.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
+            stride_D = stride_H = stride_W = stride. Default: stride = 1.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
+            dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups(int): The groups number of the Conv3d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups=1
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d_transpose.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv3d_transpose
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None.
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+    Returns:
+        Variable: The tensor variable storing the convolution transpose result.
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+    Examples:
+       .. code-block:: python
+          conv3d_transpose = nn.Conv3DTranspose(
+                'Conv3DTranspose',
+                num_filters=12,
+                filter_size=12,
+                use_cudnn=False)
+          transpose_res = conv3d_transpose(base.to_variable(input_array))
+    """
    def __init__(self,
                 name_scope,
                 num_filters,
@@ -362,7 +575,7 @@ class Conv3DTranspose(layers.Layer):
        self._bias_attr = bias_attr
        self._act = act
-    def _build_once(self, input):
+    def build_once(self, input):
        self._dtype = self._helper.input_dtype(input)
        self._input_channel = input.shape[1]
@@ -436,6 +649,54 @@ class Conv3DTranspose(layers.Layer):
 class Pool2D(layers.Layer):
+    """
+    ${comment}
+    Args:
+        input (Variable): The input tensor of pooling operator. The format of
+                          input tensor is NCHW, where N is batch size, C is
+                          the number of channels, H is the height of the
+                          feature, and W is the width of the feature.
+        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        pool_type: ${pooling_type_comment}
+        pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int.
+        pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple,
+            it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
+            Otherwise, the pool padding size will be a square of an int.
+        global_pooling (bool): ${global_pooling_comment}
+        use_cudnn (bool): ${use_cudnn_comment}
+        ceil_mode (bool): ${ceil_mode_comment}
+        name (str|None): A name for this layer(optional). If set None, the
+                        layer will be named automatically.
+        exclusive (bool): Whether to exclude padding points in average pooling
+                          mode, default is true
+    Returns:
+        Variable: The pooling result.
+    Raises:
+        ValueError: If 'pool_type' is neither "max" nor "avg"
+        ValueError: If 'global_pooling' is False and 'pool_size' is -1
+        ValueError: If 'use_cudnn' is not a bool value.
+    Examples:
+        .. code-block:: python
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+          pool2d = fluid.Pool2D("pool2d",pool_size=2,
+                            pool_type='max',
+                            pool_stride=1,
+                            global_pooling=False)
+          pool2d_res = pool2d(data)
+    """
    def __init__(self,
                 name_scope,
                 pool_size=-1,
@@ -495,6 +756,102 @@ class Pool2D(layers.Layer):
 class FC(layers.Layer):
+    """
+    **Fully Connected Layer**
+    This function creates a fully connected layer in the network. It can take
+    one or multiple tensors as its inputs(input can be a list of Variable, see
+    Args in detail). It creates a variable called weights for each input tensor,
+    which represents a fully connected weight matrix from each input unit to
+    each output unit. The fully connected layer multiplies each input tensor
+    with its corresponding weight to produce an output Tensor with shape [M, `size`],
+    where M is batch size. If multiple input tensors are given, the results of
+    multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
+    is not None, a bias variable will be created and added to the output.
+    Finally, if activation is not None, it will be applied to the output as well.
+    When the input is single tensor:
+    .. math::
+        Out = Act({XW + b})
+    When the input are multiple tensors:
+    .. math::
+        Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+    In the above equation:
+    * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
+    * :math:`X_i`: The i-th input tensor.
+    * :math:`W_i`: The i-th weights matrix corresponding i-th input tensor.
+    * :math:`b`: The bias parameter created by this layer (if needed).
+    * :math:`Act`: The activation function.
+    * :math:`Out`: The output tensor.
+    See below for an example.
+    .. code-block:: text
+        Given:
+            data_1.data = [[[0.1, 0.2],
+                           [0.3, 0.4]]]
+            data_1.shape = (1, 2, 2) # 1 is batch_size
+            data_2 = [[[0.1, 0.2, 0.3]]]
+            data_2.shape = (1, 1, 3)
+            out = fluid.layers.fc(input=[data_1, data_2], size=2)
+        Then:
+            out.data = [[0.18669507, 0.1893476]]
+            out.shape = (1, 2)
+    Args:
+        input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+            the input tensor(s) is at least 2.
+        size(int): The number of output units in this layer.
+        num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
+            two dimensions. If this happens, the multidimensional tensor will first be flattened
+            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
+            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
+            dimensions will be flattened to form the first dimension of the final matrix (height of
+            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
+            form the second dimension of the final matrix (width of the matrix). For example, suppose
+            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
+        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+            parameters/weights of this layer.
+        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
+            of this layer. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
+        act (str, default None): Activation to be applied to the output of this layer.
+        is_test(bool): A flag indicating whether execution is in test phase.
+        name (str, default None): The name of this layer.
+    Returns:
+        Variable: The transformation result.
+    Raises:
+        ValueError: If rank of the input tensor is less than 2.
+    Examples:
+        .. code-block:: python
+          # when input is single tensor
+          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+          fc = fluid.FC("fc", size=1000, act="tanh")
+          fc_res = fc(data)
+          # when input are multiple tensors
+          data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32")
+          data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32")
+          fc = fluid.FC("fc", size=1000, act="tanh")
+          fc_res = fc([data_1, data_2])
+    """
    def __init__(self,
                 name_scope,
                 size,
@@ -522,7 +879,7 @@ class FC(layers.Layer):
        assert isinstance(value, Parameter)
        self.__w[i] = value
-    def _build_once(self, input):
+    def build_once(self, input):
        i = 0
        for inp, param in self._helper.iter_inputs_and_params(input,
                                                              self._param_attr):
@@ -591,6 +948,91 @@ class FC(layers.Layer):
 class BatchNorm(layers.Layer):
+    """
+    **Batch Normalization Layer**
+    Can be used as a normalizer function for conv2d and fully_connected operations.
+    The required data format for this layer is one of the following:
+    1. NHWC `[batch, in_height, in_width, in_channels]`
+    2. NCHW `[batch, in_channels, in_height, in_width]`
+    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
+    for more details.
+    :math:`input` is the input features over a mini-batch.
+    ..  math::
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+    When use_global_stats = True, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+    They are global (or running) statistics. (They are usually obtained from a
+    pre-trained model.)
+    The training and testing (or inference) have the same behavior:
+    ..  math::
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}}  \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta
+    Args:
+        input(variable): The rank of input variable can be 2, 3, 4, 5.
+        act(string, Default None): Activation type, linear|relu|prelu|...
+        is_test (bool, Default False): A flag indicating whether it is in
+            test phase or not.
+        momentum(float, Default 0.9): The value used for the moving_mean and
+            moving_var computation. The updated formula is:
+            :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
+            :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
+            Default is 0.9.
+        epsilon(float, Default 1e-05): A value added to the denominator for
+            numerical stability. Default is 1e-5.
+        param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
+             of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+             will create ParamAttr as param_attr. If the Initializer of the param_attr
+             is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm.
+             If it is set to None or one attribute of ParamAttr, batch_norm
+             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+             is not set, the bias is initialized zero. Default: None.
+        data_layout(string, default NCHW): NCHW|NHWC
+        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
+        name(string, Default None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+        moving_mean_name(string, Default None): The name of moving_mean which stores the global Mean.
+        moving_variance_name(string, Default None): The name of the moving_variance which stores the global Variance.
+        do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
+        fuse_with_relu (bool): if True, this OP performs relu after batch norm.
+        use_global_stats(bool, Default False): Whether to use global mean and
+            variance. In inference or test mode, set use_global_stats to true
+            or is_test to true, and the behavior is equivalent.
+            In train mode, when setting use_global_stats True, the global mean
+            and variance are also used during train period.
+    Returns:
+        Variable: A tensor variable which is the result after applying batch normalization on the input.
+    Examples:
+        .. code-block:: python
+            fc = fluid.FC('fc', size=200, param_attr='fc1.w')
+            hidden1 = fc(x)
+            batch_norm = fluid.BatchNorm("batch_norm", 10)
+            hidden2 = batch_norm(hidden1)
+    """
    def __init__(self,
                 name_scope,
                 num_channels,
@@ -629,7 +1071,7 @@ class BatchNorm(layers.Layer):
            dtype=self._dtype,
            default_initializer=Constant(1.0))
        if use_global_stats and self._param_attr.learning_rate == 0.:
-            self._scale._stop_gradient = True
+            self._scale.stop_gradient = True
        self._bias = self.create_parameter(
            attr=self._param_attr,
@@ -637,7 +1079,7 @@ class BatchNorm(layers.Layer):
            dtype=self._dtype,
            is_bias=True)
        if use_global_stats and self._param_attr.learning_rate == 0.:
-            self._bias._stop_gradient = True
+            self._bias.stop_gradient = True
        self._mean = self.create_parameter(
            attr=ParamAttr(
@@ -647,7 +1089,7 @@ class BatchNorm(layers.Layer):
                do_model_average=do_model_average_for_mean_and_var),
            shape=param_shape,
            dtype=self._dtype)
-        self._mean._stop_gradient = True
+        self._mean.stop_gradient = True
        self._variance = self.create_parameter(
            attr=ParamAttr(
@@ -657,7 +1099,7 @@ class BatchNorm(layers.Layer):
                do_model_average=do_model_average_for_mean_and_var),
            shape=param_shape,
            dtype=self._dtype)
-        self._variance._stop_gradient = True
+        self._variance.stop_gradient = True
        self._in_place = in_place
        self._momentum = momentum
@@ -666,7 +1108,7 @@ class BatchNorm(layers.Layer):
        self._fuse_with_relu = fuse_with_relu
        self._use_global_stats = use_global_stats
-    def _build_once(self, input):
+    def build_once(self, input):
        pass
    def forward(self, input):
@@ -747,7 +1189,7 @@ class Embedding(layers.Layer):
          dict_size = len(dataset.ids)
          input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32')
-          embedding = fluid.dygraph.Embedding(size=[dict_size, 16])
+          embedding = fluid.Embedding(size=[dict_size, 16])
          fc = embedding(input)
    """
@@ -797,15 +1239,6 @@ class Embedding(layers.Layer):
 class LayerNorm(layers.Layer):
-    def __init__(self,
-                 name_scope,
-                 scale=True,
-                 shift=True,
-                 begin_norm_axis=1,
-                 epsilon=1e-05,
-                 param_attr=None,
-                 bias_attr=None,
-                 act=None):
    """
    ${comment}
@@ -861,6 +1294,15 @@ class LayerNorm(layers.Layer):
        >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
    """
+    def __init__(self,
+                 name_scope,
+                 scale=True,
+                 shift=True,
+                 begin_norm_axis=1,
+                 epsilon=1e-05,
+                 param_attr=None,
+                 bias_attr=None,
+                 act=None):
        super(LayerNorm, self).__init__(name_scope)
        self._scale = scale
        self._shift = shift
@@ -870,7 +1312,7 @@ class LayerNorm(layers.Layer):
        self._bias_attr = bias_attr
        self._act = act
-    def _build_once(self, input):
+    def build_once(self, input):
        self._dtype = self._helper.input_dtype(input)
        input_shape = input.shape
        param_shape = [
@@ -1232,7 +1674,7 @@ class NCE(layers.Layer):
            'remote_prefetch': remote_prefetch
        }
-    def _build_once(self, input, label, sample_weight=None):
+    def build_once(self, input, label, sample_weight=None):
        assert isinstance(input, Variable)
        assert isinstance(label, Variable)
@@ -1318,7 +1760,7 @@ class PRelu(layers.Layer):
            raise ValueError('mode should be one of all, channel, element.')
        self._alpha_shape = [1]
-    def _build_once(self, input):
+    def build_once(self, input):
        if self._mode == 'channel':
            self._alpha_shape = [1, input.shape[1], 1, 1]
        elif self._mode == 'element':
@@ -1396,7 +1838,7 @@ class BilinearTensorProduct(layers.Layer):
        self._name = name
        self._inputs = dict()
-    def _build_once(self, x, y):
+    def build_once(self, x, y):
        self._dtype = self._helper.input_dtype(x)
        param_shape = [self._size, x.shape[1], y.shape[1]]
@@ -1572,7 +2014,7 @@ class Conv2DTranspose(layers.Layer):
        self._output_size = output_size
        self._op_type = 'conv2d_transpose'
-    def _build_once(self, input):
+    def build_once(self, input):
        input_channel = input.shape[1]
        if (input_channel == self._groups and
                self._num_filters == input_channel and not self._use_cudnn):
@@ -1686,7 +2128,7 @@ class SequenceConv(layers.Layer):
                 bias_attr=None,
                 param_attr=None,
                 act=None):
-        assert not _in_dygraph_mode(
+        assert not in_dygraph_mode(
        ), "SequenceConv is not supported by dynamic graph mode yet!"
        super(SequenceConv, self).__init__(name_scope)
        self._num_filters = num_filters
@@ -1696,7 +2138,7 @@ class SequenceConv(layers.Layer):
        self._bias_attr = bias_attr
        self._param_attr = param_attr
-    def _build_once(self, input):
+    def build_once(self, input):
        self._dtype = self._helper.input_dtype(input)
        filter_shape = [self._filter_size * input.shape[1], self._num_filters]
        self._filter_param = self.create_parameter(
@@ -1726,14 +2168,14 @@ class RowConv(layers.Layer):
                 future_context_size,
                 param_attr=None,
                 act=None):
-        assert not _in_dygraph_mode(
+        assert not in_dygraph_mode(
        ), "RowConv is not supported by dynamic graph mode yet!"
        super(RowConv, self).__init__(name_scope)
        self._act = act
        self._param_attr = param_attr
        self._future_context_size = future_context_size
-    def _build_once(self, input):
+    def build_once(self, input):
        self._dtype = self._helper.input_dtype(input)
        filter_shape = [self._future_context_size + 1, input.shape[1]]
        self._filter_param = self.create_parameter(
@@ -1796,7 +2238,7 @@ class GroupNorm(layers.Layer):
        if data_layout != 'NCHW':
            raise ValueError("unsupported data layout:" + data_layout)
-    def _build_once(self, input):
+    def build_once(self, input):
        self._dtype = self._helper.input_dtype(input)
        param_shape = [input.shape[1]]
        if self._bias_attr:
@@ -1849,7 +2291,7 @@ class SpectralNorm(layers.Layer):
        self._eps = eps
        self._dim = dim
-    def _build_once(self, weight):
+    def build_once(self, weight):
        self._dtype = self._helper.input_dtype(weight)
        input_shape = weight.shape
        h = input_shape[self._dim]
@@ -1904,7 +2346,7 @@ class TreeConv(layers.Layer):
        self._bias_attr = bias_attr
        self._param_attr = param_attr
-    def _build_once(self, nodes_vector, edge_set):
+    def build_once(self, nodes_vector, edge_set):
        assert isinstance(nodes_vector, Variable)
        assert isinstance(edge_set, Variable)
        self._dtype = self._helper.input_dtype(nodes_vector)

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -67,6 +67,7 @@ __all__ = [
    'cuda_places',
    'cpu_places',
    'cuda_pinned_places',
+    'in_dygraph_mode',
 ]
 EMPTY_VAR_NAME = core.kEmptyVarName()
@@ -79,7 +80,10 @@ _dygraph_tracer_ = None
 _dygraph_current_expected_place_ = None
-def _in_dygraph_mode():
+def in_dygraph_mode():
+    '''
+    Returns(bool): True if the program is running in dynamic graph mode
+    '''
    return _dygraph_tracer_ is not None
@@ -396,7 +400,7 @@ class Variable(object):
            if not isinstance(dtype, core.VarDesc.VarType):
                dtype = convert_np_dtype_to_dtype_(dtype)
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            # record vars in tracer rather than blocks
            self._ivar = kwargs.get("ivar", None)
            if not self._ivar:
@@ -482,21 +486,21 @@ class Variable(object):
            self.block.vars[name] = self
            self.op = None
-            self.stop_gradient = stop_gradient
+            self._stop_gradient = stop_gradient
            self.is_data = is_data
-    def _numpy(self):
+    def numpy(self):
        new_ivar = self._ivar._copy_to(core.CPUPlace(), True)
        return np.array(new_ivar.value().get_tensor())
-    def _backward(self):
+    def backward(self):
        self._ivar._run_backward()
-    def _gradient(self):
+    def gradient(self):
        new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
        return np.array(new_ivar.value().get_tensor())
-    def _clear_gradient(self):
+    def clear_gradient(self):
        self._ivar._clear_gradient()
    def __str__(self):
@@ -516,7 +520,7 @@ class Variable(object):
        Returns:
            str: The debug string.
        """
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            # TODO(panyx0718): add more dygraph debug info.
            return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype,
                                                     self.shape)
@@ -535,7 +539,7 @@ class Variable(object):
    __repr__ = __str__
-    def _set_desc(self, input):
+    def set_desc(self, input):
        """
        Set the variable description.
@@ -548,43 +552,43 @@ class Variable(object):
        self.desc = input
    @property
-    def _stop_gradient(self):
+    def stop_gradient(self):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            return self._ivar.stop_gradient
        else:
-            return self.stop_gradient
+            return self._stop_gradient
-    @_stop_gradient.setter
+    @stop_gradient.setter
-    def _stop_gradient(self, s):
+    def stop_gradient(self, s):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            self._ivar.stop_gradient = s
        else:
-            self.stop_gradient = s
+            self._stop_gradient = s
    @property
    def persistable(self):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            return self._ivar.persistable
        else:
            return self.desc.persistable()
    @persistable.setter
    def persistable(self, p):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
+            # In dygraph mode the flag is read from the underlying ivar (see
+            # the getter), so the setter must assign it there; the previous
+            # `return self._ivar.persistable` silently dropped `p`.
-            return self._ivar.persistable
+            self._ivar.persistable = p
        else:
            self.desc.set_persistable(p)
    @property
    def name(self):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            return self._ivar.name
        else:
            return cpt.to_text(self.desc.name())
    @name.setter
    def name(self, new_name):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            self._ivar.name = new_name
        else:
            self.desc.set_name(new_name)
@@ -592,14 +596,14 @@ class Variable(object):
    @property
    def shape(self):
        # convert to tuple, make it as same as numpy API.
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            return self._ivar.shape
        else:
            return tuple(self.desc.shape())
    @property
    def dtype(self):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            return self._ivar.dtype
        else:
            return self.desc.dtype()
@@ -611,7 +615,7 @@ class Variable(object):
    @property
    def type(self):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            return self._ivar.dtype
        else:
            return self.desc.type()
@@ -721,7 +725,7 @@ class Variable(object):
                name=unique_name.generate(".".join(self.name)),
                dtype=self.dtype,
                persistable=self.persistable,
-                stop_gradient=self._stop_gradient, )
+                stop_gradient=self.stop_gradient, )
        else:
            return self
@@ -930,7 +934,7 @@ class Operator(object):
                 inputs=None,
                 outputs=None,
                 attrs=None):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            if type is None:
                raise ValueError(
                    "`type` to initialized an Operator can not be None.")
@@ -1049,7 +1053,7 @@ class Operator(object):
                    for arg in out_args:
                        out_arg_names.append(cpt.to_text(arg.name))
                        # TODO(minqiyang): could we remove variable's op in static mode?
-                        if not _in_dygraph_mode():
+                        if not in_dygraph_mode():
                            arg.op = self
                    self.desc.set_output(out_proto.name, out_arg_names)
@@ -1095,7 +1099,7 @@ class Operator(object):
    @property
    def type(self):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            return self.iop.type
        else:
            return self.desc.type()
@@ -1638,7 +1642,7 @@ class Block(object):
        Returns:
            Operator: the append Operator.
        """
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            op = Operator(
                block=self,
                desc=None,
@@ -1710,7 +1714,7 @@ class Block(object):
        return self.ops[start:end]
    def _prepend_op(self, *args, **kwargs):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            op = Operator(
                self,
                None,

--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -165,7 +165,7 @@ class ConstantInitializer(Initializer):
                'force_cpu': self._force_cpu or force_init_on_cpu()
            },
            stop_gradient=True)
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
            var.op = op
        return op
@@ -245,7 +245,7 @@ class UniformInitializer(Initializer):
                attrs={"in_dtype": out_var.dtype,
                       "out_dtype": var.dtype})
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
            var.op = op
        return op
@@ -324,7 +324,7 @@ class NormalInitializer(Initializer):
                outputs={"Out": var},
                attrs={"in_dtype": out_var.dtype,
                       "out_dtype": var.dtype})
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
            var.op = op
        return op
@@ -403,7 +403,7 @@ class TruncatedNormalInitializer(Initializer):
                outputs={"Out": var},
                attrs={"in_dtype": out_var.dtype,
                       "out_dtype": var.dtype})
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
            var.op = op
        return op
@@ -509,7 +509,7 @@ class XavierInitializer(Initializer):
                    "seed": self._seed
                },
                stop_gradient=True)
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
            var.op = op
        return op
@@ -610,7 +610,7 @@ class MSRAInitializer(Initializer):
                    "seed": self._seed
                },
                stop_gradient=True)
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
            var.op = op
        return op
@@ -709,7 +709,7 @@ class BilinearInitializer(Initializer):
                'shape': list(shape),
                value_name: values
            })
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
            var.op = op
        return op
@@ -768,7 +768,7 @@ class NumpyArrayInitializer(Initializer):
                value_name: values
            },
            stop_gradient=True)
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
            var.op = op
        return op

--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 import copy
 import six
-from .framework import Parameter, dtype_is_floating, _in_dygraph_mode
+from .framework import Parameter, dtype_is_floating, in_dygraph_mode
 from . import unique_name
 from paddle.fluid.initializer import Constant, Xavier
 from .param_attr import ParamAttr

--- a/python/paddle/fluid/layer_helper_base.py
+++ b/python/paddle/fluid/layer_helper_base.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 import copy
 import numpy as np
-from .framework import Variable, default_main_program, default_startup_program, _in_dygraph_mode, _current_expected_place
+from .framework import Variable, default_main_program, default_startup_program, in_dygraph_mode, _current_expected_place
 from . import unique_name
 from .param_attr import ParamAttr, WeightNormParamAttr
 from . import core
@@ -54,7 +54,7 @@ class LayerHelperBase(object):
        Return Variable construct from value
        """
        if isinstance(value, np.ndarray):
-            assert _in_dygraph_mode(
+            assert in_dygraph_mode(
            ), "to_variable could only be called in dygraph mode"
            if not block:
@@ -302,7 +302,7 @@ class LayerHelperBase(object):
            param = self._create_weight_normalize(attr, shape, dtype)
            WeightNormParamAttr.params_with_weight_norm.append(param)
            return param
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            # In dygraph mode, we want the returned parameter to be
            # initialized so that it can be used imperatively.
            return self.main_program.global_block().create_parameter(
@@ -370,7 +370,7 @@ class LayerHelperBase(object):
               initializer: initializer to use
        """
        assert isinstance(var, Variable)
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
            initializer(var, var.block)
        else:
            self.startup_program.global_block().create_var(

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -23,7 +23,7 @@ import os
 import inspect
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant, NumpyArrayInitializer
-from ..framework import Variable, OpProtoHolder, _in_dygraph_mode
+from ..framework import Variable, OpProtoHolder, in_dygraph_mode
 from ..dygraph import base
 from ..param_attr import ParamAttr
 from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
@@ -3288,7 +3288,7 @@ def layer_norm(input,
        >>>                          dtype='float32')
        >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
    """
-    assert _in_dygraph_mode(
+    assert in_dygraph_mode(
    ) is not True, "please use FC instead of fc in dygraph mode!"
    helper = LayerHelper('layer_norm', **locals())
    dtype = helper.input_dtype()
@@ -6454,7 +6454,7 @@ def squeeze(input, axes, name=None):
            x = layers.data(name='x', shape=[5, 1, 10])
            y = layers.sequeeze(input=x, axes=[1])
    """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
        "squeeze layer is not supported in dygraph mode yet.")
    helper = LayerHelper("squeeze", **locals())
    out = helper.create_variable_for_type_inference(dtype=input.dtype)
@@ -9193,7 +9193,7 @@ def _elementwise_op(helper):
    op_type = helper.layer_type
    x = helper.kwargs.get('x', None)
    y = helper.kwargs.get('y', None)
-    if _in_dygraph_mode():
+    if in_dygraph_mode():
        x = base.to_variable(x)
        y = base.to_variable(y)

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -55,7 +55,7 @@ class Optimizer(object):
    """
    def __init__(self, learning_rate, regularization=None, name=None):
-        if framework._in_dygraph_mode():
+        if framework.in_dygraph_mode():
            if not isinstance(learning_rate, float) and \
                    not isinstance(learning_rate, LearningRateDecay):
                raise TypeError(
@@ -205,7 +205,7 @@ class Optimizer(object):
            name = self._name + "_" + name
        if (name in self._accumulators and
                param.name in self._accumulators[name]):
-            if framework._in_dygraph_mode():
+            if framework.in_dygraph_mode():
                return self._accumulators[name][param.name]
            raise Exception("Accumulator {} already exists for parameter {}".
                            format(name, param.name))
@@ -275,7 +275,7 @@ class Optimizer(object):
        self._create_global_learning_rate()
        optimize_ops = []
-        if framework._in_dygraph_mode():
+        if framework.in_dygraph_mode():
            for param_and_grad in parameters_and_grads:
                if param_and_grad[1] is None:
                    continue
@@ -374,7 +374,7 @@ class Optimizer(object):
            See examples in `apply_gradients`.
        """
        self._dtype = loss.dtype
-        if framework._in_dygraph_mode():
+        if framework.in_dygraph_mode():
            if parameter_list is not None:
                parameters = parameter_list
            else:
@@ -459,7 +459,7 @@ class Optimizer(object):
        Returns:
            list: A list of operators appended to the current program.
        """
-        if framework._in_dygraph_mode():
+        if framework.in_dygraph_mode():
            with program_guard(framework.default_main_program(),
                               framework.default_startup_program()):
                optimize_ops = self._create_optimization_pass(params_grads)

--- a/python/paddle/fluid/tests/unittests/test_base_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_base_layer.py
@@ -18,7 +18,7 @@ import numpy as np
 import paddle.fluid as fluid
-class L1(fluid.dygraph.Layer):
+class L1(fluid.Layer):
    def __init__(self, prefix):
        super(L1, self).__init__(prefix)
        self._param_attr = fluid.ParamAttr(
@@ -32,7 +32,7 @@ class L1(fluid.dygraph.Layer):
        return self.w1 + self.w2
-class L2(fluid.dygraph.Layer):
+class L2(fluid.Layer):
    def __init__(self, prefix):
        super(L2, self).__init__(prefix)
        self.layer1 = L1(self.full_name())
@@ -42,7 +42,7 @@ class L2(fluid.dygraph.Layer):
        return self.layer1() + self.layer2()
-class L3(fluid.dygraph.Layer):
+class L3(fluid.Layer):
    def __init__(self, prefix):
        super(L3, self).__init__(prefix)
        self.layer1 = L2(self.full_name())
@@ -59,7 +59,7 @@ class TestBaseLayer(unittest.TestCase):
            ret = l()
            self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0")
            self.assertEqual(l.w2.name, "test_one_level/L1_0.w_1")
-            self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))
+            self.assertTrue(np.allclose(ret.numpy(), 0.2 * np.ones([2, 2])))
    def test_three_level(self):
        with fluid.dygraph.guard():
@@ -72,7 +72,7 @@ class TestBaseLayer(unittest.TestCase):
            self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1.w_1")
            self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0.w_0")
            self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0.w_1")
-            self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2])))
+            self.assertTrue(np.allclose(ret.numpy(), 0.8 * np.ones([2, 2])))
 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@@ -18,11 +18,11 @@ import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid import core
-from paddle.fluid.dygraph.nn import FC
+from paddle.fluid import FC
 from test_imperative_base import new_program_scope
-class MyLayer(fluid.dygraph.Layer):
+class MyLayer(fluid.Layer):
    def __init__(self, name_scope):
        super(MyLayer, self).__init__(name_scope)
@@ -34,7 +34,7 @@ class MyLayer(fluid.dygraph.Layer):
        return [x]
-class MyPyLayer(fluid.dygraph.PyLayer):
+class MyPyLayer(fluid.PyLayer):
    def __init__(self):
        super(MyPyLayer, self).__init__()
@@ -48,7 +48,7 @@ class MyPyLayer(fluid.dygraph.PyLayer):
        return np.array(dout) * (1 - np.square(np.array(out)))
-class MLP(fluid.dygraph.Layer):
+class MLP(fluid.Layer):
    def __init__(self, name_scope):
        super(MLP, self).__init__(name_scope)
        self._fc1 = FC(self.full_name(),
@@ -71,7 +71,7 @@ class MLP(fluid.dygraph.Layer):
        return x
-class SimpleRNNCell(fluid.dygraph.Layer):
+class SimpleRNNCell(fluid.Layer):
    def __init__(self, name_scope, step_input_size, hidden_size, output_size,
                 param_attr):
        super(SimpleRNNCell, self).__init__(name_scope)
@@ -81,7 +81,7 @@ class SimpleRNNCell(fluid.dygraph.Layer):
        self._dtype = core.VarDesc.VarType.FP32
        self.param_attr = param_attr
-    def _build_once(self, inputs, pre_hidden):
+    def build_once(self, inputs, pre_hidden):
        i2h_param_shape = [self.step_input_size, self.hidden_size]
        h2h_param_shape = [self.hidden_size, self.hidden_size]
        h2o_param_shape = [self.output_size, self.hidden_size]
@@ -159,7 +159,7 @@ class SimpleRNNCell(fluid.dygraph.Layer):
        return reduce_out, hidden
-class SimpleRNN(fluid.dygraph.Layer):
+class SimpleRNN(fluid.Layer):
    def __init__(self, name_scope):
        super(SimpleRNN, self).__init__(name_scope)
        self.seq_len = 4
@@ -200,22 +200,22 @@ class TestImperative(unittest.TestCase):
                inputs.append(fluid.dygraph.base.to_variable(x))
            ret = fluid.layers.sums(inputs)
            loss = fluid.layers.reduce_sum(ret)
-            loss._backward()
+            loss.backward()
-            self.assertTrue(np.allclose(ret._numpy(), x * 10))
+            self.assertTrue(np.allclose(ret.numpy(), x * 10))
-            self.assertTrue(np.allclose(inputs[0]._gradient(), x))
+            self.assertTrue(np.allclose(inputs[0].gradient(), x))
    def test_layer(self):
        with fluid.dygraph.guard():
            cl = core.Layer()
            cl.forward([])
-            l = fluid.dygraph.Layer("l")
+            l = fluid.Layer("l")
            self.assertRaises(NotImplementedError, l.forward, [])
    def test_pylayer_func_id(self):
        with fluid.dygraph.guard():
-            class PyLayer1(fluid.dygraph.PyLayer):
+            class PyLayer1(fluid.PyLayer):
                def __init__(self):
                    super(PyLayer1, self).__init__()
@@ -257,9 +257,9 @@ class TestImperative(unittest.TestCase):
            my_py_layer = MyPyLayer()
            var_inp = fluid.dygraph.base.to_variable(np_inp)
            outs = my_py_layer(var_inp)
-            dy_out = np.sum(outs[0]._numpy())
+            dy_out = np.sum(outs[0].numpy())
-            outs[0]._backward()
+            outs[0].backward()
-            dy_grad = var_inp._gradient()
+            dy_grad = var_inp.gradient()
        with new_program_scope():
            inp = fluid.layers.data(
@@ -287,9 +287,9 @@ class TestImperative(unittest.TestCase):
            l = MyLayer("my_layer")
            x = l(var_inp)[0]
            self.assertIsNotNone(x)
-            dy_out = x._numpy()
+            dy_out = x.numpy()
-            x._backward()
+            x.backward()
-            dy_grad = l._x_for_debug._gradient()
+            dy_grad = l._x_for_debug.gradient()
        with new_program_scope():
            inp = fluid.layers.data(
@@ -314,9 +314,9 @@ class TestImperative(unittest.TestCase):
            var_inp = fluid.dygraph.base.to_variable(np_inp)
            mlp = MLP("mlp")
            out = mlp(var_inp)
-            dy_out = out._numpy()
+            dy_out = out.numpy()
-            out._backward()
+            out.backward()
-            dy_grad = mlp._fc1._w._gradient()
+            dy_grad = mlp._fc1._w.gradient()
        with new_program_scope():
            inp = fluid.layers.data(
@@ -358,11 +358,11 @@ class TestImperative(unittest.TestCase):
            var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3])
            simple_rnn = SimpleRNN("simple_rnn")
            outs, pre_hiddens = simple_rnn.forward(var_inp)
-            dy_out = outs[3]._numpy()
+            dy_out = outs[3].numpy()
-            outs[3]._backward()
+            outs[3].backward()
-            dy_grad_h2o = simple_rnn._cell._h2o_w._gradient()
+            dy_grad_h2o = simple_rnn._cell._h2o_w.gradient()
-            dy_grad_h2h = simple_rnn._cell._h2h_w._gradient()
+            dy_grad_h2h = simple_rnn._cell._h2h_w.gradient()
-            dy_grad_i2h = simple_rnn._cell._i2h_w._gradient()
+            dy_grad_i2h = simple_rnn._cell._i2h_w.gradient()
        with new_program_scope():
            inp = fluid.layers.data(

--- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
@@ -18,11 +18,11 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+from paddle.fluid import Conv2D, Pool2D, FC
 from paddle.fluid.dygraph.base import to_variable
-class SimpleImgConvPool(fluid.dygraph.Layer):
+class SimpleImgConvPool(fluid.Layer):
    def __init__(self,
                 name_scope,
                 num_channels,
@@ -71,7 +71,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
        return x
-class MNIST(fluid.dygraph.Layer):
+class MNIST(fluid.Layer):
    def __init__(self, name_scope):
        super(MNIST, self).__init__(name_scope)
@@ -125,21 +125,21 @@ class TestDygraphCheckpoint(unittest.TestCase):
                    img = to_variable(dy_x_data)
                    label = to_variable(y_data)
-                    label._stop_gradient = True
+                    label.stop_gradient = True
                    cost = mnist(img)
                    loss = fluid.layers.cross_entropy(cost, label)
                    avg_loss = fluid.layers.mean(loss)
-                    dy_out = avg_loss._numpy()
+                    dy_out = avg_loss.numpy()
-                    avg_loss._backward()
+                    avg_loss.backward()
                    sgd.minimize(avg_loss)
                    fluid.dygraph.save_persistables(mnist, "save_dir")
                    mnist.clear_gradients()
                    for param in mnist.parameters():
-                        dy_param_init_value[param.name] = param._numpy()
+                        dy_param_init_value[param.name] = param.numpy()
                    mnist.load_dict(
                        fluid.dygraph.load_persistables(mnist, "save_dir"))

--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -32,11 +32,11 @@ NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5))
 NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1))
-class DMF(fluid.dygraph.Layer):
+class DMF(fluid.Layer):
    def __init__(self, name_scope):
        super(DMF, self).__init__(name_scope)
-        self._user_latent = fluid.dygraph.FC(self.full_name(), 256)
+        self._user_latent = fluid.FC(self.full_name(), 256)
-        self._item_latent = fluid.dygraph.FC(self.full_name(), 256)
+        self._item_latent = fluid.FC(self.full_name(), 256)
        self._user_layers = []
        self._item_layers = []
@@ -45,13 +45,11 @@ class DMF(fluid.dygraph.Layer):
            self._user_layers.append(
                self.add_sublayer(
                    'user_layer_%d' % i,
-                    fluid.dygraph.FC(
+                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
-                        self.full_name(), self._hid_sizes[i], act='relu')))
            self._item_layers.append(
                self.add_sublayer(
                    'item_layer_%d' % i,
-                    fluid.dygraph.FC(
+                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
-                        self.full_name(), self._hid_sizes[i], act='relu')))
    def forward(self, users, items):
        users = self._user_latent(users)
@@ -63,19 +61,18 @@ class DMF(fluid.dygraph.Layer):
        return fluid.layers.elementwise_mul(users, items)
-class MLP(fluid.dygraph.Layer):
+class MLP(fluid.Layer):
    def __init__(self, name_scope):
        super(MLP, self).__init__(name_scope)
-        self._user_latent = fluid.dygraph.FC(self.full_name(), 256)
+        self._user_latent = fluid.FC(self.full_name(), 256)
-        self._item_latent = fluid.dygraph.FC(self.full_name(), 256)
+        self._item_latent = fluid.FC(self.full_name(), 256)
        self._match_layers = []
        self._hid_sizes = [128, 64]
        for i in range(len(self._hid_sizes)):
            self._match_layers.append(
                self.add_sublayer(
                    'match_layer_%d' % i,
-                    fluid.dygraph.FC(
+                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
-                        self.full_name(), self._hid_sizes[i], act='relu')))
        self._mat
    def forward(self, users, items):
@@ -88,7 +85,7 @@ class MLP(fluid.dygraph.Layer):
        return match_vec
-class DeepCF(fluid.dygraph.Layer):
+class DeepCF(fluid.Layer):
    def __init__(self, name_scope, num_users, num_items, matrix):
        super(DeepCF, self).__init__(name_scope)
        self._num_users = num_users
@@ -99,11 +96,11 @@ class DeepCF(fluid.dygraph.Layer):
            matrix.dtype,
            is_bias=False,
            default_initializer=fluid.initializer.NumpyArrayInitializer(matrix))
-        self._rating_matrix._stop_gradient = True
+        self._rating_matrix.stop_gradient = True
        self._mlp = MLP(self.full_name())
        self._dmf = DMF(self.full_name())
-        self._match_fc = fluid.dygraph.FC(self.full_name(), 1, act='sigmoid')
+        self._match_fc = fluid.FC(self.full_name(), 1, act='sigmoid')
    def forward(self, users, items):
        # users_emb = self._user_emb(users)
@@ -255,10 +252,10 @@ class TestDygraphDeepCF(unittest.TestCase):
                        fluid.layers.log_loss(prediction,
                                              to_variable(labels_np[
                                                  slice:slice + BATCH_SIZE])))
-                    loss._backward()
+                    loss.backward()
                    adam.minimize(loss)
                    deepcf.clear_gradients()
-                    dy_loss = loss._numpy()
+                    dy_loss = loss.numpy()
                    sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss))
        self.assertEqual(static_loss, dy_loss)

--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -22,12 +22,12 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+from paddle.fluid import Conv2D, Pool2D, FC
 from test_imperative_base import new_program_scope
 from paddle.fluid.dygraph.base import to_variable
-class Discriminator(fluid.dygraph.Layer):
+class Discriminator(fluid.Layer):
    def __init__(self, name_scope):
        super(Discriminator, self).__init__(name_scope)
        self._fc1 = FC(self.full_name(), size=32, act='elu')
@@ -38,7 +38,7 @@ class Discriminator(fluid.dygraph.Layer):
        return self._fc2(x)
-class Generator(fluid.dygraph.Layer):
+class Generator(fluid.Layer):
    def __init__(self, name_scope):
        super(Generator, self).__init__(name_scope)
        self._fc1 = FC(self.full_name(), size=64, act='elu')
@@ -150,7 +150,7 @@ class TestDygraphGAN(unittest.TestCase):
                    x=d_fake, label=to_variable(np.zeros([2, 1], np.float32))))
            d_loss = d_loss_real + d_loss_fake
-            d_loss._backward()
+            d_loss.backward()
            sgd.minimize(d_loss)
            discriminator.clear_gradients()
            generator.clear_gradients()
@@ -160,15 +160,15 @@ class TestDygraphGAN(unittest.TestCase):
            g_loss = fluid.layers.reduce_mean(
                fluid.layers.sigmoid_cross_entropy_with_logits(
                    x=d_fake, label=to_variable(np.ones([2, 1], np.float32))))
-            g_loss._backward()
+            g_loss.backward()
            sgd.minimize(g_loss)
            for p in discriminator.parameters():
-                dy_params[p.name] = p._numpy()
+                dy_params[p.name] = p.numpy()
            for p in generator.parameters():
-                dy_params[p.name] = p._numpy()
+                dy_params[p.name] = p.numpy()
-            dy_g_loss = g_loss._numpy()
+            dy_g_loss = g_loss.numpy()
-            dy_d_loss = d_loss._numpy()
+            dy_d_loss = d_loss.numpy()
        self.assertEqual(dy_g_loss, static_g_loss)
        self.assertEqual(dy_d_loss, static_d_loss)

--- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
@@ -15,14 +15,12 @@
 import contextlib
 import unittest
 import numpy as np
-import six
 import sys
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.optimizer import AdamOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
 from test_imperative_base import new_program_scope
 from paddle.fluid.dygraph.base import to_variable
@@ -31,7 +29,7 @@ def gen_data():
    pass
-class GraphConv(fluid.dygraph.Layer):
+class GraphConv(fluid.Layer):
    def __init__(self, name_scope, in_features, out_features):
        super(GraphConv, self).__init__(name_scope)
@@ -50,7 +48,7 @@ class GraphConv(fluid.dygraph.Layer):
        return fluid.layers.matmul(adj, support) + self.bias
-class GCN(fluid.dygraph.Layer):
+class GCN(fluid.Layer):
    def __init__(self, name_scope, num_hidden):
        super(GCN, self).__init__(name_scope)
        self.gc = GraphConv(self.full_name(), num_hidden, 32)
@@ -134,10 +132,9 @@ class TestDygraphGNN(unittest.TestCase):
            loss = fluid.layers.reduce_sum(loss)
            adam = AdamOptimizer(learning_rate=1e-3)
            adam.minimize(loss)
-            self.assertEqual(static_loss, loss._numpy())
+            self.assertEqual(static_loss, loss.numpy())
-            self.assertTrue(
+            self.assertTrue(np.allclose(static_weight, model.gc.weight.numpy()))
-                np.allclose(static_weight, model.gc.weight._numpy()))
+            sys.stderr.write('%s %s\n' % (static_loss, loss.numpy()))
-            sys.stderr.write('%s %s\n' % (static_loss, loss._numpy()))
 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
@@ -128,25 +128,25 @@ class TestImperativeMnist(unittest.TestCase):
                    img = to_variable(dy_x_data)
                    label = to_variable(y_data)
-                    label._stop_gradient = True
+                    label.stop_gradient = True
                    cost = mnist(img)
                    loss = fluid.layers.cross_entropy(cost, label)
                    avg_loss = fluid.layers.mean(loss)
-                    dy_out = avg_loss._numpy()
+                    dy_out = avg_loss.numpy()
                    if epoch == 0 and batch_id == 0:
                        for param in mnist.parameters():
-                            dy_param_init_value[param.name] = param._numpy()
+                            dy_param_init_value[param.name] = param.numpy()
-                    avg_loss._backward()
+                    avg_loss.backward()
                    sgd.minimize(avg_loss)
                    mnist.clear_gradients()
                    dy_param_value = {}
                    for param in mnist.parameters():
-                        dy_param_value[param.name] = param._numpy()
+                        dy_param_value[param.name] = param.numpy()
        with new_program_scope():
            fluid.default_startup_program().random_seed = seed

--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -28,7 +28,7 @@ from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
-class MLP(fluid.dygraph.Layer):
+class MLP(fluid.Layer):
    def __init__(self, name_scope, param_attr=None, bias_attr=None):
        super(MLP, self).__init__(name_scope)
@@ -75,18 +75,18 @@ class TestImperativeOptimizerBase(unittest.TestCase):
                cost = mlp(img)
                avg_loss = fluid.layers.reduce_mean(cost)
-                dy_out = avg_loss._numpy()
+                dy_out = avg_loss.numpy()
                if batch_id == 0:
                    for param in mlp.parameters():
-                        dy_param_init_value[param.name] = param._numpy()
+                        dy_param_init_value[param.name] = param.numpy()
-                avg_loss._backward()
+                avg_loss.backward()
                optimizer.minimize(avg_loss)
                mlp.clear_gradients()
                dy_param_value = {}
                for param in mlp.parameters():
-                    dy_param_value[param.name] = param._numpy()
+                    dy_param_value[param.name] = param.numpy()
        with new_program_scope():
            fluid.default_startup_program().random_seed = seed

--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -24,10 +24,9 @@ from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
 import numpy as np
 import six
-from paddle.fluid.backward import append_backward
-class SimpleLSTMRNN(fluid.dygraph.Layer):
+class SimpleLSTMRNN(fluid.Layer):
    def __init__(self,
                 name_scope,
                 hidden_size,
@@ -45,7 +44,7 @@ class SimpleLSTMRNN(fluid.dygraph.Layer):
        self.cell_array = []
        self.hidden_array = []
-    def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
+    def build_once(self, input_embedding, init_hidden=None, init_cell=None):
        self.weight_1_arr = []
        self.weight_2_arr = []
        self.bias_arr = []
@@ -132,7 +131,7 @@ class SimpleLSTMRNN(fluid.dygraph.Layer):
        return real_res, last_hidden, last_cell
-class PtbModel(fluid.dygraph.Layer):
+class PtbModel(fluid.Layer):
    def __init__(self,
                 name_scope,
                 hidden_size,
@@ -177,7 +176,7 @@ class PtbModel(fluid.dygraph.Layer):
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale))
-    def _build_once(self, input, label, init_hidden, init_cell):
+    def build_once(self, input, label, init_hidden, init_cell):
        pass
    def forward(self, input, label, init_hidden, init_cell):
@@ -260,13 +259,13 @@ class TestDygraphPtbRnn(unittest.TestCase):
                                                            init_cell)
                if i == 0:
                    for param in ptb_model.parameters():
-                        dy_param_init[param.name] = param._numpy()
+                        dy_param_init[param.name] = param.numpy()
-                dy_loss._backward()
+                dy_loss.backward()
                sgd.minimize(dy_loss)
                ptb_model.clear_gradients()
                if i == batch_num - 1:
                    for param in ptb_model.parameters():
-                        dy_param_updated[param.name] = param._numpy()
+                        dy_param_updated[param.name] = param.numpy()
        with new_program_scope():
            fluid.default_startup_program().random_seed = seed
@@ -333,10 +332,10 @@ class TestDygraphPtbRnn(unittest.TestCase):
                    for k in range(3, len(out)):
                        static_param_updated[static_param_name_list[k -
                                                                    3]] = out[k]
-        self.assertTrue(np.allclose(static_loss_value, dy_loss._numpy()))
+        self.assertTrue(np.allclose(static_loss_value, dy_loss.numpy()))
-        self.assertTrue(np.allclose(static_last_cell_value, last_cell._numpy()))
+        self.assertTrue(np.allclose(static_last_cell_value, last_cell.numpy()))
        self.assertTrue(
-            np.allclose(static_last_hidden_value, last_hidden._numpy()))
+            np.allclose(static_last_hidden_value, last_hidden.numpy()))
        for key, value in six.iteritems(static_param_init):
            # print("static_init name: {}, value {}".format(key, value))
            # print("dy_init name: {}, value {}".format(key, dy_param_init[key]))

--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -21,7 +21,7 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
+from paddle.fluid import Conv2D, Pool2D, BatchNorm, FC
 from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
@@ -68,7 +68,7 @@ def optimizer_setting(params):
    return optimizer
-class ConvBNLayer(fluid.dygraph.Layer):
+class ConvBNLayer(fluid.Layer):
    def __init__(self,
                 name_scope,
                 num_channels,
@@ -99,7 +99,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
        return y
-class BottleneckBlock(fluid.dygraph.Layer):
+class BottleneckBlock(fluid.Layer):
    def __init__(self,
                 name_scope,
                 num_channels,
@@ -156,7 +156,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
        return layer_helper.append_activation(y)
-class ResNet(fluid.dygraph.Layer):
+class ResNet(fluid.Layer):
    def __init__(self, name_scope, layers=50, class_dim=102):
        super(ResNet, self).__init__(name_scope)
@@ -247,7 +247,7 @@ class TestDygraphResnet(unittest.TestCase):
            dy_param_init_value = {}
            for param in resnet.parameters():
-                dy_param_init_value[param.name] = param._numpy()
+                dy_param_init_value[param.name] = param.numpy()
            for batch_id, data in enumerate(train_reader()):
                if batch_id >= batch_num:
@@ -260,20 +260,20 @@ class TestDygraphResnet(unittest.TestCase):
                img = to_variable(dy_x_data)
                label = to_variable(y_data)
-                label._stop_gradient = True
+                label.stop_gradient = True
                out = resnet(img)
                loss = fluid.layers.cross_entropy(input=out, label=label)
                avg_loss = fluid.layers.mean(x=loss)
-                dy_out = avg_loss._numpy()
+                dy_out = avg_loss.numpy()
                if batch_id == 0:
                    for param in resnet.parameters():
                        if param.name not in dy_param_init_value:
-                            dy_param_init_value[param.name] = param._numpy()
+                            dy_param_init_value[param.name] = param.numpy()
-                avg_loss._backward()
+                avg_loss.backward()
                dy_grad_value = {}
                for param in resnet.parameters():
@@ -288,7 +288,7 @@ class TestDygraphResnet(unittest.TestCase):
                dy_param_value = {}
                for param in resnet.parameters():
-                    dy_param_value[param.name] = param._numpy()
+                    dy_param_value[param.name] = param.numpy()
        with new_program_scope():
            fluid.default_startup_program().random_seed = seed

--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
@@ -16,7 +16,8 @@ from __future__ import print_function
 import unittest
 import paddle.fluid as fluid
-from paddle.fluid.dygraph import Embedding, LayerNorm, FC, to_variable, Layer, guard
+from paddle.fluid import Embedding, LayerNorm, FC, Layer
+from paddle.fluid.dygraph import to_variable, guard
 from test_imperative_base import new_program_scope
 from paddle.fluid import core
 import numpy as np
@@ -985,15 +986,15 @@ class TestDygraphTransformer(unittest.TestCase):
                if i == 0:
                    for param in transformer.parameters():
-                        dy_param_init[param.name] = param._numpy()
+                        dy_param_init[param.name] = param.numpy()
-                dy_avg_cost._backward()
+                dy_avg_cost.backward()
                optimizer.minimize(dy_avg_cost)
                transformer.clear_gradients()
                if i == batch_num - 1:
                    for param in transformer.parameters():
-                        dy_param_updated[param.name] = param._numpy()
+                        dy_param_updated[param.name] = param.numpy()
        with new_program_scope():
            fluid.default_startup_program().random_seed = seed
@@ -1069,13 +1070,13 @@ class TestDygraphTransformer(unittest.TestCase):
                                                                    4]] = out[k]
        self.assertTrue(
-            np.array_equal(static_avg_cost_value, dy_avg_cost._numpy()))
+            np.array_equal(static_avg_cost_value, dy_avg_cost.numpy()))
        self.assertTrue(
-            np.array_equal(static_sum_cost_value, dy_sum_cost._numpy()))
+            np.array_equal(static_sum_cost_value, dy_sum_cost.numpy()))
        self.assertTrue(
-            np.array_equal(static_predict_value, dy_predict._numpy()))
+            np.array_equal(static_predict_value, dy_predict.numpy()))
        self.assertTrue(
-            np.array_equal(static_token_num_value, dy_token_num._numpy()))
+            np.array_equal(static_token_num_value, dy_token_num.numpy()))
        for key, value in six.iteritems(static_param_init):
            self.assertTrue(np.array_equal(value, dy_param_init[key]))
        for key, value in six.iteritems(static_param_updated):

--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -102,7 +102,7 @@ class TestLayer(LayerTest):
            dy_ret = lm(base.to_variable(inp))
        self.assertTrue(np.allclose(static_ret, static_ret2))
-        self.assertTrue(np.allclose(dy_ret._numpy(), static_ret2))
+        self.assertTrue(np.allclose(dy_ret.numpy(), static_ret2))
    def test_relu(self):
        with self.static_graph():
@@ -116,7 +116,7 @@ class TestLayer(LayerTest):
            t = np.ones([3, 3], dtype='float32')
            dy_ret = layers.relu(base.to_variable(t))
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
    def test_matmul(self):
        with self.static_graph():
@@ -137,7 +137,7 @@ class TestLayer(LayerTest):
            t2 = np.ones([3, 3], dtype='float32')
            dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2))
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
    def test_conv2d(self):
        with self.static_graph():
@@ -164,7 +164,7 @@ class TestLayer(LayerTest):
                'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2])
            dy_ret = conv2d(base.to_variable(images))
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
        self.assertTrue(np.allclose(static_ret, static_ret2))
    def test_gru_unit(self):
@@ -206,7 +206,7 @@ class TestLayer(LayerTest):
        for i in range(len(static_ret)):
            self.assertTrue(np.allclose(static_ret[i], static_ret2[i]))
-            self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy()))
+            self.assertTrue(np.allclose(static_ret[i], dy_ret[i].numpy()))
    def test_elementwise_math(self):
        n = np.ones([3, 3], dtype='float32')
@@ -248,8 +248,8 @@ class TestLayer(LayerTest):
            ret = layers.elementwise_sub(ret, n5)
            dy_ret = layers.elementwise_mul(ret, n6)
        self.assertTrue(
-            np.allclose(static_ret, dy_ret._numpy()),
+            np.allclose(static_ret, dy_ret.numpy()),
-            '%s vs %s' % (static_ret, dy_ret._numpy()))
+            '%s vs %s' % (static_ret, dy_ret.numpy()))
    def test_elementwise_minmax(self):
        n = np.ones([3, 3], dtype='float32')
@@ -259,8 +259,8 @@ class TestLayer(LayerTest):
            min_ret = layers.elementwise_min(n, n2)
            max_ret = layers.elementwise_max(n, n2)
-        self.assertTrue(np.allclose(n, min_ret._numpy()))
+        self.assertTrue(np.allclose(n, min_ret.numpy()))
-        self.assertTrue(np.allclose(n2, max_ret._numpy()))
+        self.assertTrue(np.allclose(n2, max_ret.numpy()))
    def test_sequence_conv(self):
        inp_np = np.arange(12).reshape([3, 4]).astype('float32')
@@ -327,7 +327,7 @@ class TestLayer(LayerTest):
                'conv2d_transpose', num_filters=10, output_size=28)
            dy_rlt = conv2d_transpose(base.to_variable(inp_np))
        self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
+        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
    def test_bilinear_tensor_product(self):
        inp_np_x = np.array([[1, 2, 3]]).astype('float32')
@@ -370,7 +370,7 @@ class TestLayer(LayerTest):
            dy_rlt = btp(base.to_variable(inp_np_x), base.to_variable(inp_np_y))
        self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
+        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
    def test_prelu(self):
        inp_np = np.ones([5, 200, 100, 100]).astype('float32')
@@ -411,7 +411,7 @@ class TestLayer(LayerTest):
            dy_rlt = prelu(base.to_variable(inp_np))
        self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
+        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
    def test_embeding(self):
        inp_word = np.array([[[1]]]).astype('int64')
@@ -444,7 +444,7 @@ class TestLayer(LayerTest):
            static_rlt3 = emb2(base.to_variable(inp_word))
        self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(static_rlt3._numpy(), static_rlt))
+        self.assertTrue(np.allclose(static_rlt3.numpy(), static_rlt))
    def test_nce(self):
        window_size = 5
@@ -558,7 +558,7 @@ class TestLayer(LayerTest):
            nce_loss3 = nce(embs3, words[label_word])
        self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(nce_loss3._numpy(), static_rlt))
+        self.assertTrue(np.allclose(nce_loss3.numpy(), static_rlt))
    def test_conv3d(self):
        with self.static_graph():
@@ -585,7 +585,7 @@ class TestLayer(LayerTest):
            conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2)
            dy_ret = conv3d(base.to_variable(images))
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
        self.assertTrue(np.allclose(static_ret, static_ret2))
    def test_row_conv(self):
@@ -679,7 +679,7 @@ class TestLayer(LayerTest):
            groupNorm = nn.GroupNorm('GroupNorm', groups=2)
            dy_ret = groupNorm(base.to_variable(input))
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
        self.assertTrue(np.allclose(static_ret, static_ret2))
    def test_spectral_norm(self):
@@ -729,7 +729,7 @@ class TestLayer(LayerTest):
            spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2)
            dy_ret = spectralNorm(base.to_variable(input))
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
        self.assertTrue(np.allclose(static_ret, static_ret2))
    def test_tree_conv(self):
@@ -802,7 +802,7 @@ class TestLayer(LayerTest):
            dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj))
        self.assertTrue(np.allclose(static_ret, static_ret2))
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
    def test_conv3d_transpose(self):
        input_array = np.arange(0, 48).reshape(
@@ -832,7 +832,7 @@ class TestLayer(LayerTest):
                use_cudnn=False)
            dy_rlt = conv3d_transpose(base.to_variable(input_array))
        self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
+        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
 class TestBook(unittest.TestCase):