Unverified · Commit 2a9d74f6 authored by Hongyu Liu, committed by GitHub

Add comment for dygraph api (#17869)

* add api comment; test=develop

* fix fc dtype bug; test=develop

* remove float32 in default parameter; test=develop

* fix example bug; test=develop

* fix build once; test=develop

* fix num_channels bug; test=develop

* fix install check failed bug; test=develop
Parent 209a3f4e
@@ -27,6 +27,10 @@ __all__ = [
 class LearningRateDecay(object):
     """
     Base class of learning rate decay
+
+    Defines the common interface of a LearningRateDecay.
+    Users should not use this class directly,
+    but should use one of its implementations.
     """

     def __init__(self, begin=0, step=1, dtype='float32'):
@@ -42,6 +46,14 @@ class LearningRateDecay(object):
         return lr

     def create_lr_var(self, lr):
+        """
+        Convert lr from a float to a Variable.
+
+        Args:
+            lr: learning rate.
+        Returns:
+            learning rate Variable.
+        """
         from .. import layers
         lr = layers.create_global_var(
             name=unique_name.generate("learning_rate"),
@@ -56,6 +68,40 @@ class LearningRateDecay(object):

 class PiecewiseDecay(LearningRateDecay):
+    """
+    Piecewise decay scheduler.
+
+    The algorithm can be described by the code below.
+
+    .. code-block:: text
+
+        boundaries = [10000, 20000]
+        values = [1.0, 0.5, 0.1]
+        if step < 10000:
+            learning_rate = 1.0
+        elif 10000 <= step < 20000:
+            learning_rate = 0.5
+        else:
+            learning_rate = 0.1
+
+    Args:
+        boundaries: A list of step numbers.
+        values: A list of learning rate values that will be picked at
+            the different step boundaries.
+        begin: The begin step used to initialize self.step_num.
+        step: The step size used when calculating the new step_num (default is 1).
+        dtype: The dtype used to create the learning rate variable.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            boundaries = [10000, 20000]
+            values = [1.0, 0.5, 0.1]
+            with fluid.dygraph.guard():
+                optimizer = fluid.optimizer.SGD(
+                    learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0))
+    """
+
     def __init__(self, boundaries, values, begin, step=1, dtype='float32'):
         super(PiecewiseDecay, self).__init__(begin, step, dtype)
         self.boundaries = boundaries
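As a quick sanity check on the schedule documented above, here is a minimal framework-free sketch of the piecewise lookup; the helper name `piecewise_lr` is ours, not part of this commit.

```python
import bisect

def piecewise_lr(step, boundaries, values):
    # `values` has one more entry than `boundaries`; pick the segment
    # that `step` falls into, matching the if/elif/else pseudocode.
    return values[bisect.bisect_right(boundaries, step)]

assert piecewise_lr(500, [10000, 20000], [1.0, 0.5, 0.1]) == 1.0
assert piecewise_lr(10000, [10000, 20000], [1.0, 0.5, 0.1]) == 0.5
assert piecewise_lr(25000, [10000, 20000], [1.0, 0.5, 0.1]) == 0.1
```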
@@ -73,6 +119,41 @@ class PiecewiseDecay(LearningRateDecay):

 class NaturalExpDecay(LearningRateDecay):
+    """
+    Applies natural exponential decay to the initial learning rate.
+
+    .. code-block:: python
+
+        if not staircase:
+            decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+        else:
+            decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps))
+
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+            will be the initial learning rate during training.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set to True, decay the learning rate every decay_steps.
+        begin: A Python `int32` number, the begin step (default is 0).
+        step: A Python `int32` number, the step size (default is 1).
+        dtype: A Python `str`, the dtype used to create the learning rate variable
+            (default is 'float32').
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            base_lr = 0.1
+            with fluid.dygraph.guard():
+                sgd_optimizer = fluid.optimizer.SGD(
+                    learning_rate=fluid.dygraph.NaturalExpDecay(
+                        learning_rate=base_lr,
+                        decay_steps=10000,
+                        decay_rate=0.5,
+                        staircase=True))
+    """
+
     def __init__(self,
                  learning_rate,
                  decay_steps,
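A quick numeric check of the two branches documented above, in plain Python (the helper name `natural_exp_lr` is ours, not a Paddle API):

```python
import math

def natural_exp_lr(lr, decay_rate, global_step, decay_steps, staircase=False):
    # staircase=True floors the ratio, so the rate drops once every
    # decay_steps steps instead of continuously.
    ratio = global_step / decay_steps
    if staircase:
        ratio = math.floor(ratio)
    return lr * math.exp(-decay_rate * ratio)

print(natural_exp_lr(0.1, 0.5, 15000, 10000))                  # ~0.04724
print(natural_exp_lr(0.1, 0.5, 15000, 10000, staircase=True))  # ~0.06065
```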
@@ -99,6 +180,45 @@ class NaturalExpDecay(LearningRateDecay):

 class ExponentialDecay(LearningRateDecay):
+    """
+    Applies exponential decay to the learning rate.
+
+    When training a model, it is often recommended to lower the learning rate as the
+    training progresses. By using this function, the learning rate will be decayed by
+    'decay_rate' every 'decay_steps' steps.
+
+    .. code-block:: python
+
+        if staircase == True:
+            decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
+        else:
+            decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
+
+    Args:
+        learning_rate(Variable|float): The initial learning rate.
+        decay_steps(int): See the decay computation above.
+        decay_rate(float): The decay rate. See the decay computation above.
+        staircase(Boolean): If True, decay the learning rate at discrete intervals.
+            Default: False
+        begin(int): The begin step (default is 0)
+        step(int): The step size (default is 1)
+        dtype(str): The dtype used to create the learning rate (default is 'float32')
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            base_lr = 0.1
+            with fluid.dygraph.guard():
+                sgd_optimizer = fluid.optimizer.SGD(
+                    learning_rate=fluid.dygraph.ExponentialDecay(
+                        learning_rate=base_lr,
+                        decay_steps=10000,
+                        decay_rate=0.5,
+                        staircase=True))
+    """
+
     def __init__(self,
                  learning_rate,
                  decay_steps,
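For comparison with the natural-exponential variant, the same two branches in plain Python (`exponential_lr` is our own helper name):

```python
import math

def exponential_lr(lr, decay_rate, global_step, decay_steps, staircase=False):
    # lr * decay_rate ** (global_step / decay_steps), with the exponent
    # floored when staircase=True.
    exponent = global_step / decay_steps
    if staircase:
        exponent = math.floor(exponent)
    return lr * decay_rate ** exponent

print(exponential_lr(0.1, 0.5, 25000, 10000))                  # ~0.01768
print(exponential_lr(0.1, 0.5, 25000, 10000, staircase=True))  # 0.025
```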
@@ -125,6 +245,43 @@ class ExponentialDecay(LearningRateDecay):

 class InverseTimeDecay(LearningRateDecay):
+    """
+    Applies inverse time decay to the initial learning rate.
+
+    When training a model, it is often recommended to lower the learning rate as the
+    training progresses. By using this function, an inverse decay function will be
+    applied to the initial learning rate.
+
+    .. code-block:: python
+
+        if staircase == True:
+            decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_steps))
+        else:
+            decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_steps)
+
+    Args:
+        learning_rate(Variable|float): The initial learning rate.
+        decay_steps(int): See the decay computation above.
+        decay_rate(float): The decay rate. See the decay computation above.
+        staircase(Boolean): If True, decay the learning rate at discrete intervals.
+            Default: False
+        begin(int): The begin step (default is 0)
+        step(int): The step size (default is 1)
+        dtype(str): The dtype used to create the learning rate (default is 'float32')
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            base_lr = 0.1
+            with fluid.dygraph.guard():
+                sgd_optimizer = fluid.optimizer.SGD(
+                    learning_rate=fluid.dygraph.InverseTimeDecay(
+                        learning_rate=base_lr,
+                        decay_steps=10000,
+                        decay_rate=0.5,
+                        staircase=True))
+    """
+
     def __init__(self,
                  learning_rate,
                  decay_steps,
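The same formula as an executable sketch (`inverse_time_lr` is our own name for it):

```python
import math

def inverse_time_lr(lr, decay_rate, global_step, decay_steps, staircase=False):
    # Hyperbolic decay: lr shrinks like 1 / (1 + decay_rate * t).
    ratio = global_step / decay_steps
    if staircase:
        ratio = math.floor(ratio)
    return lr / (1 + decay_rate * ratio)

print(inverse_time_lr(0.1, 0.5, 15000, 10000))                  # ~0.05714
print(inverse_time_lr(0.1, 0.5, 15000, 10000, staircase=True))  # ~0.06667
```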
@@ -151,6 +308,43 @@ class InverseTimeDecay(LearningRateDecay):

 class PolynomialDecay(LearningRateDecay):
+    """
+    Applies polynomial decay to the initial learning rate.
+
+    .. code-block:: text
+
+        if cycle:
+            decay_steps = decay_steps * ceil(global_step / decay_steps)
+        else:
+            global_step = min(global_step, decay_steps)
+        decayed_learning_rate = (learning_rate - end_learning_rate) *
+            (1 - global_step / decay_steps) ^ power + end_learning_rate
+
+    Args:
+        learning_rate(Variable|float32): A scalar float32 value or a Variable. This
+            will be the initial learning rate during training.
+        decay_steps(int32): A Python `int32` number.
+        end_learning_rate(float): A Python `float` number.
+        power(float): A Python `float` number.
+        cycle(bool): If True, extend decay_steps in multiples of itself so that
+            the decay restarts in each cycle (see the computation above).
+        begin(int): The begin step (default is 0)
+        step(int): The step size (default is 1)
+        dtype(str): The dtype used to create the learning rate (default is 'float32')
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            start_lr = 0.01
+            total_step = 5000
+            end_lr = 0
+            with fluid.dygraph.guard():
+                optimizer = fluid.optimizer.SGD(
+                    learning_rate=fluid.dygraph.PolynomialDecay(
+                        start_lr, total_step, end_lr, power=1.0))
+    """
+
     def __init__(self,
                  learning_rate,
                  decay_steps,
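The pseudocode above, including the `cycle` branch, as a runnable sketch (`polynomial_lr` is our own helper name):

```python
import math

def polynomial_lr(lr, end_lr, global_step, decay_steps, power=1.0, cycle=False):
    # Without cycle, progress is clamped at 1.0; with cycle, decay_steps
    # stretches so the schedule restarts after each pass.
    if cycle:
        decay_steps = decay_steps * max(1.0, math.ceil(global_step / decay_steps))
    else:
        global_step = min(global_step, decay_steps)
    return (lr - end_lr) * (1 - global_step / decay_steps) ** power + end_lr

print(polynomial_lr(0.01, 0.0, 2500, 5000))              # halfway, linear: 0.005
print(polynomial_lr(0.01, 0.0, 7500, 5000))              # clamped at end_lr: 0.0
print(polynomial_lr(0.01, 0.0, 7500, 5000, cycle=True))  # restarted cycle: 0.0025
```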
@@ -189,6 +383,35 @@ class PolynomialDecay(LearningRateDecay):

 class CosineDecay(LearningRateDecay):
+    """
+    Applies cosine decay to the learning rate.
+
+    When training a model, it is often recommended to lower the learning rate as the
+    training progresses. By using this function, the learning rate will be decayed
+    following a cosine decay strategy.
+
+    .. math::
+
+        decayed\_lr = learning\_rate * 0.5 * (\\cos(epoch * \\frac{\\pi}{epochs}) + 1)
+
+    Args:
+        learning_rate(Variable|float): The initial learning rate.
+        step_each_epoch(int): The number of steps in an epoch.
+        epochs(int): The number of epochs.
+        begin(int): The begin step (default is 0).
+        step(int): The step size (default is 1).
+        dtype(str): The dtype used to create the learning rate (default is 'float32').
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            base_lr = 0.1
+            with fluid.dygraph.guard():
+                optimizer = fluid.optimizer.SGD(
+                    learning_rate=fluid.dygraph.CosineDecay(
+                        base_lr, 10000, 120))
+    """
+
     def __init__(self,
                  learning_rate,
                  step_each_epoch,
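The corrected formula above is easy to verify numerically (the helper `cosine_lr` is our own sketch):

```python
import math

def cosine_lr(base_lr, epoch, epochs):
    # Anneals from base_lr at epoch 0 down to 0 at epoch == epochs.
    return base_lr * 0.5 * (math.cos(epoch * math.pi / epochs) + 1)

print(cosine_lr(0.1, 0, 120))    # 0.1
print(cosine_lr(0.1, 60, 120))   # 0.05
print(cosine_lr(0.1, 120, 120))  # ~0.0
```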
@@ -211,6 +434,45 @@ class CosineDecay(LearningRateDecay):

 class NoamDecay(LearningRateDecay):
+    """
+    Noam decay method. A numpy implementation of Noam decay is shown below.
+
+    .. code-block:: python
+
+        import numpy as np
+        # set hyper parameters
+        d_model = 2
+        current_steps = 20
+        warmup_steps = 200
+        # compute
+        lr_value = np.power(d_model, -0.5) * np.min([
+            np.power(current_steps, -0.5),
+            np.power(warmup_steps, -1.5) * current_steps])
+
+    Please refer to `Attention Is All You Need
+    <https://arxiv.org/pdf/1706.03762.pdf>`_.
+
+    Args:
+        d_model(Variable): The dimensionality of the input and output of the model.
+        warmup_steps(Variable): A hyperparameter controlling the warmup phase.
+        begin(int): The begin step (default is 0)
+        step(int): The step size (default is 1)
+        dtype(str): The dtype used to create the learning rate (default is 'float32')
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            warmup_steps = 100
+            learning_rate = 0.01
+            with fluid.dygraph.guard():
+                optimizer = fluid.optimizer.SGD(
+                    learning_rate=fluid.dygraph.NoamDecay(
+                        1 / (warmup_steps * (learning_rate ** 2)),
+                        warmup_steps))
+    """
+
     def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
         super(NoamDecay, self).__init__(begin, step, dtype)
         self.d_model = d_model
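To see the warmup-then-decay shape that the docstring's one-step numpy snippet implies, here is the same formula swept over several steps (`noam_lr` is our own helper name):

```python
import numpy as np

def noam_lr(d_model, step, warmup_steps):
    # Linear warmup up to warmup_steps, then decay like 1/sqrt(step).
    return np.power(d_model, -0.5) * min(
        np.power(step, -0.5), np.power(warmup_steps, -1.5) * step)

for s in [1, 50, 200, 800]:
    print(s, round(float(noam_lr(2, s, 200)), 5))
# the rate peaks exactly at step == warmup_steps (200) and decays afterwards
```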
...
@@ -84,7 +84,7 @@ class Conv2D(layers.Layer):
         W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1

     Args:
-        input (Variable): The input image with [N, C, H, W] format.
+        name_scope(str): The name for this class.
         num_filters(int): The number of filters. It is the same as the output
             image channel.
         filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
@@ -118,12 +118,6 @@ class Conv2D(layers.Layer):
             library is installed. Default: True
         act (str): Activation type, if it is set to None, activation is not appended.
             Default: None
-        name (str|None): A name for this layer (optional). If set None, the layer
-            will be named automatically. Default: None
-
-    Returns:
-        Variable: The tensor variable storing the convolution and \
-        non-linearity activation result.

     Raises:
         ValueError: If the shapes of input, filter_size, stride, padding and
@@ -132,24 +126,36 @@ class Conv2D(layers.Layer):

     Examples:
         .. code-block:: python

-            data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
-            conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
+            from paddle.fluid.dygraph.base import to_variable
+            import paddle.fluid as fluid
+            from paddle.fluid.dygraph import Conv2D
+            import numpy as np
+
+            data = np.random.uniform(-1, 1, [10, 3, 32, 32]).astype('float32')
+            with fluid.dygraph.guard():
+                conv2d = Conv2D("conv2d", 2, 3)
+                data = to_variable(data)
+                conv = conv2d(data)
     """
     def __init__(self,
                  name_scope,
-                 num_channels,
                  num_filters,
                  filter_size,
                  stride=1,
                  padding=0,
                  dilation=1,
                  groups=None,
-                 use_cudnn=True,
-                 act=None,
                  param_attr=None,
                  bias_attr=None,
-                 dtype=core.VarDesc.VarType.FP32):
+                 use_cudnn=True,
+                 act=None,
+                 dtype='float32'):
         assert param_attr is not False, "param_attr should not be False here."
         super(Conv2D, self).__init__(name_scope, dtype)
         self._groups = groups
@@ -160,7 +166,11 @@ class Conv2D(layers.Layer):
         if not isinstance(use_cudnn, bool):
             raise ValueError("use_cudnn should be True or False")
         self._use_cudnn = use_cudnn
-        self._num_channels = num_channels
+        self._filter_size = filter_size
+        self._num_filters = num_filters
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._dtype = dtype
         # if (self._num_channels == self._groups and
         #     num_filters % self._num_channels == 0 and not self._use_cudnn):
         #     self._l_type = 'depthwise_conv2d'
@@ -169,22 +179,26 @@ class Conv2D(layers.Layer):
         # kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275
         self._l_type = 'conv2d'
-        if groups is None:
-            num_filter_channels = num_channels
+    def _build_once(self, input):
+        self._num_channels = input.shape[1]
+        if self._groups is None:
+            num_filter_channels = self._num_channels
         else:
-            if num_channels % groups != 0:
+            if self._num_channels % self._groups != 0:
                 raise ValueError("num_channels must be divisible by groups.")
-            num_filter_channels = num_channels // groups
-        filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
-        filter_shape = [num_filters, int(num_filter_channels)] + filter_size
+            num_filter_channels = self._num_channels // self._groups
+        filter_size = utils.convert_to_list(self._filter_size, 2, 'filter_size')
+        filter_shape = [self._num_filters, int(num_filter_channels)
+                        ] + filter_size

         def _get_default_param_initializer():
-            filter_elem_num = filter_size[0] * filter_size[1] * num_channels
+            filter_elem_num = filter_size[0] * filter_size[
+                1] * self._num_channels
             std = (2.0 / filter_elem_num)**0.5
             return Normal(0.0, std, 0)

         self._filter_param = self.create_parameter(
-            attr=param_attr,
+            attr=self._param_attr,
             shape=filter_shape,
             dtype=self._dtype,
             default_initializer=_get_default_param_initializer())
@@ -204,8 +218,8 @@ class Conv2D(layers.Layer):
                 type=core.VarDesc.VarType.RAW)

         self._bias_param = self.create_parameter(
-            attr=bias_attr,
-            shape=[num_filters],
+            attr=self._bias_attr,
+            shape=[self._num_filters],
             dtype=self._dtype,
             is_bias=True)
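The hunks above move weight creation from `__init__` into `_build_once`, so the channel count is read from the first input's shape rather than passed as `num_channels`. A toy sketch of that deferred-build pattern (our own class, not Paddle code):

```python
import numpy as np

class LazyConv:
    """Toy version of the pattern: defer weight creation to the first call."""

    def __init__(self, num_filters, filter_size):
        self._num_filters = num_filters
        self._filter_size = filter_size
        self._weight_shape = None  # unknown until we see an input

    def __call__(self, x):
        if self._weight_shape is None:  # the "_build_once" moment
            num_channels = x.shape[1]   # inferred from the NCHW input
            self._weight_shape = [self._num_filters, num_channels,
                                  self._filter_size, self._filter_size]
        return self._weight_shape

conv = LazyConv(2, 3)
print(conv(np.zeros((10, 3, 32, 32))))  # [2, 3, 3, 3]
```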
@@ -653,14 +667,12 @@ class Conv3DTranspose(layers.Layer):

 class Pool2D(layers.Layer):
+    # TODO, should delete this class
     """
     ${comment}

     Args:
-        input (Variable): The input tensor of pooling operator. The format of
-            input tensor is NCHW, where N is batch size, C is
-            the number of channels, H is the height of the
-            feature, and W is the width of the feature.
+        name_scope(str): The name of this class.
         pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
             it must contain two integers, (pool_size_Height, pool_size_Width).
             Otherwise, the pool kernel size will be a square of an int.
@@ -814,8 +826,7 @@ class FC(layers.Layer):
         out.shape = (1, 2)

     Args:
-        input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
-            the input tensor(s) is at least 2.
+        name_scope(str): The name of this class.
         size(int): The number of output units in this layer.
         num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
             two dimensions. If this happens, the multidimensional tensor will first be flattened
@@ -833,10 +844,7 @@ class FC(layers.Layer):
             If it is set to None, the bias is initialized zero. Default: None.
         act (str, default None): Activation to be applied to the output of this layer.
         is_test(bool): A flag indicating whether execution is in test phase.
-        name (str, default None): The name of this layer.
-
-    Returns:
-        Variable: The transformation result.
+        dtype(str): The dtype used for the weight.

     Raises:
         ValueError: If rank of the input tensor is less than 2.
@@ -844,26 +852,27 @@ class FC(layers.Layer):

     Examples:
         .. code-block:: python

-            # when input is single tensor
-            data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-            fc = fluid.FC("fc", size=1000, act="tanh")
-            fc_res = fc(data)
-
-            # when input are multiple tensors
-            data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32")
-            data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32")
-            fc = fluid.FC("fc", size=1000, act="tanh")
-            fc_res = fc([data_1, data_2])
+            from paddle.fluid.dygraph.base import to_variable
+            import paddle.fluid as fluid
+            from paddle.fluid.dygraph import FC
+            import numpy as np
+
+            data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
+            with fluid.dygraph.guard():
+                fc = FC("fc", 64, num_flatten_dims=2)
+                data = to_variable(data)
+                out = fc(data)
     """
     def __init__(self,
                  name_scope,
                  size,
+                 num_flatten_dims=1,
                  param_attr=None,
                  bias_attr=None,
-                 num_flatten_dims=1,
-                 dtype=core.VarDesc.VarType.FP32,
-                 act=None):
+                 act=None,
+                 is_test=False,
+                 dtype="float32"):
         super(FC, self).__init__(name_scope, dtype)
         self._size = size
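To make the `num_flatten_dims=2` example above concrete, here is the flattening arithmetic in plain numpy (a sketch under our own names, not the FC implementation):

```python
import numpy as np

# With num_flatten_dims=2, a [30, 10, 32] input is flattened to a
# [30 * 10, 32] matrix, multiplied by a [32, 64] weight, and the result
# is reshaped back to [30, 10, 64].
data = np.random.uniform(-1, 1, [30, 10, 32]).astype('float32')
w = np.random.uniform(-1, 1, [32, 64]).astype('float32')
out = (data.reshape(-1, 32) @ w).reshape(30, 10, 64)
print(out.shape)  # (30, 10, 64)
```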
@@ -1048,7 +1057,7 @@ class BatchNorm(layers.Layer):
                  epsilon=1e-05,
                  param_attr=None,
                  bias_attr=None,
-                 dtype=core.VarDesc.VarType.FP32,
+                 dtype='float32',
                  data_layout='NCHW',
                  in_place=False,
                  moving_mean_name=None,
@@ -1064,8 +1073,8 @@ class BatchNorm(layers.Layer):
         assert bias_attr is not False, "bias_attr should not be False in batch_norm."

-        if dtype == core.VarDesc.VarType.FP16:
-            self._dtype = core.VarDesc.VarType.FP32
+        if dtype == "float16":
+            self._dtype = "float32"
         else:
             self._dtype = dtype
@@ -1444,6 +1453,7 @@ class GRUUnit(layers.Layer):
             Default: 'tanh'
         gate_activation (string): The activation type for gates (actGate).
             Default: 'sigmoid'
+        dtype(string): The dtype of the layer.

     Returns:
         tuple: The hidden value, reset-hidden value and gate values.
...
@@ -31,7 +31,7 @@ class SimpleLayer(Layer):
         super(SimpleLayer, self).__init__(name_scope)
         self._fc1 = nn.FC(self.full_name(),
                           3,
-                          ParamAttr(initializer=Constant(value=0.1)))
+                          param_attr=ParamAttr(initializer=Constant(value=0.1)))

     def forward(self, inputs):
         x = self._fc1(inputs)
...
@@ -55,7 +55,6 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
         self._conv2d = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=conv_stride,
...
@@ -47,7 +47,6 @@ class ConvBNLayer(fluid.dygraph.Layer):
         self._conv = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=stride,
...
@@ -51,7 +51,6 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
         self._conv2d = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=conv_stride,
...
@@ -25,7 +25,6 @@ from paddle.fluid.dygraph.base import to_variable
 class SimpleImgConvPool(fluid.Layer):
     def __init__(self,
                  name_scope,
-                 num_channels,
                  num_filters,
                  filter_size,
                  pool_size,
@@ -45,7 +44,6 @@ class SimpleImgConvPool(fluid.Layer):
         self._conv2d = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=conv_stride,
@@ -76,10 +74,10 @@ class MNIST(fluid.Layer):
         super(MNIST, self).__init__(name_scope)

         self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            self.full_name(), 1, 20, 5, 2, 2, act="relu")
+            self.full_name(), 20, 5, 2, 2, act="relu")

         self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            self.full_name(), 20, 50, 5, 2, 2, act="relu")
+            self.full_name(), 50, 5, 2, 2, act="relu")

         pool_2_shape = 50 * 4 * 4
         SIZE = 10
...
@@ -31,7 +31,6 @@ from test_imperative_base import new_program_scope
 class SimpleImgConvPool(fluid.dygraph.Layer):
     def __init__(self,
                  name_scope,
-                 num_channels,
                  num_filters,
                  filter_size,
                  pool_size,
@@ -51,7 +50,6 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
         self._conv2d = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=conv_stride,
@@ -82,10 +80,10 @@ class MNIST(fluid.dygraph.Layer):
         super(MNIST, self).__init__(name_scope)

         self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            self.full_name(), 1, 20, 5, 2, 2, act="relu")
+            self.full_name(), 20, 5, 2, 2, act="relu")

         self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            self.full_name(), 20, 50, 5, 2, 2, act="relu")
+            self.full_name(), 50, 5, 2, 2, act="relu")

         pool_2_shape = 50 * 4 * 4
         SIZE = 10
...
@@ -80,7 +80,6 @@ class ConvBNPool(fluid.dygraph.Layer):
         self.conv_0_layer = Conv2D(
             self.full_name(),
-            channels[0],
             out_ch[0],
             3,
             padding=1,
@@ -92,7 +91,6 @@ class ConvBNPool(fluid.dygraph.Layer):
             self.full_name(), out_ch[0], act=act, is_test=is_test)
         self.conv_1_layer = Conv2D(
             self.full_name(),
-            num_channels=channels[1],
             num_filters=out_ch[1],
             filter_size=3,
             padding=1,
...
@@ -71,7 +71,6 @@ def optimizer_setting(params):
 class ConvBNLayer(fluid.Layer):
     def __init__(self,
                  name_scope,
-                 num_channels,
                  num_filters,
                  filter_size,
                  stride=1,
@@ -81,7 +80,6 @@ class ConvBNLayer(fluid.Layer):
         self._conv = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=stride,
@@ -100,30 +98,22 @@ class ConvBNLayer(fluid.Layer):
 class BottleneckBlock(fluid.Layer):
-    def __init__(self,
-                 name_scope,
-                 num_channels,
-                 num_filters,
-                 stride,
-                 shortcut=True):
+    def __init__(self, name_scope, num_filters, stride, shortcut=True):
         super(BottleneckBlock, self).__init__(name_scope)

         self.conv0 = ConvBNLayer(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=1,
             act='relu')
         self.conv1 = ConvBNLayer(
             self.full_name(),
-            num_channels=num_filters,
             num_filters=num_filters,
             filter_size=3,
             stride=stride,
             act='relu')
         self.conv2 = ConvBNLayer(
             self.full_name(),
-            num_channels=num_filters,
             num_filters=num_filters * 4,
             filter_size=1,
             act=None)
@@ -131,15 +121,12 @@ class BottleneckBlock(fluid.Layer):
         if not shortcut:
             self.short = ConvBNLayer(
                 self.full_name(),
-                num_channels=num_channels,
                 num_filters=num_filters * 4,
                 filter_size=1,
                 stride=stride)

         self.shortcut = shortcut
-        self._num_channels_out = num_filters * 4

     def forward(self, inputs):
         y = self.conv0(inputs)
         conv1 = self.conv1(y)
@@ -175,7 +162,6 @@ class ResNet(fluid.Layer):
         self.conv = ConvBNLayer(
             self.full_name(),
-            num_channels=3,
             num_filters=64,
             filter_size=7,
             stride=2,
@@ -188,7 +174,6 @@ class ResNet(fluid.Layer):
             pool_type='max')

         self.bottleneck_block_list = []
-        num_channels = 64
         for block in range(len(depth)):
             shortcut = False
             for i in range(depth[block]):
@@ -196,11 +181,9 @@ class ResNet(fluid.Layer):
                     'bb_%d_%d' % (block, i),
                     BottleneckBlock(
                         self.full_name(),
-                        num_channels=num_channels,
                         num_filters=num_filters[block],
                         stride=2 if i == 0 and block != 0 else 1,
                         shortcut=shortcut))
-                num_channels = bottleneck_block._num_channels_out
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
...
@@ -64,7 +64,6 @@ def optimizer_setting(params):
 class ConvBNLayer(fluid.dygraph.Layer):
     def __init__(self,
                  name_scope,
-                 num_channels,
                  num_filters,
                  filter_size,
                  stride=1,
@@ -74,7 +73,6 @@ class ConvBNLayer(fluid.dygraph.Layer):
         self._conv = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=stride,
@@ -131,20 +129,15 @@ class BottleneckBlock(fluid.dygraph.Layer):
         super(BottleneckBlock, self).__init__(name_scope)

         self.conv0 = ConvBNLayer(
-            self.full_name(),
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=1)
+            self.full_name(), num_filters=num_filters, filter_size=1)
         self.conv1 = ConvBNLayer(
             self.full_name(),
-            num_channels=num_filters,
             num_filters=num_filters,
             filter_size=3,
             stride=stride,
             groups=cardinality)
         self.conv2 = ConvBNLayer(
             self.full_name(),
-            num_channels=num_filters,
             num_filters=num_filters * 4,
             filter_size=1,
             act='relu')
@@ -157,7 +150,6 @@ class BottleneckBlock(fluid.dygraph.Layer):
         if not shortcut:
             self.short = ConvBNLayer(
                 self.full_name(),
-                num_channels=num_channels,
                 num_filters=num_filters * 4,
                 filter_size=1,
                 stride=stride)
@@ -200,7 +192,6 @@ class SeResNeXt(fluid.dygraph.Layer):
             num_filters = [128, 256, 512, 1024]
             self.conv0 = ConvBNLayer(
                 self.full_name(),
-                num_channels=3,
                 num_filters=64,
                 filter_size=7,
                 stride=2,
@@ -218,7 +209,6 @@ class SeResNeXt(fluid.dygraph.Layer):
             num_filters = [128, 256, 512, 1024]
             self.conv0 = ConvBNLayer(
                 self.full_name(),
-                num_channels=3,
                 num_filters=3,
                 filter_size=7,
                 stride=2,
@@ -236,21 +226,18 @@ class SeResNeXt(fluid.dygraph.Layer):
             num_filters = [128, 256, 512, 1024]
             self.conv0 = ConvBNLayer(
                 self.full_name(),
-                num_channels=3,
                 num_filters=3,
                 filter_size=7,
                 stride=2,
                 act='relu')
             self.conv1 = ConvBNLayer(
                 self.full_name(),
-                num_channels=64,
                 num_filters=3,
                 filter_size=7,
                 stride=2,
                 act='relu')
             self.conv2 = ConvBNLayer(
                 self.full_name(),
-                num_channels=64,
                 num_filters=3,
                 filter_size=7,
                 stride=2,
...
@@ -20,3 +20,7 @@ import paddle.fluid as fluid
 class TestInstallCheck(unittest.TestCase):
     def test_install_check(self):
         fluid.install_check.run_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
@@ -190,8 +190,7 @@ class TestLayer(LayerTest):
         with self.static_graph():
             images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32')
-            conv2d = nn.Conv2D(
-                'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2])
+            conv2d = nn.Conv2D('conv2d', num_filters=3, filter_size=[2, 2])
             ret = conv2d(images)
             static_ret2 = self.get_static_graph_result(
                 feed={'pixel': np.ones(
@@ -200,8 +199,7 @@ class TestLayer(LayerTest):
         with self.dynamic_graph():
             images = np.ones([2, 3, 5, 5], dtype='float32')
-            conv2d = nn.Conv2D(
-                'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2])
+            conv2d = nn.Conv2D('conv2d', num_filters=3, filter_size=[2, 2])
             dy_ret = conv2d(base.to_variable(images))
             self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
...