From 2a9d74f67c39d46986868faa0700a705e7d57a24 Mon Sep 17 00:00:00 2001
From: Hongyu Liu <43953930+phlrain@users.noreply.github.com>
Date: Mon, 10 Jun 2019 11:31:59 +0800
Subject: [PATCH] Add comment for dygraph api (#17869)

* add api commet; test=develop

* fix fc dtype bug; test=develop

* remove float32 in default parameter; test=develop

* fix exmpale bug; test=develop

* fix build once; test=develop

* fix num_chanels bug; test=develop

* fix install check failed bug; test=develop
---
 .../fluid/dygraph/learning_rate_scheduler.py  | 262 ++++++++++++++++++
 python/paddle/fluid/dygraph/nn.py             | 110 ++++----
 python/paddle/fluid/install_check.py          |   2 +-
 .../tests/unittests/parallel_dygraph_mnist.py |   1 -
 .../unittests/parallel_dygraph_se_resnext.py  |   1 -
 .../unittests/test_dygraph_multi_forward.py   |   1 -
 .../unittests/test_imperative_checkpoint.py   |   6 +-
 .../tests/unittests/test_imperative_mnist.py  |   6 +-
 .../test_imperative_ocr_attention_model.py    |   2 -
 .../tests/unittests/test_imperative_resnet.py |  19 +-
 .../unittests/test_imperative_se_resnext.py   |  15 +-
 .../tests/unittests/test_install_check.py     |   4 +
 .../fluid/tests/unittests/test_layers.py      |   6 +-
 13 files changed, 335 insertions(+), 100 deletions(-)

diff --git a/python/paddle/fluid/dygraph/learning_rate_scheduler.py b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
index d425d1c25..d28c8d3c1 100644
--- a/python/paddle/fluid/dygraph/learning_rate_scheduler.py
+++ b/python/paddle/fluid/dygraph/learning_rate_scheduler.py
@@ -27,6 +27,10 @@ __all__ = [
 class LearningRateDecay(object):
     """
     Base class of learning rate decay
+
+    Defines the common interface of a LearningRateDecay.
+    Users should not use this class directly,
+    but should use one of its implementations instead.
     """
 
     def __init__(self, begin=0, step=1, dtype='float32'):
@@ -42,6 +46,14 @@ class LearningRateDecay(object):
         return lr
 
     def create_lr_var(self, lr):
+        """
+        Convert lr from a Python float to a Variable.
+
+        Args:
+            lr: learning rate
+        Returns:
+            learning rate variable
+        """
         from .. import layers
         lr = layers.create_global_var(
             name=unique_name.generate("learning_rate"),
@@ -56,6 +68,40 @@ class LearningRateDecay(object):
 
 
 class PiecewiseDecay(LearningRateDecay):
+    """
+    Piecewise decay scheduler.
+
+    The algorithm can be described by the code below.
+
+    .. code-block:: text
+
+        boundaries = [10000, 20000]
+        values = [1.0, 0.5, 0.1]
+        if step < 10000:
+            learning_rate = 1.0
+        elif 10000 <= step < 20000:
+            learning_rate = 0.5
+        else:
+            learning_rate = 0.1
+
+    Args:
+        boundaries: A list of step numbers.
+        values: A list of learning rate values that will be picked during
+            different step boundaries.
+        begin: The begin step used to initialize self.step_num.
+        step: The step size used when calculating the new step_num (default is 1).
+        dtype: The dtype used to create the learning rate variable.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          boundaries = [10000, 20000]
+          values = [1.0, 0.5, 0.1]
+          with fluid.dygraph.guard():
+              optimizer = fluid.optimizer.SGD(
+                  learning_rate=fluid.dygraph.PiecewiseDecay(boundaries, values, 0) )
+    """
+
     def __init__(self, boundaries, values, begin, step=1, dtype='float32'):
         super(PiecewiseDecay, self).__init__(begin, step, dtype)
         self.boundaries = boundaries
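The lookup PiecewiseDecay performs at each step can be mirrored in a few lines of plain Python. A minimal sketch for checking the boundary semantics (the helper `piecewise_lr` is illustrative only, not part of the patch):

.. code-block:: python

    import bisect

    def piecewise_lr(step, boundaries, values):
        # values has one more entry than boundaries; bisect_right picks the
        # first interval whose upper boundary is still ahead of `step`.
        return values[bisect.bisect_right(boundaries, step)]

    assert piecewise_lr(9999, [10000, 20000], [1.0, 0.5, 0.1]) == 1.0
    assert piecewise_lr(10000, [10000, 20000], [1.0, 0.5, 0.1]) == 0.5
    assert piecewise_lr(25000, [10000, 20000], [1.0, 0.5, 0.1]) == 0.1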
@@ -73,6 +119,41 @@ class PiecewiseDecay(LearningRateDecay):
 
 
 class NaturalExpDecay(LearningRateDecay):
+    """
+    Applies natural exponential decay to the initial learning rate.
+
+    .. code-block:: python
+
+        if not staircase:
+            decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+        else:
+            decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps))
+
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+            will be the initial learning rate during training.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If set to True, decay the learning rate at discrete
+            intervals, i.e. every decay_steps steps.
+        begin: A Python 'int32' number, the begin step (default is 0).
+        step: A Python 'int32' number, the step size (default is 1).
+        dtype: A Python 'str', the dtype used to create the learning rate
+            variable (default is 'float32').
+
+    Examples:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          base_lr = 0.1
+          with fluid.dygraph.guard():
+              sgd_optimizer = fluid.optimizer.SGD(
+                  learning_rate=fluid.dygraph.NaturalExpDecay(
+                        learning_rate=base_lr,
+                        decay_steps=10000,
+                        decay_rate=0.5,
+                        staircase=True))
+
+    """
+
     def __init__(self,
                  learning_rate,
                  decay_steps,
@@ -99,6 +180,45 @@ class NaturalExpDecay(LearningRateDecay):
 
 
 class ExponentialDecay(LearningRateDecay):
+    """
+    Applies exponential decay to the learning rate.
+
+    When training a model, it is often recommended to lower the learning rate as the
+    training progresses. By using this function, the learning rate will be decayed by
+    'decay_rate' every 'decay_steps' steps.
+
+    .. code-block:: python
+
+        if staircase == True:
+            decayed_learning_rate = learning_rate * decay_rate ^ floor(global_step / decay_steps)
+        else:
+            decayed_learning_rate = learning_rate * decay_rate ^ (global_step / decay_steps)
+
+    Args:
+        learning_rate(Variable|float): The initial learning rate.
+        decay_steps(int): See the decay computation above.
+        decay_rate(float): The decay rate. See the decay computation above.
+        staircase(Boolean): If True, decay the learning rate at discrete intervals.
+            Default: False
+        begin(int): The begin step (default is 0).
+        step(int): The step size (default is 1).
+        dtype(str): The dtype used to create the learning rate (default is 'float32').
+
+    Examples:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          base_lr = 0.1
+          with fluid.dygraph.guard():
+              sgd_optimizer = fluid.optimizer.SGD(
+                  learning_rate=fluid.dygraph.ExponentialDecay(
+                      learning_rate=base_lr,
+                      decay_steps=10000,
+                      decay_rate=0.5,
+                      staircase=True))
+
+    """
+
     def __init__(self,
                  learning_rate,
                  decay_steps,
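The staircase flag is the only difference between the two branches of these schedules; a quick numpy check of the natural-exp formula above (an illustrative sketch, independent of Paddle):

.. code-block:: python

    import numpy as np

    base_lr, decay_steps, decay_rate = 0.1, 10000, 0.5
    for global_step in (5000, 10000, 15000):
        continuous = base_lr * np.exp(-decay_rate * (global_step / decay_steps))
        staircase = base_lr * np.exp(-decay_rate * np.floor(global_step / decay_steps))
        # staircase holds the rate constant within each window of decay_steps
        print(global_step, round(continuous, 6), round(staircase, 6))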
@@ -125,6 +245,43 @@ class ExponentialDecay(LearningRateDecay):
 
 
 class InverseTimeDecay(LearningRateDecay):
+    """
+    Applies inverse time decay to the initial learning rate.
+
+    When training a model, it is often recommended to lower the learning rate as the
+    training progresses. By using this function, an inverse decay function will be
+    applied to the initial learning rate.
+
+    >>> if staircase == True:
+    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
+    >>> else:
+    >>>     decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
+
+    Args:
+        learning_rate(Variable|float): The initial learning rate.
+        decay_steps(int): See the decay computation above.
+        decay_rate(float): The decay rate. See the decay computation above.
+        staircase(Boolean): If True, decay the learning rate at discrete intervals.
+            Default: False
+        begin(int): The begin step (default is 0).
+        step(int): The step size (default is 1).
+        dtype(str): The dtype used to create the learning rate (default is 'float32').
+
+    Examples:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          base_lr = 0.1
+          with fluid.dygraph.guard():
+              sgd_optimizer = fluid.optimizer.SGD(
+                  learning_rate=fluid.dygraph.InverseTimeDecay(
+                        learning_rate=base_lr,
+                        decay_steps=10000,
+                        decay_rate=0.5,
+                        staircase=True))
+
+    """
+
     def __init__(self,
                  learning_rate,
                  decay_steps,
@@ -151,6 +308,43 @@ class InverseTimeDecay(LearningRateDecay):
 
 
 class PolynomialDecay(LearningRateDecay):
+    """
+    Applies polynomial decay to the initial learning rate.
+
+    .. code-block:: text
+
+        if cycle:
+            decay_steps = decay_steps * ceil(global_step / decay_steps)
+        else:
+            global_step = min(global_step, decay_steps)
+        decayed_learning_rate = (learning_rate - end_learning_rate) *
+            (1 - global_step / decay_steps) ^ power + end_learning_rate
+
+    Args:
+        learning_rate(Variable|float32): A scalar float32 value or a Variable. This
+            will be the initial learning rate during training.
+        decay_steps(int32): A Python `int32` number.
+        end_learning_rate(float): A Python `float` number.
+        power(float): A Python `float` number.
+        cycle(bool): If set to True, the decay restarts every decay_steps steps
+            instead of flattening out.
+        begin(int): The begin step (default is 0).
+        step(int): The step size (default is 1).
+        dtype(str): The dtype used to create the learning rate (default is 'float32').
+
+    Examples:
+        .. code-block:: python
+
+          import paddle.fluid as fluid
+          start_lr = 0.01
+          total_step = 5000
+          end_lr = 0
+          with fluid.dygraph.guard():
+              optimizer = fluid.optimizer.SGD(
+                  learning_rate = fluid.dygraph.PolynomialDecay(
+                      start_lr, total_step, end_lr, power=1.0) )
+
+    """
+
     def __init__(self,
                  learning_rate,
                  decay_steps,
@@ -189,6 +383,35 @@ class PolynomialDecay(LearningRateDecay):
 
 
 class CosineDecay(LearningRateDecay):
+    """
+    Applies cosine decay to the learning rate.
+
+    When training a model, it is often recommended to lower the learning rate as the
+    training progresses. By using this function, the learning rate will be decayed
+    following a cosine decay strategy.
+
+    .. math::
+
+        decayed\_lr = learning\_rate * 0.5 * (cos(epoch * \\frac{\\pi}{epochs}) + 1)
+
+    Args:
+        learning_rate(Variable|float): The initial learning rate.
+        step_each_epoch(int): The number of steps in an epoch.
+        epochs(int): The number of epochs.
+        begin(int): The begin step (default is 0).
+        step(int): The step size (default is 1).
+        dtype(str): The dtype used to create the learning rate (default is 'float32').
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            base_lr = 0.1
+            with fluid.dygraph.guard():
+                optimizer = fluid.optimizer.SGD(
+                    learning_rate = fluid.dygraph.CosineDecay(
+                        base_lr, 10000, 120) )
+    """
+
     def __init__(self,
                  learning_rate,
                  step_each_epoch,
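A numpy sketch of the polynomial schedule above, including the cycle restart (the helper `poly_lr` is illustrative only):

.. code-block:: python

    import numpy as np

    def poly_lr(global_step, lr=0.01, decay_steps=5000, end_lr=0.0, power=1.0, cycle=False):
        if cycle:
            # grow decay_steps so the schedule restarts instead of flattening out
            decay_steps = decay_steps * max(1.0, np.ceil(global_step / decay_steps))
        else:
            global_step = min(global_step, decay_steps)
        return (lr - end_lr) * (1 - global_step / decay_steps) ** power + end_lr

    print(poly_lr(2500))               # halfway through: 0.005 with power=1.0
    print(poly_lr(7500, cycle=True))   # second cycle: decay_steps grows to 10000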
@@ -211,6 +434,45 @@ class CosineDecay(LearningRateDecay):
 
 
 class NoamDecay(LearningRateDecay):
+    """
+    Noam decay method. The numpy implementation of noam decay is as follows.
+
+    .. code-block:: python
+
+        import numpy as np
+        # set hyper parameters
+        d_model = 2
+        current_steps = 20
+        warmup_steps = 200
+        # compute
+        lr_value = np.power(d_model, -0.5) * np.min([
+                np.power(current_steps, -0.5),
+                np.power(warmup_steps, -1.5) * current_steps])
+
+    Please reference `attention is all you need
+    <https://arxiv.org/abs/1706.03762>`_.
+
+    Args:
+        d_model(Variable): The dimensionality of input and output of the model.
+        warmup_steps(Variable): A hyperparameter.
+        begin(int): The begin step (default is 0).
+        step(int): The step size (default is 1).
+        dtype(str): The dtype used to create the learning rate (default is 'float32').
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            warmup_steps = 100
+            learning_rate = 0.01
+            with fluid.dygraph.guard():
+                optimizer = fluid.optimizer.SGD(
+                    learning_rate = fluid.dygraph.NoamDecay(
+                           1/(warmup_steps *(learning_rate ** 2)),
+                           warmup_steps) )
+    """
+
     def __init__(self, d_model, warmup_steps, begin=1, step=1, dtype='float32'):
         super(NoamDecay, self).__init__(begin, step, dtype)
         self.d_model = d_model
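All of these schedulers are passed as the `learning_rate` of an optimizer and advance once per `minimize` call. A minimal dygraph training-loop sketch, assuming the usual fluid 1.x dygraph idioms (the layer size and data here are made up):

.. code-block:: python

    import numpy as np
    import paddle.fluid as fluid

    with fluid.dygraph.guard():
        fc = fluid.dygraph.FC("fc", 1)
        optimizer = fluid.optimizer.SGD(
            learning_rate=fluid.dygraph.ExponentialDecay(
                learning_rate=0.1, decay_steps=100, decay_rate=0.5, staircase=True))
        for _ in range(3):
            x = fluid.dygraph.to_variable(np.ones([4, 8], dtype='float32'))
            loss = fluid.layers.reduce_mean(fc(x))
            loss.backward()
            optimizer.minimize(loss)   # also steps the LR schedule
            fc.clear_gradients()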
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index 753cb26dc..200e2917a 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -84,7 +84,7 @@ class Conv2D(layers.Layer):
           W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
 
     Args:
-        input (Variable): The input image with [N, C, H, W] format.
+        name_scope(str): The name for this class.
         num_filters(int): The number of filter. It is as same as the output
             image channel.
         filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
@@ -118,12 +118,6 @@ class Conv2D(layers.Layer):
             library is installed. Default: True
         act (str): Activation type, if it is set to None, activation is not appended.
             Default: None
-        name (str|None): A name for this layer(optional). If set None, the layer
-            will be named automatically. Default: None
-
-    Returns:
-        Variable: The tensor variable storing the convolution and \
-        non-linearity activation result.
 
     Raises:
         ValueError: If the shapes of input, filter_size, stride, padding and
 
     Examples:
         .. code-block:: python
+
+          from paddle.fluid.dygraph.base import to_variable
+          import paddle.fluid as fluid
+          from paddle.fluid.dygraph import Conv2D
+          import numpy as np
+
+          data = np.random.uniform( -1, 1, [10, 3, 32, 32] ).astype('float32')
+          with fluid.dygraph.guard():
+              conv2d = Conv2D( "conv2d", 2, 3)
+              data = to_variable( data )
+              conv = conv2d( data )
 
-          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
-          conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
     """
 
     def __init__(self,
                  name_scope,
-                 num_channels,
                  num_filters,
                  filter_size,
                  stride=1,
                  padding=0,
                  dilation=1,
                  groups=None,
-                 use_cudnn=True,
-                 act=None,
                  param_attr=None,
                  bias_attr=None,
-                 dtype=core.VarDesc.VarType.FP32):
+                 use_cudnn=True,
+                 act=None,
+                 dtype='float32'):
         assert param_attr is not False, "param_attr should not be False here."
         super(Conv2D, self).__init__(name_scope, dtype)
         self._groups = groups
@@ -160,7 +166,11 @@ class Conv2D(layers.Layer):
         if not isinstance(use_cudnn, bool):
             raise ValueError("use_cudnn should be True or False")
         self._use_cudnn = use_cudnn
-        self._num_channels = num_channels
+        self._filter_size = filter_size
+        self._num_filters = num_filters
+        self._param_attr = param_attr
+        self._bias_attr = bias_attr
+        self._dtype = dtype
         # if (self._num_channels == self._groups and
         #     num_filters % self._num_channels == 0 and not self._use_cudnn):
         #     self._l_type = 'depthwise_conv2d'
@@ -169,22 +179,26 @@ class Conv2D(layers.Layer):
         #     kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275
         self._l_type = 'conv2d'
 
-        if groups is None:
-            num_filter_channels = num_channels
+    def _build_once(self, input):
+        self._num_channels = input.shape[1]
+        if self._groups is None:
+            num_filter_channels = self._num_channels
         else:
-            if num_channels % groups != 0:
+            if self._num_channels % self._groups != 0:
                 raise ValueError("num_channels must be divisible by groups.")
-            num_filter_channels = num_channels // groups
-        filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
-        filter_shape = [num_filters, int(num_filter_channels)] + filter_size
+            num_filter_channels = self._num_channels // self._groups
+        filter_size = utils.convert_to_list(self._filter_size, 2, 'filter_size')
+        filter_shape = [self._num_filters, int(num_filter_channels)
+                        ] + filter_size
 
         def _get_default_param_initializer():
-            filter_elem_num = filter_size[0] * filter_size[1] * num_channels
+            filter_elem_num = filter_size[0] * filter_size[
+                1] * self._num_channels
             std = (2.0 / filter_elem_num)**0.5
             return Normal(0.0, std, 0)
 
         self._filter_param = self.create_parameter(
-            attr=param_attr,
+            attr=self._param_attr,
             shape=filter_shape,
             dtype=self._dtype,
             default_initializer=_get_default_param_initializer())
@@ -204,8 +218,8 @@ class Conv2D(layers.Layer):
             type=core.VarDesc.VarType.RAW)
 
         self._bias_param = self.create_parameter(
-            attr=bias_attr,
-            shape=[num_filters],
+            attr=self._bias_attr,
+            shape=[self._num_filters],
             dtype=self._dtype,
             is_bias=True)
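This `_build_once` refactor is what lets `num_channels` disappear from the constructor: parameter creation is deferred until the layer sees a real input, at which point the channel count can be read from `input.shape[1]`. The same pattern in isolation, as a runnable sketch (the class `LazyLinear` is illustrative, not Paddle code):

.. code-block:: python

    import numpy as np

    class LazyLinear(object):
        """Create weights on the first call, when the input shape is known."""

        def __init__(self, out_features):
            self.out_features = out_features
            self.weight = None  # not built yet; input dim still unknown

        def __call__(self, x):
            if self.weight is None:
                # the _build_once moment: read the dim from the data itself
                in_features = x.shape[1]
                self.weight = np.random.randn(in_features, self.out_features) * 0.01
            return x.dot(self.weight)

    layer = LazyLinear(4)
    print(layer(np.ones((2, 8))).shape)  # (2, 4); weight was built as (8, 4)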
@@ -653,14 +667,12 @@ class Conv3DTranspose(layers.Layer):
 
 
 class Pool2D(layers.Layer):
+    # TODO, should delete this class
     """
     ${comment}
 
     Args:
-        input (Variable): The input tensor of pooling operator. The format of
-                          input tensor is NCHW, where N is batch size, C is
-                          the number of channels, H is the height of the
-                          feature, and W is the width of the feature.
+        name_scope(str): The name of this class.
         pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
             it must contain two integers, (pool_size_Height, pool_size_Width).
             Otherwise, the pool kernel size will be a square of an int.
@@ -814,8 +826,7 @@ class FC(layers.Layer):
         out.shape = (1, 2)
 
     Args:
-        input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
-            the input tensor(s) is at least 2.
+        name_scope(str): The name of this class.
         size(int): The number of output units in this layer.
         num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
             two dimensions. If this happens, the multidimensional tensor will first be flattened
@@ -833,37 +844,35 @@ class FC(layers.Layer):
             If it is set to None, the bias is initialized zero. Default: None.
         act (str, default None): Activation to be applied to the output of this layer.
         is_test(bool): A flag indicating whether execution is in test phase.
-        name (str, default None): The name of this layer.
-
-    Returns:
-        Variable: The transformation result.
+        dtype(str): The dtype used for the weights.
 
     Raises:
        ValueError: If rank of the input tensor is less than 2.
 
     Examples:
         .. code-block:: python
+
+          from paddle.fluid.dygraph.base import to_variable
+          import paddle.fluid as fluid
+          from paddle.fluid.dygraph import FC
+          import numpy as np
+
+          data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32')
+          with fluid.dygraph.guard():
+              fc = FC( "fc", 64, num_flatten_dims=2)
+              data = to_variable( data )
+              out = fc( data )
 
-          # when input is single tensor
-          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
-          fc = fluid.FC("fc", size=1000, act="tanh")
-          fc_res = fc(data)
-
-          # when input are multiple tensors
-          data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32")
-          data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32")
-          fc = fluid.FC("fc", size=1000, act="tanh")
-          fc_res = fc([data_1, data_2])
     """
 
     def __init__(self,
                  name_scope,
                  size,
+                 num_flatten_dims=1,
                  param_attr=None,
                  bias_attr=None,
-                 num_flatten_dims=1,
-                 dtype=core.VarDesc.VarType.FP32,
-                 act=None):
+                 act=None,
+                 is_test=False,
+                 dtype="float32"):
         super(FC, self).__init__(name_scope, dtype)
 
         self._size = size
@@ -1048,7 +1057,7 @@ class BatchNorm(layers.Layer):
                  epsilon=1e-05,
                  param_attr=None,
                  bias_attr=None,
-                 dtype=core.VarDesc.VarType.FP32,
+                 dtype='float32',
                  data_layout='NCHW',
                  in_place=False,
                  moving_mean_name=None,
@@ -1064,8 +1073,8 @@ class BatchNorm(layers.Layer):
         assert bias_attr is not False, "bias_attr should not be False in batch_norm."
 
-        if dtype == core.VarDesc.VarType.FP16:
-            self._dtype = core.VarDesc.VarType.FP32
+        if dtype == "float16":
+            self._dtype = "float32"
         else:
             self._dtype = dtype
@@ -1444,6 +1453,7 @@ class GRUUnit(layers.Layer):
             Default: 'tanh'
         gate_activation (string): The activation type for gates (actGate).
             Default: 'sigmoid'
+        dtype(string): The dtype of this layer's parameters.
 
     Returns:
         tuple: The hidden value, reset-hidden value and gate values.
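To make the FC example above concrete: with num_flatten_dims=2, the first two dimensions of the [30, 10, 32] input are kept and only the trailing dimension is projected, so the output is [30, 10, 64]. A shape-only sketch (`fc_output_shape` is a hypothetical helper, not the FC implementation):

.. code-block:: python

    def fc_output_shape(in_shape, size, num_flatten_dims=1):
        # dims before num_flatten_dims are preserved; the remaining dims are
        # flattened into one matrix dimension and projected to `size`.
        return list(in_shape[:num_flatten_dims]) + [size]

    print(fc_output_shape([30, 10, 32], 64, num_flatten_dims=2))  # [30, 10, 64]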
diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py
index 3cdd05533..dd1725b45 100644
--- a/python/paddle/fluid/install_check.py
+++ b/python/paddle/fluid/install_check.py
@@ -31,7 +31,7 @@ class SimpleLayer(Layer):
         super(SimpleLayer, self).__init__(name_scope)
         self._fc1 = nn.FC(self.full_name(),
                           3,
-                          ParamAttr(initializer=Constant(value=0.1)))
+                          param_attr=ParamAttr(initializer=Constant(value=0.1)))
 
     def forward(self, inputs):
         x = self._fc1(inputs)
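This one-line fix follows from the FC signature change above: `num_flatten_dims` now occupies the third positional slot, so a positionally passed `ParamAttr` would silently bind to it. A generic illustration of the hazard (the stand-in `fc_init` is hypothetical, not the real FC constructor):

.. code-block:: python

    def fc_init(name_scope, size, num_flatten_dims=1, param_attr=None):
        return num_flatten_dims, param_attr

    # Positional call: the attribute lands in num_flatten_dims by mistake.
    print(fc_init("fc", 3, "ParamAttr(...)"))             # ('ParamAttr(...)', None)
    # Keyword call, as in the patched install_check:
    print(fc_init("fc", 3, param_attr="ParamAttr(...)"))  # (1, 'ParamAttr(...)')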
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
index d2ce14e92..389023601 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_mnist.py
@@ -55,7 +55,6 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
 
         self._conv2d = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=conv_stride,
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
index 9eb860cb6..49c715f74 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_se_resnext.py
@@ -47,7 +47,6 @@ class ConvBNLayer(fluid.dygraph.Layer):
 
         self._conv = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=stride,
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
index 8b8fdcc88..f473c435e 100644
--- a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
@@ -51,7 +51,6 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
 
         self._conv2d = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=conv_stride,
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
index b7c3695ee..25d490f67 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
@@ -25,7 +25,6 @@ from paddle.fluid.dygraph.base import to_variable
 class SimpleImgConvPool(fluid.Layer):
     def __init__(self,
                  name_scope,
-                 num_channels,
                  num_filters,
                  filter_size,
                  pool_size,
@@ -45,7 +44,6 @@ class SimpleImgConvPool(fluid.Layer):
 
         self._conv2d = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=conv_stride,
@@ -76,10 +74,10 @@ class MNIST(fluid.Layer):
         super(MNIST, self).__init__(name_scope)
 
         self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            self.full_name(), 1, 20, 5, 2, 2, act="relu")
+            self.full_name(), 20, 5, 2, 2, act="relu")
 
         self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            self.full_name(), 20, 50, 5, 2, 2, act="relu")
+            self.full_name(), 50, 5, 2, 2, act="relu")
 
         pool_2_shape = 50 * 4 * 4
         SIZE = 10
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
index b44166388..c3a12addf 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
@@ -31,7 +31,6 @@ from test_imperative_base import new_program_scope
 class SimpleImgConvPool(fluid.dygraph.Layer):
     def __init__(self,
                  name_scope,
-                 num_channels,
                  num_filters,
                  filter_size,
                  pool_size,
@@ -51,7 +50,6 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
 
         self._conv2d = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=conv_stride,
@@ -82,10 +80,10 @@ class MNIST(fluid.dygraph.Layer):
         super(MNIST, self).__init__(name_scope)
 
         self._simple_img_conv_pool_1 = SimpleImgConvPool(
-            self.full_name(), 1, 20, 5, 2, 2, act="relu")
+            self.full_name(), 20, 5, 2, 2, act="relu")
 
         self._simple_img_conv_pool_2 = SimpleImgConvPool(
-            self.full_name(), 20, 50, 5, 2, 2, act="relu")
+            self.full_name(), 50, 5, 2, 2, act="relu")
 
         pool_2_shape = 50 * 4 * 4
         SIZE = 10
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
index 3f53552ba..22bd2e55d 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py
@@ -80,7 +80,6 @@ class ConvBNPool(fluid.dygraph.Layer):
 
         self.conv_0_layer = Conv2D(
             self.full_name(),
-            channels[0],
             out_ch[0],
             3,
             padding=1,
@@ -92,7 +91,6 @@ class ConvBNPool(fluid.dygraph.Layer):
             self.full_name(), out_ch[0], act=act, is_test=is_test)
         self.conv_1_layer = Conv2D(
             self.full_name(),
-            num_channels=channels[1],
             num_filters=out_ch[1],
             filter_size=3,
             padding=1,
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index ff5d1ef69..9eab5abc0 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -71,7 +71,6 @@ def optimizer_setting(params):
 class ConvBNLayer(fluid.Layer):
     def __init__(self,
                  name_scope,
-                 num_channels,
                  num_filters,
                  filter_size,
                  stride=1,
@@ -81,7 +80,6 @@ class ConvBNLayer(fluid.Layer):
 
         self._conv = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=stride,
@@ -100,30 +98,22 @@ class ConvBNLayer(fluid.Layer):
 
 
 class BottleneckBlock(fluid.Layer):
-    def __init__(self,
-                 name_scope,
-                 num_channels,
-                 num_filters,
-                 stride,
-                 shortcut=True):
+    def __init__(self, name_scope, num_filters, stride, shortcut=True):
         super(BottleneckBlock, self).__init__(name_scope)
 
         self.conv0 = ConvBNLayer(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=1,
             act='relu')
         self.conv1 = ConvBNLayer(
             self.full_name(),
-            num_channels=num_filters,
             num_filters=num_filters,
             filter_size=3,
             stride=stride,
             act='relu')
         self.conv2 = ConvBNLayer(
             self.full_name(),
-            num_channels=num_filters,
             num_filters=num_filters * 4,
             filter_size=1,
             act=None)
@@ -131,15 +121,12 @@ class BottleneckBlock(fluid.Layer):
         if not shortcut:
             self.short = ConvBNLayer(
                 self.full_name(),
-                num_channels=num_channels,
                 num_filters=num_filters * 4,
                 filter_size=1,
                 stride=stride)
 
         self.shortcut = shortcut
 
-        self._num_channels_out = num_filters * 4
-
     def forward(self, inputs):
         y = self.conv0(inputs)
         conv1 = self.conv1(y)
@@ -175,7 +162,6 @@ class ResNet(fluid.Layer):
 
         self.conv = ConvBNLayer(
             self.full_name(),
-            num_channels=3,
             num_filters=64,
             filter_size=7,
             stride=2,
@@ -188,7 +174,6 @@ class ResNet(fluid.Layer):
             pool_type='max')
 
         self.bottleneck_block_list = []
-        num_channels = 64
         for block in range(len(depth)):
             shortcut = False
             for i in range(depth[block]):
@@ -196,11 +181,9 @@ class ResNet(fluid.Layer):
                     'bb_%d_%d' % (block, i),
                     BottleneckBlock(
                         self.full_name(),
-                        num_channels=num_channels,
                         num_filters=num_filters[block],
                         stride=2 if i == 0 and block != 0 else 1,
                         shortcut=shortcut))
-                num_channels = bottleneck_block._num_channels_out
                 self.bottleneck_block_list.append(bottleneck_block)
                 shortcut = True
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
index ded97051e..f6585d1b3 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
@@ -64,7 +64,6 @@ def optimizer_setting(params):
 class ConvBNLayer(fluid.dygraph.Layer):
     def __init__(self,
                  name_scope,
-                 num_channels,
                  num_filters,
                  filter_size,
                  stride=1,
@@ -74,7 +73,6 @@ class ConvBNLayer(fluid.dygraph.Layer):
 
         self._conv = Conv2D(
             self.full_name(),
-            num_channels=num_channels,
             num_filters=num_filters,
             filter_size=filter_size,
             stride=stride,
@@ -131,20 +129,15 @@ class BottleneckBlock(fluid.dygraph.Layer):
         super(BottleneckBlock, self).__init__(name_scope)
 
         self.conv0 = ConvBNLayer(
-            self.full_name(),
-            num_channels=num_channels,
-            num_filters=num_filters,
-            filter_size=1)
+            self.full_name(), num_filters=num_filters, filter_size=1)
         self.conv1 = ConvBNLayer(
             self.full_name(),
-            num_channels=num_filters,
             num_filters=num_filters,
             filter_size=3,
             stride=stride,
             groups=cardinality)
         self.conv2 = ConvBNLayer(
             self.full_name(),
-            num_channels=num_filters,
             num_filters=num_filters * 4,
             filter_size=1,
             act='relu')
@@ -157,7 +150,6 @@ class BottleneckBlock(fluid.dygraph.Layer):
         if not shortcut:
             self.short = ConvBNLayer(
                 self.full_name(),
-                num_channels=num_channels,
                 num_filters=num_filters * 4,
                 filter_size=1,
                 stride=stride)
@@ -200,7 +192,6 @@ class SeResNeXt(fluid.dygraph.Layer):
             num_filters = [128, 256, 512, 1024]
             self.conv0 = ConvBNLayer(
                 self.full_name(),
-                num_channels=3,
                 num_filters=64,
                 filter_size=7,
                 stride=2,
@@ -218,7 +209,6 @@ class SeResNeXt(fluid.dygraph.Layer):
             num_filters = [128, 256, 512, 1024]
             self.conv0 = ConvBNLayer(
                 self.full_name(),
-                num_channels=3,
                 num_filters=3,
                 filter_size=7,
                 stride=2,
@@ -236,21 +226,18 @@ class SeResNeXt(fluid.dygraph.Layer):
             num_filters = [128, 256, 512, 1024]
             self.conv0 = ConvBNLayer(
                 self.full_name(),
-                num_channels=3,
                 num_filters=3,
                 filter_size=7,
                 stride=2,
                 act='relu')
             self.conv1 = ConvBNLayer(
                 self.full_name(),
-                num_channels=64,
                 num_filters=3,
                 filter_size=7,
                 stride=2,
                 act='relu')
             self.conv2 = ConvBNLayer(
                 self.full_name(),
-                num_channels=64,
                 num_filters=3,
                 filter_size=7,
                 stride=2,
diff --git a/python/paddle/fluid/tests/unittests/test_install_check.py b/python/paddle/fluid/tests/unittests/test_install_check.py
index 5802e2ed0..5cb199d49 100644
--- a/python/paddle/fluid/tests/unittests/test_install_check.py
+++ b/python/paddle/fluid/tests/unittests/test_install_check.py
@@ -20,3 +20,7 @@ import paddle.fluid as fluid
 class TestInstallCheck(unittest.TestCase):
     def test_install_check(self):
         fluid.install_check.run_check()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 9217a3fc4..2204ea21c 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -190,8 +190,7 @@ class TestLayer(LayerTest):
         with self.static_graph():
             images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32')
-            conv2d = nn.Conv2D(
-                'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2])
+            conv2d = nn.Conv2D('conv2d', num_filters=3, filter_size=[2, 2])
             ret = conv2d(images)
             static_ret2 = self.get_static_graph_result(
                 feed={'pixel': np.ones(
@@ -200,8 +199,7 @@ class TestLayer(LayerTest):
         with self.dynamic_graph():
             images = np.ones([2, 3, 5, 5], dtype='float32')
-            conv2d = nn.Conv2D(
-                'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2])
+            conv2d = nn.Conv2D('conv2d', num_filters=3, filter_size=[2, 2])
             dy_ret = conv2d(base.to_variable(images))
             self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
-- 
GitLab