Commit 22956530 authored by minqiyang

Polish PyLayers

test=develop

Parent 0f6ef8ed
@@ -24,19 +24,7 @@ __all__ = ['PyLayer']
 class PyLayer(core.Layer):
-    def __init__(self,
-                 dtype=core.VarDesc.VarType.FP32,
-                 param_attr=None,
-                 bias_attr=None,
-                 name=None):
-        from ..layer_helper import LayerHelper
-        self._helper = LayerHelper(
-            type(self).__name__,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            dtype=dtype,
-            name=name)
+    def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
         self._once_built = False
         self._dtype = dtype
...
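Note: after this hunk, PyLayer's constructor no longer builds a LayerHelper; any subclass that needs parameters constructs its own, as the Conv2D/Pool2D/FC hunks below do. A minimal sketch of the new pattern (the subclass name is illustrative, not part of the commit; module paths are taken from the imports visible in this diff):

    import paddle.fluid as fluid
    from paddle.fluid import core
    from paddle.fluid.layer_helper import LayerHelper

    class MyLayer(fluid.imperative.PyLayer):
        def __init__(self, name=None, dtype=core.VarDesc.VarType.FP32):
            # Base __init__ now only records dtype and name bookkeeping ...
            super(MyLayer, self).__init__(name=name, dtype=dtype)
            # ... and the subclass owns its helper, mirroring Conv2D below.
            self._helper = LayerHelper(type(self).__name__, dtype=dtype, name=name)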
@@ -46,8 +46,15 @@ class Conv2D(layers.PyLayer):
                  name=None,
                  dtype=core.VarDesc.VarType.FP32):
         assert param_attr is not False, "param_attr should not be False here."
-        super(Conv2D, self).__init__(
-            param_attr=param_attr, bias_attr=bias_attr, name=name, dtype=dtype)
+        super(Conv2D, self).__init__(name=name, dtype=dtype)
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper(
+            type(self).__name__,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            dtype=dtype,
+            name=name)
         self._groups = groups
         self._stride = utils.convert_to_list(stride, 2, 'stride')
@@ -163,6 +170,9 @@ class Pool2D(layers.PyLayer):
         super(Pool2D, self).__init__(name=name, dtype=dtype)
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper(type(self).__name__, dtype=dtype, name=name)
+
         self._pool_type = pool_type
         self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
         self._pool_padding = utils.convert_to_list(pool_padding, 2,
@@ -197,32 +207,22 @@ class Pool2D(layers.PyLayer):
 class FC(layers.PyLayer):
     def __init__(self,
-                 size_in,
-                 size_out,
-                 num_flatten_dims=1,
+                 size,
                  param_attr=None,
+                 num_flatten_dims=1,
                  dtype=core.VarDesc.VarType.FP32):
-        super(FC, self).__init__(param_attr=param_attr, dtype=dtype)
-        self._size_in = size_in
-        self._size_out = size_out
+        super(FC, self).__init__()
+        self._size = size
         self._num_flatten_dims = num_flatten_dims
         self._dtype = dtype
-        if self._size_in != -1:
-            self._w = self._helper.create_parameter(
-                attr=self._helper.param_attr,
-                shape=[size_in, size_out],
-                dtype=self._dtype,
-                is_bias=False)
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper('FC', param_attr=param_attr)

     def _build_once(self, input):
-        if self._size_in != -1:
-            return
         input_shape = input.shape
         param_shape = [
             reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
-        ] + [self._size_out]
+        ] + [self._size]
         self._w = self._helper.create_parameter(
             attr=self._helper.param_attr,
             shape=param_shape,
...
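Note: the FC rewrite drops the size_in/size_out pair for a single size; the input width is now always inferred from the first input's shape in _build_once. A minimal usage sketch (guard and to_variable are assumed from the imperative helpers of this era, not shown in this diff):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.fluid.imperative.nn import FC

    with fluid.imperative.guard():
        fc = FC(10)  # only the output size is declared up front
        x = fluid.imperative.base.to_variable(
            np.random.rand(3, 32).astype('float32'))
        # _build_once runs on the first call and creates a [32, 10]
        # weight inferred from x.shape.
        y = fc(x)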
@@ -502,22 +502,22 @@ def lstm(input,
     If Device is GPU, This op will use cudnn LSTM implementation

     A four-gate Long Short-Term Memory network with no peephole connections.
     In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
     the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:

     .. math::

        i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)

        f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)

        o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)

        \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)

        c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}

        h_t &= o_t \odot tanh(c_t)

     - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix
       of weights from the input gate to the input)
@@ -531,19 +531,19 @@ def lstm(input,
     - :math:`\\tilde{c_t}` is also called the candidate hidden state,
       which is computed based on the current input and the previous hidden state.

     Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})`, * represents a point-wise multiplication,
     X represents a matrix multiplication

     Args:
         input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
         init_h(Variable): The initial hidden state of the LSTM
                       This is a tensor with shape ( num_layers x batch_size x hidden_size)
                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
         init_c(Variable): The initial cell state of the LSTM.
                       This is a tensor with shape ( num_layers x batch_size x hidden_size )
                       if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
         max_len (int): max length of LSTM. the first dim of the input tensor CAN NOT be greater than max_len
         hidden_size (int): hidden size of the LSTM
         num_layers (int): total number of layers of the LSTM
         dropout_prob(float|0.0): dropout prob, dropout ONLY works between rnn layers, NOT between time steps
@@ -558,18 +558,18 @@ def lstm(input,
     Returns:
         rnn_out(Tensor), last_h(Tensor), last_c(Tensor):

                         Three tensors, rnn_out, last_h, last_c:

                         - rnn_out is the result of the LSTM hidden, shape is (seq_len x batch_size x hidden_size) \
                           if is_bidirec set to True, shape will be ( seq_len x batch_size x hidden_size*2)
                         - last_h is the hidden state of the last step of LSTM \
                           shape is ( num_layers x batch_size x hidden_size ) \
                           if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
                         - last_c(Tensor): the cell state of the last step of LSTM \
                           shape is ( num_layers x batch_size x hidden_size ) \
                           if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)

     Examples:
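Note: the docstring's own example is elided by the diff view. For orientation, a hedged sketch of calling lstm with the shapes described above (names and constants are illustrative; fill_constant zeros stand in for real initial states):

    import paddle.fluid.layers as layers

    seq_len, batch, input_size = 20, 32, 128
    hidden_size, num_layers = 256, 1
    x = layers.data(name='x', shape=[seq_len, batch, input_size],
                    dtype='float32', append_batch_size=False)
    init_h = layers.fill_constant([num_layers, batch, hidden_size], 'float32', 0.0)
    init_c = layers.fill_constant([num_layers, batch, hidden_size], 'float32', 0.0)
    rnn_out, last_h, last_c = layers.lstm(x, init_h, init_c, seq_len,
                                          hidden_size, num_layers)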
@@ -1255,7 +1255,7 @@ def dropout(x,
                                        (mask is a tensor with the same shape as the input; its values
                                         are 0 or 1, and the ratio of 0s is dropout_prob)

     Returns:
         Variable: A tensor variable with the same shape as `x`.
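A short sketch of the documented behavior; the mask zeroes roughly dropout_prob of the elements and the output keeps x's shape:

    import paddle.fluid as fluid

    x = fluid.layers.data(name='data', shape=[32, 32], dtype='float32')
    dropped = fluid.layers.dropout(x, dropout_prob=0.5)  # same shape as x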
@@ -1346,10 +1346,10 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
         ValueError:

             1. the 1st dimension of ``input`` and ``label`` are not equal.

             2. when ``soft_label == True``, and the 2nd dimension of
                ``input`` and ``label`` are not equal.

             3. when ``soft_label == False``, and the 2nd dimension of
                ``label`` is not 1.
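To make the shape constraints concrete, a minimal sketch with hard labels (condition 3 above: the label's 2nd dimension must be 1):

    import paddle.fluid as fluid

    predict = fluid.layers.data(name='predict', shape=[10], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    cost = fluid.layers.cross_entropy(input=predict, label=label)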
@@ -1471,7 +1471,7 @@ def chunk_eval(input,
     This function computes and outputs the precision, recall and
     F1-score of chunk detection.

     For some basics of chunking, please refer to
     `Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>`_ .

     ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
@@ -2306,7 +2306,7 @@ def sequence_slice(input, offset, length, name=None):
              out.lod = [[2, 1]],
              out.dims = (3, 2).

     Note:
         The first dimension size of **input**, **offset** and **length**
         should be equal. The **offset** should start from 0.
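A sketch matching the note: offset and length are per-sequence tensors whose first dimension equals the number of input sequences (two sequences here):

    import numpy as np
    import paddle.fluid as fluid

    seqs = fluid.layers.data(name='x', shape=[10, 5],
                             dtype='float32', lod_level=1)
    offset = fluid.layers.assign(input=np.array([[0], [1]]).astype('int32'))
    length = fluid.layers.assign(input=np.array([[2], [1]]).astype('int32'))
    subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
                                          length=length)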
@@ -4678,7 +4678,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                       [0.5, 0.1, 0.3, 0.1]]

         input.lod = [[4, 4]]

         Computation:

         step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
@@ -4712,7 +4712,7 @@ def ctc_greedy_decoder(input, blank, name=None):
         Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \
                   'Lp' is the sum of all output sequences' length. If all the sequences \
                   in result were empty, the result LoDTensor will be [-1] with \
                   LoD [[]] and dims [1, 1].

     Examples:
         .. code-block:: python
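Note: the docstring's example body is elided by the diff view. A hedged sketch of the call, for orientation (input is an LoD tensor of softmax probabilities over num_classes+1 classes, blank selects the CTC blank class):

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[8], dtype='float32')
    decoded = fluid.layers.ctc_greedy_decoder(input=x, blank=0)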
@@ -5065,7 +5065,7 @@ def hsigmoid(input,
     """
     The hierarchical sigmoid operator is used to accelerate the training
     process of language models. This operator organizes the classes into a
     complete binary tree, or you can use is_custom to pass your own tree to
     implement the hierarchy. Each leaf node represents a class (a word) and each
     internal node acts as a binary classifier. For each word there's a unique
     path from the root to its leaf node, and hsigmoid calculates the cost for each
@@ -5082,7 +5082,7 @@ def hsigmoid(input,
     2. build a dict to store word_id -> word's leaf-to-root path, we call it path_table.
     3. build a dict to store word_id -> code of word's leaf-to-root path, we call it path_code. Code
        means the label of each binary classification, using 1 to indicate true and 0 to indicate false.
     4. now, each word should have its path and code along the path; you can pass a batch of path and code
        related to the same batch of inputs.

     Args:
@@ -5091,8 +5091,8 @@ def hsigmoid(input,
             and :math:`D` is the feature size.
         label (Variable): The tensor variable that contains labels of training data.
             It's a tensor with shape :math:`[N \\times 1]`.
         num_classes: (int), The number of classes, must not be less than 2. With the default tree this has to be set;
             it should never be None under is_custom=False, but when is_custom is True, it should be the number of
             non-leaf nodes, which indicates the number of classes used by the binary classifiers.
         param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
             of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
@@ -5105,15 +5105,15 @@ def hsigmoid(input,
             is not set, the bias is initialized zero. Default: None.
         name (str|None): A name for this layer (optional). If set to None, the layer
             will be named automatically. Default: None.
         path_table: (Variable|None) this variable can store each batch of samples' path to root;
             it should be in leaf -> root order.
             path_table should have the same shape as path_code, and for each sample i, path_table[i] indicates a np.array-like
             structure in which each element is an index into the parent nodes' weight matrix.
         path_code: (Variable|None) this variable can store each batch of samples' code;
             each code consists of the codes of all parent nodes. It should be in leaf -> root order.
         is_custom: (bool|False) use a user-defined binary tree instead of the default complete binary tree; if custom is
             set you need to set path_table/path_code/num_classes, otherwise num_classes should be set.
         is_sparse: (bool|False) use sparse update instead of dense update; if set, the gradient
             of W and input will be sparse.

     Returns:
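In the default mode described above (is_custom=False), only num_classes is required and the complete binary tree is built internally. A minimal sketch:

    import paddle.fluid as fluid

    x = fluid.layers.data(name='x', shape=[2], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='int64')
    # Default complete-binary-tree mode: num_classes must be set,
    # path_table/path_code stay None and is_custom stays False.
    out = fluid.layers.hsigmoid(input=x, label=y, num_classes=6)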
@@ -6965,10 +6965,10 @@ def mean_iou(input, label, num_classes):
         num_classes (int): The possible number of labels.

     Returns:
         mean_iou (Variable), out_wrong (Variable), out_correct (Variable):

                     Three variables:

                     - mean_iou : A Tensor representing the mean intersection-over-union with shape [1].
                     - out_wrong: A Tensor with shape [num_classes]. The wrong numbers of each class.
                     - out_correct: A Tensor with shape [num_classes]. The correct numbers of each class.
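The three return values unpack directly; a sketch with assumed int64 segmentation maps:

    import paddle.fluid as fluid

    predict = fluid.layers.data(name='predict', shape=[32, 32], dtype='int64')
    label = fluid.layers.data(name='label', shape=[32, 32], dtype='int64')
    miou, wrongs, corrects = fluid.layers.mean_iou(predict, label, num_classes=5)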
@@ -7166,7 +7166,7 @@ def affine_grid(theta, out_shape, name=None):
     Args:
         theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
         out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
             ``out_shape`` can be a Variable or a list or tuple.
         name(str|None): A name for this layer (optional). If set to None, the layer
             will be named automatically.
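Since out_shape may be a plain list, the simplest call fixes the target [N, C, H, W] statically (the numbers here are illustrative):

    import paddle.fluid as fluid

    theta = fluid.layers.data(name='theta', shape=[2, 3], dtype='float32')
    grid = fluid.layers.affine_grid(theta, out_shape=[5, 3, 28, 28])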
@@ -7762,9 +7762,9 @@ def flatten(x, axis=1, name=None):
     """
     **Flatten layer**
     Flattens the input tensor into a 2D matrix.

     For Example:

     .. code-block:: text

         Case 1:
@@ -8942,7 +8942,7 @@ def similarity_focus(input, axis, indexes, name=None):
     SimilarityFocus Operator

     Generate a similarity focus mask with the same shape of input using the following method:

     1. Extract the 3-D tensor (here the first dimension is BatchSize) corresponding
        to the axis according to the indexes. For example, if axis=1 and indexes=[a],
        it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
@@ -9713,47 +9713,3 @@ def huber_loss(input, label, delta):
             'Residual': residual},
         attrs={'delta': delta})
     return out
-
-
-class FC(layers.PyLayer):
-    def __init__(self,
-                 size,
-                 param_attr=None,
-                 num_flatten_dims=1,
-                 dtype=core.VarDesc.VarType.FP32):
-        super(FC, self).__init__(param_attr=param_attr)
-        self._size = size
-        self._num_flatten_dims = num_flatten_dims
-        self._dtype = dtype
-        self._tmp = self._helper.create_variable_for_type_inference(self._dtype)
-        self._out = self._helper.create_variable_for_type_inference(self._dtype)
-
-    def _build_once(self, inputs):
-        input_shape = inputs.shape
-        param_shape = [
-            reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
-        ] + [self._size]
-        self._w = self._helper.create_parameter(
-            attr=self._helper.param_attr,
-            shape=param_shape,
-            dtype=self._dtype,
-            is_bias=False)
-
-    def forward(self, inputs):
-        self._helper.append_op(
-            type="mul",
-            inputs={"X": inputs,
-                    "Y": self._w},
-            outputs={"Out": self._tmp},
-            attrs={
-                "x_num_col_dims": self._num_flatten_dims,
-                "y_num_col_dims": 1
-            })
-        self._helper.append_op(
-            type="sum",
-            inputs={"X": [self._tmp]},
-            outputs={"Out": self._out},
-            attrs={"use_mkldnn": False})
-        return self._out
...
@@ -18,7 +18,7 @@ import numpy as np
 import paddle.fluid as fluid
 from paddle.fluid import core
-from paddle.fluid.layers.nn import FC
+from paddle.fluid.imperative.nn import FC
 from test_imperative_base import new_program_scope
...
@@ -74,7 +74,7 @@ class SimpleImgConvPool(fluid.imperative.PyLayer):
 class MNIST(fluid.imperative.PyLayer):
     def __init__(self, param_attr=None, bias_attr=None):
-        super(MNIST, self).__init__(param_attr=param_attr, bias_attr=bias_attr)
+        super(MNIST, self).__init__()

         self._simple_img_conv_pool_1 = SimpleImgConvPool(
             1, 20, 5, 2, 2, act="relu")
...@@ -85,8 +85,7 @@ class MNIST(fluid.imperative.PyLayer): ...@@ -85,8 +85,7 @@ class MNIST(fluid.imperative.PyLayer):
pool_2_shape = 50 * 8 * 8 pool_2_shape = 50 * 8 * 8
SIZE = 10 SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5 scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
self._fc = FC(-1, self._fc = FC(10,
10,
param_attr=fluid.param_attr.ParamAttr( param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer( initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale))) loc=0.0, scale=scale)))
......
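Note: this call-site change follows from the FC rewrite earlier in the commit. The -1 sentinel that used to request lazy input-size inference is gone because inference is now the only behavior:

    from paddle.fluid.imperative.nn import FC

    # Old call shape: FC(-1, 10, ...) -- the -1 sentinel asked for lazy
    # input-size inference. Inference is now unconditional, so the call
    # site shrinks to the output size alone:
    fc = FC(10)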