diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py
index c95b89a2c48d7b69da05365a865d612befa2f808..d78d61eb3f02c27ec44806ae52e134068c2cb9be 100644
--- a/python/paddle/fluid/imperative/layers.py
+++ b/python/paddle/fluid/imperative/layers.py
@@ -24,19 +24,7 @@ __all__ = ['PyLayer']
 
 
 class PyLayer(core.Layer):
-    def __init__(self,
-                 dtype=core.VarDesc.VarType.FP32,
-                 param_attr=None,
-                 bias_attr=None,
-                 name=None):
-        from ..layer_helper import LayerHelper
-        self._helper = LayerHelper(
-            type(self).__name__,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            dtype=dtype,
-            name=name)
-
+    def __init__(self, dtype=core.VarDesc.VarType.FP32, name=None):
         self._once_built = False
         self._dtype = dtype
 
diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py
index 8757670ef82a8ed419edbcbfb12056d0f88200fb..4f30417e99d21bcb66dacaab0257816c4d77f932 100644
--- a/python/paddle/fluid/imperative/nn.py
+++ b/python/paddle/fluid/imperative/nn.py
@@ -46,8 +46,15 @@ class Conv2D(layers.PyLayer):
                  name=None,
                  dtype=core.VarDesc.VarType.FP32):
         assert param_attr is not False, "param_attr should not be False here."
-        super(Conv2D, self).__init__(
-            param_attr=param_attr, bias_attr=bias_attr, name=name, dtype=dtype)
+        super(Conv2D, self).__init__(name=name, dtype=dtype)
+
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper(
+            type(self).__name__,
+            param_attr=param_attr,
+            bias_attr=bias_attr,
+            dtype=dtype,
+            name=name)
 
         self._groups = groups
         self._stride = utils.convert_to_list(stride, 2, 'stride')
@@ -163,6 +170,9 @@ class Pool2D(layers.PyLayer):
 
         super(Pool2D, self).__init__(name=name, dtype=dtype)
 
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper(type(self).__name__, dtype=dtype, name=name)
+
         self._pool_type = pool_type
         self._pool_size = utils.convert_to_list(pool_size, 2, 'pool_size')
         self._pool_padding = utils.convert_to_list(pool_padding, 2,
@@ -197,32 +207,22 @@ class Pool2D(layers.PyLayer):
 
 class FC(layers.PyLayer):
     def __init__(self,
-                 size_in,
-                 size_out,
-                 num_flatten_dims=1,
+                 size,
                  param_attr=None,
+                 num_flatten_dims=1,
                  dtype=core.VarDesc.VarType.FP32):
-        super(FC, self).__init__(param_attr=param_attr, dtype=dtype)
-
-        self._size_in = size_in
-        self._size_out = size_out
+        super(FC, self).__init__()
+        self._size = size
         self._num_flatten_dims = num_flatten_dims
         self._dtype = dtype
-        if self._size_in != -1:
-            self._w = self._helper.create_parameter(
-                attr=self._helper.param_attr,
-                shape=[size_in, size_out],
-                dtype=self._dtype,
-                is_bias=False)
+        from ..layer_helper import LayerHelper
+        self._helper = LayerHelper('FC', param_attr=param_attr)
 
     def _build_once(self, input):
-        if self._size_in != -1:
-            return
-
         input_shape = input.shape
         param_shape = [
             reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
-        ] + [self._size_out]
+        ] + [self._size]
         self._w = self._helper.create_parameter(
             attr=self._helper.param_attr,
             shape=param_shape,
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 81b214898971ba66d94de0e767adf2329943c2be..9572fcb385823eab16d5c44fd56c680e577c8f04 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -502,22 +502,22 @@ def lstm(input,
     If Device is GPU, This op will use cudnn LSTM implementation
 
     A four-gate Long Short-Term Memory network with no peephole connections.
-    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
+    In the forward pass the output ht and cell output ct for a given iteration can be computed from the recurrent input ht-1,
     the cell input ct-1 and the previous layer input xt given matrices W, R and biases bW, bR from the following equations:
 
     .. math::
-
-       i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)
-
-       f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)
-
-       o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)
-
+
+       i_t &= \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + bx_i + bh_i)
+
+       f_t &= \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + bx_f + bh_f)
+
+       o_t &= \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + bx_o + bh_o)
+
        \\tilde{c_t} &= tanh(W_{cx}x_t + W_{ch}h_{t-1} + bx_c + bh_c)
-
-       c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
-
-       h_t &= o_t \odot tanh(c_t)
+
+       c_t &= f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
+
+       h_t &= o_t \odot tanh(c_t)
 
     - $W$ terms denote weight matrices (e.g. $W_{ix}$ is the matrix of weights from the input gate to the input)
 
@@ -531,19 +531,19 @@
     - :math:`\\tilde{c_t}` is also called candidate hidden state,
      which is computed based on the current input and the previous hidden state.
 
-    Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication,
+    Where sigmoid is the sigmoid operator: :math:`sigmoid(x) = 1 / (1 + e^{-x})` , * represents a point-wise multiplication,
     X represensts a matrix multiplication
 
 
     Args:
         input (Variable): LSTM input tensor, shape MUST be ( seq_len x batch_size x input_size )
-        init_h(Variable): The initial hidden state of the LSTM
+        init_h(Variable): The initial hidden state of the LSTM
                        This is a tensor with shape ( num_layers x batch_size x hidden_size)
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
         init_c(Variable): The initial cell state of the LSTM.
                        This is a tensor with shape ( num_layers x batch_size x hidden_size )
                        if is_bidirec = True, shape should be ( num_layers*2 x batch_size x hidden_size)
-        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
+        max_len (int): max length of LSTM. the first dim of input tensor CAN NOT greater than max_len
         hidden_size (int): hidden size of the LSTM
         num_layers (int): total layers number of the LSTM
         dropout_prob(float|0.0): dropout prob, dropout ONLY work between rnn layers, NOT between time steps
@@ -558,18 +558,18 @@
 
 
     Returns:
-        rnn_out(Tensor),last_h(Tensor),last_c(Tensor):
-
+        rnn_out(Tensor),last_h(Tensor),last_c(Tensor):
+
                        Three tensors, rnn_out, last_h, last_c:
-
+
                        - rnn_out is result of LSTM hidden, shape is (seq_len x batch_size x hidden_size) \
                            if is_bidirec set to True, shape will be ( seq_len x batch_sze x hidden_size*2)
                        - last_h is the hidden state of the last step of LSTM \
                            shape is ( num_layers x batch_size x hidden_size ) \
-                           if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
+                           if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
                        - last_c(Tensor): the cell state of the last step of LSTM \
                            shape is ( num_layers x batch_size x hidden_size ) \
-                           if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
+                           if is_bidirec set to True, shape will be ( num_layers*2 x batch_size x hidden_size)
 
 
     Examples:
@@ -1255,7 +1255,7 @@ def dropout(x,
 
                                           (mask is a tensor same shape with input, value is 0 or 1
                                           ratio of 0 is dropout_prob)
-
+
     Returns:
         Variable: A tensor variable is the shape with `x`.
 
@@ -1346,10 +1346,10 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex):
         ValueError:
 
                       1. the 1st dimension of ``input`` and ``label`` are not equal.
-
+
                       2. when ``soft_label == True``, and the 2nd dimension of
                          ``input`` and ``label`` are not equal.
-
+
                       3. when ``soft_label == False``, and the 2nd dimension of
                          ``label`` is not 1.
 
@@ -1471,7 +1471,7 @@
 
     This function computes and outputs the precision, recall and F1-score of
     chunk detection.
-    For some basics of chunking, please refer to
+    For some basics of chunking, please refer to
     `Chunking with Support Vector Machines `_ .
 
     ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
@@ -2306,7 +2306,7 @@ def sequence_slice(input, offset, length, name=None):
              out.lod = [[2, 1]],
              out.dims = (3, 2).
 
-    Note:
+    Note:
         The first dimension size of **input**, **offset** and **length**
         should be equal. The **offset** should start from 0.
 
@@ -4678,7 +4678,7 @@ def ctc_greedy_decoder(input, blank, name=None):
                      [0.5, 0.1, 0.3, 0.1]]
 
         input.lod = [[4, 4]]
-
+
         Computation:
 
         step1: Apply argmax to first input sequence which is input.data[0:4]. Then we get:
@@ -4712,7 +4712,7 @@ def ctc_greedy_decoder(input, blank, name=None):
         Variable: CTC greedy decode result which is a 2-D tensor with shape [Lp, 1]. \
                   'Lp' is the sum if all output sequences' length. If all the sequences \
                   in result were empty, the result LoDTensor will be [-1] with \
-                  LoD [[]] and dims [1, 1].
+                  LoD [[]] and dims [1, 1].
 
     Examples:
         .. code-block:: python
@@ -5065,7 +5065,7 @@ def hsigmoid(input,
     """
    The hierarchical sigmoid operator is used to accelerate the training
    process of language model. This operator organizes the classes into a
-    complete binary tree, or you can use is_custom to pass your own tree to
+    complete binary tree, or you can use is_custom to pass your own tree to
    implement hierarchical. Each leaf node represents a class(a word) and each
    internal node acts as a binary classifier. For each word there's a unique
    path from root to it's leaf node, hsigmoid calculate the cost for each
@@ -5082,7 +5082,7 @@
    2. build a dict to store word_id -> word's leaf to root path, we call it path_table.
    3. build a dict to store word_id -> code of word's leaf to root path, we call it path_code.
       Code means label of each binary classification, using 1 indicate true, 0 indicate false.
-    4. now, each word should has its path and code along the path, you can pass a batch of path and code
+    4. now, each word should has its path and code along the path, you can pass a batch of path and code
       related to the same batch of inputs.
 
    Args:
@@ -5091,8 +5091,8 @@ def hsigmoid(input,
            and :math:`D` is the feature size.
        label (Variable): The tensor variable contains labels of training data.
            It's a tensor with shape is :math:`[N \\times 1]`.
-        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
-            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
+        num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set,
+            it should never be None under is_custom=False, but while is_custom is true, it should be non leaf num
            which indicates the num of classes using by binary classify.
        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
            of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid
@@ -5105,15 +5105,15 @@ def hsigmoid(input,
            is not set, the bias is initialized zero. Default: None.
        name (str|None): A name for this layer(optional). If set None, the layer
            will be named automatically. Default: None.
-        path_table: (Variable|None) this variable can store each batch of samples' path to root,
+        path_table: (Variable|None) this variable can store each batch of samples' path to root,
            it should be in leaf -> root order
-        path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
-            structure and each element in this array is indexes in parent nodes' Weight Matrix.
-        path_code: (Variable|None) this variable can store each batch of samples' code,
+        path_table should have the same shape with path_code, and for each sample i path_table[i] indicates a np.array like
+            structure and each element in this array is indexes in parent nodes' Weight Matrix.
+        path_code: (Variable|None) this variable can store each batch of samples' code,
            each code consist with every code of parent nodes. it should be in leaf -> root order
-        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is
+        is_custom: (bool|False)using user defined binary tree instead of default complete binary tree, if costum is
            set you need to set path_table/path_code/num_classes, otherwise num_classes should be set
-        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
+        is_sparse: (bool|False)using sparse update instead of dense update, if set, the gradient
            of W and input will be sparse.
 
    Returns:
@@ -6965,10 +6965,10 @@ def mean_iou(input, label, num_classes):
         num_classes (int): The possible number of labels.
 
     Returns:
-        mean_iou (Variable),out_wrong(Variable),out_correct(Variable):
-
+        mean_iou (Variable),out_wrong(Variable),out_correct(Variable):
+
                     Three variables:
-
+
                     - mean_iou : A Tensor representing the mean intersection-over-union with shape [1].
                     - out_wrong: A Tensor with shape [num_classes]. The wrong numbers of each class.
                     - out_correct: A Tensor with shape [num_classes]. The correct numbers of each class.
@@ -7166,7 +7166,7 @@ def affine_grid(theta, out_shape, name=None):
 
     Args:
         theta (Variable): A batch of affine transform parameters with shape [N, 2, 3].
-        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
+        out_shape (Variable | list | tuple): The shape of target output with format [N, C, H, W].
                                              ``out_shape`` can be a Variable or a list or tuple.
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
@@ -7762,9 +7762,9 @@ def flatten(x, axis=1, name=None):
     """
     **Flatten layer**
     Flattens the input tensor into a 2D matrix.
-
+
     For Example:
-
+
     .. code-block:: text
 
         Case 1:
@@ -8942,7 +8942,7 @@ def similarity_focus(input, axis, indexes, name=None):
     SimilarityFocus Operator
 
     Generate a similarity focus mask with the same shape of input using the following method:
-
+
     1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
        to the axis according to the indexes. For example, if axis=1 and indexes=[a],
       it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
@@ -9713,47 +9713,3 @@ def huber_loss(input, label, delta):
                 'Residual': residual},
         attrs={'delta': delta})
     return out
-
-
-class FC(layers.PyLayer):
-    def __init__(self,
-                 size,
-                 param_attr=None,
-                 num_flatten_dims=1,
-                 dtype=core.VarDesc.VarType.FP32):
-        super(FC, self).__init__(param_attr=param_attr)
-        self._size = size
-        self._num_flatten_dims = num_flatten_dims
-        self._dtype = dtype
-        self._tmp = self._helper.create_variable_for_type_inference(self._dtype)
-        self._out = self._helper.create_variable_for_type_inference(self._dtype)
-
-    def _build_once(self, inputs):
-        input_shape = inputs.shape
-        param_shape = [
-            reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
-        ] + [self._size]
-        self._w = self._helper.create_parameter(
-            attr=self._helper.param_attr,
-            shape=param_shape,
-            dtype=self._dtype,
-            is_bias=False)
-
-    def forward(self, inputs):
-        self._helper.append_op(
-            type="mul",
-            inputs={"X": inputs,
-                    "Y": self._w},
-            outputs={"Out": self._tmp},
-            attrs={
-                "x_num_col_dims": self._num_flatten_dims,
-                "y_num_col_dims": 1
-            })
-
-        self._helper.append_op(
-            type="sum",
-            inputs={"X": [self._tmp]},
-            outputs={"Out": self._out},
-            attrs={"use_mkldnn": False})
-
-        return self._out
diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py
index f717801baea5f5d0db4fa7f683906708e9038d69..1dc13ec74e8da1f13d447950b3c7822bbbecb2a7 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative.py
@@ -18,7 +18,7 @@ import numpy as np
 
 import paddle.fluid as fluid
 from paddle.fluid import core
-from paddle.fluid.layers.nn import FC
+from paddle.fluid.imperative.nn import FC
 from test_imperative_base import new_program_scope
 
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
index e9dd158295e0d27de4173b6398f72ce730dc8d74..5d97edf8768d8d2cf1ba7f826fa4d588c30f2aee 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -74,7 +74,7 @@ class SimpleImgConvPool(fluid.imperative.PyLayer):
 
 class MNIST(fluid.imperative.PyLayer):
     def __init__(self, param_attr=None, bias_attr=None):
-        super(MNIST, self).__init__(param_attr=param_attr, bias_attr=bias_attr)
+        super(MNIST, self).__init__()
 
         self._simple_img_conv_pool_1 = SimpleImgConvPool(
             1, 20, 5, 2, 2, act="relu")
@@ -85,8 +85,7 @@ class MNIST(fluid.imperative.PyLayer):
         pool_2_shape = 50 * 8 * 8
         SIZE = 10
         scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
-        self._fc = FC(-1,
-                      10,
+        self._fc = FC(10,
                       param_attr=fluid.param_attr.ParamAttr(
                           initializer=fluid.initializer.NormalInitializer(
                               loc=0.0, scale=scale)))
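
Not part of the patch itself: the sketch below illustrates how the relocated imperative FC layer is intended to be used after this change. It follows the new constructor signature (a single output size; the weight shape is derived from the first input inside _build_once) and the import path exercised by test_imperative.py. It assumes the fluid.imperative.guard() context manager and the to_variable helper that the existing imperative tests already rely on; the input data and initializer scale are illustrative only.

import numpy as np

import paddle.fluid as fluid
from paddle.fluid.imperative.nn import FC  # import path introduced by this patch


with fluid.imperative.guard():
    # FC now takes only the output size; no size_in / -1 placeholder is needed,
    # because _build_once derives the weight shape from the first input it sees.
    fc = FC(10,
            param_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.NormalInitializer(
                    loc=0.0, scale=0.1)))

    x = fluid.imperative.base.to_variable(
        np.random.rand(4, 784).astype('float32'))
    out = fc(x)  # a [784, 10] weight parameter is created lazily on this call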