merge branch, test=develop

60e3e355 · lujun · e97ded83 · 423bc515 · 60e3e355 · 60e3e355
14 changed file
--- a/paddle/fluid/framework/details/memory_optimize_helper.cc
+++ b/paddle/fluid/framework/details/memory_optimize_helper.cc
@@ -131,16 +131,7 @@ size_t NodeSize(const VarDesc& node) {
  return type_size * std::abs(size);
 }

-size_t NodeSize(ir::Node* n) {
-  VarDesc* desc = nullptr;
-  // some op do not have block pointer
-  if (n->inputs[0]->Op() != nullptr) {
-    desc = FindVarDescInBlock(n);
-  } else {
-    desc = n->Var();
-  }
-  return NodeSize(*desc);
-}
+// Size of the variable held by `n`, computed by NodeSize(const VarDesc&) from
+// the node's own VarDesc.
+// NOTE(review): n->Var() is dereferenced unchecked -- presumably callers only
+// pass variable nodes; confirm.
+size_t NodeSize(ir::Node* n) { return NodeSize(*(n->Var())); }

 std::string DebugStringImpl(VarDesc* var) {
  std::stringstream ss;
@@ -163,24 +154,22 @@ std::string DebugStringImpl(VarDesc* var) {
 }

 std::string DebugString(ir::Node* var) {
-  return DebugStringImpl(FindVarDescInBlock(var));
+  // Format the node through its VarDesc; GetVarDesc enforces that `var` is a
+  // non-control variable node with exactly one input.
+  return DebugStringImpl(GetVarDesc(var));
 }

 // NOTE(dzh): based on the ir node, if a large node has been reused
 // by a small-size node, then the next time it appears in the pool it
 // will have the small size. Find the original node shape from the BlockDesc.
-VarDesc* FindVarDescInBlock(ir::Node* n) {
+// Returns the VarDesc attached to `n` after enforcing that `n` is a
+// non-control variable node with exactly one input.
+VarDesc* GetVarDesc(ir::Node* n) {
  PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1);
-  BlockDesc* block = n->inputs[0]->Op()->Block();
-  PADDLE_ENFORCE(block->HasVar(n->Name()),
-                 string::Sprintf("Block do not has var %s", n->Name()));
-  return block->FindVar(n->Name());
+  return n->Var();
 }

 struct NodeComparator {
  bool operator()(ir::Node* lhs, ir::Node* rhs) const {
-    auto* lhs_desc = FindVarDescInBlock(lhs);
-    auto* rhs_desc = FindVarDescInBlock(rhs);
+    if (lhs->Var()->GetType() != rhs->Var()->GetType()) return false;
+    auto* lhs_desc = GetVarDesc(lhs);
+    auto* rhs_desc = GetVarDesc(rhs);
    // match data type
    if (lhs_desc->GetDataType() != rhs_desc->GetDataType()) {
      return false;
@@ -204,7 +193,7 @@ void OrderedSet::Insert(ir::Node* var) {
    return;
  }

-  auto* var_desc = FindVarDescInBlock(var);
+  auto* var_desc = var->Var();
  auto var_shape = var_desc->GetShape();
  int batch_size = static_cast<int>(var_shape[0]);

@@ -212,7 +201,7 @@ void OrderedSet::Insert(ir::Node* var) {
  Iter it = nodes_.begin();
  while (it != nodes_.end()) {
    auto& prev = it->front();
-    auto* cache_desc = FindVarDescInBlock(prev);
+    auto* cache_desc = GetVarDesc(prev);
    int cache_batch_size = cache_desc->GetShape()[0];
    if ((cache_batch_size == -1 && batch_size == -1) ||
        (cache_batch_size != -1 && batch_size != -1)) {
@@ -336,10 +325,16 @@ int MinChunkSize() {
 bool NodeCanReused(const VarDesc& node) {
  auto type = node.GetType();
  // only these types holds bulk of gpu memory
-  if (!(type == proto::VarType::LOD_TENSOR ||
-        type == proto::VarType::LOD_TENSOR_ARRAY)) {
-    return false;
-  }
+  // FIXME(liuwei1031) did not find good ways to test SELECTED_ROWS and
+  // LOD_TENSOR_ARRAY re-use logic,
+  // disable them in version 1.4
+  // if (!(type == proto::VarType::LOD_TENSOR ||
+  //       type == proto::VarType::SELECTED_ROWS ||
+  //       type == proto::VarType::LOD_TENSOR_ARRAY)) {
+  //   return false;
+  // }
+  if (type != proto::VarType::LOD_TENSOR) return false;
+
  // persistable variable is parameter
  if (node.Persistable()) {
    return false;

--- a/paddle/fluid/framework/details/memory_optimize_helper.h
+++ b/paddle/fluid/framework/details/memory_optimize_helper.h
@@ -20,6 +20,7 @@
 #include <map>
 #include <set>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
@@ -140,11 +141,7 @@ size_t NodeSize(const VarDesc&);

 std::string DebugString(ir::Node* var);

-// NOTE(dzhwinter)
-// after node reuse, the replaced node shape is
-// different with its VarDesc. So need to find the
-// correct VarDesc in Block.
-VarDesc* FindVarDescInBlock(ir::Node* n);
+VarDesc* GetVarDesc(ir::Node* n);

 static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) {
  return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() &&

--- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
+++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc
@@ -84,7 +84,8 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const {

  // 1. record op nodes of different roles
  for (auto node : nodes) {
-    if (node->IsVar()) continue;
+    if (!node->IsOp()) continue;
+    PADDLE_ENFORCE(node->Op(), "must find opdesc");
    int op_role = boost::get<int>(node->Op()->GetAttr(
        framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
    if ((op_role == static_cast<int>(framework::OpRole::kForward)) ||

--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -122,14 +122,14 @@ class Autograd {
      std::map<std::string, std::vector<VarBase*>> input_grads =
          ready_op->ApplyGrad();

-      for (auto it : input_grads) {
-        const std::vector<VarBase*>& ingrads = it.second;
+      for (auto it = input_grads.rbegin(); it != input_grads.rend(); ++it) {
+        const std::vector<VarBase*>& ingrads = it->second;
        for (size_t i = 0; i < ingrads.size(); ++i) {
          if (!ingrads[i]) continue;
-          if (ready_op->input_vars_[it.first][i]->IsStopGradient()) {
+          if (ready_op->input_vars_[it->first][i]->IsStopGradient()) {
            continue;
          }
-          OpBase* pre_op = ready_op->pre_ops_[it.first][i];
+          OpBase* pre_op = ready_op->pre_ops_[it->first][i];
          if (!pre_op) continue;

          dep_counts[pre_op] -= 1;

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -148,20 +148,20 @@ inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_con
 if(WITH_MKLDNN)
  set(INT8_DATA_DIR "${INFERENCE_DEMO_INSTALL_DIR}/int8")
  if (NOT EXISTS ${INT8_DATA_DIR})
-    inference_download_and_uncompress(${INT8_DATA_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "imagenet_val_100.tar.gz")
+    # Quote the whole URL: ${VAR}"text" is a legacy unquoted argument form,
+    # whereas "${VAR}/text" is an ordinary quoted argument.
+    inference_download_and_uncompress(${INT8_DATA_DIR} "${INFERENCE_URL}/int8" "imagenet_val_100.tar.gz")
  endif()

  #resnet50 int8
  set(INT8_RESNET50_MODEL_DIR "${INT8_DATA_DIR}/resnet50")
  if (NOT EXISTS ${INT8_RESNET50_MODEL_DIR})
-    inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "resnet50_int8_model.tar.gz" )
+    inference_download_and_uncompress(${INT8_RESNET50_MODEL_DIR} "${INFERENCE_URL}/int8" "resnet50_int8_model.tar.gz" )
  endif()
  inference_analysis_api_int8_test(test_analyzer_int8_resnet50 ${INT8_RESNET50_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)

  #mobilenet int8
  set(INT8_MOBILENET_MODEL_DIR "${INT8_DATA_DIR}/mobilenet")
  if (NOT EXISTS ${INT8_MOBILENET_MODEL_DIR})
-    inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "https://paddle-inference-dist.bj.bcebos.com/int8" "mobilenetv1_int8_model.tar.gz" )
+    inference_download_and_uncompress(${INT8_MOBILENET_MODEL_DIR} "${INFERENCE_URL}/int8" "mobilenetv1_int8_model.tar.gz" )
  endif()
  inference_analysis_api_int8_test(test_analyzer_int8_mobilenet ${INT8_MOBILENET_MODEL_DIR} ${INT8_DATA_DIR} analyzer_int8_image_classification_tester.cc SERIAL)
 endif()

--- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
+++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
 #   copyright (c) 2019 paddlepaddle authors. all rights reserved.
-#
 # licensed under the apache license, version 2.0 (the "license");
 # you may not use this file except in compliance with the license.
 # you may obtain a copy of the license at
@@ -11,6 +10,7 @@
 # without warranties or conditions of any kind, either express or implied.
 # see the license for the specific language governing permissions and
 # limitations under the license.
+import hashlib
 import unittest
 import os
 import numpy as np
@@ -21,16 +21,20 @@ import functools
 import contextlib
 from PIL import Image, ImageEnhance
 import math
-from paddle.dataset.common import download
+from paddle.dataset.common import download, md5file
+import tarfile

 random.seed(0)
 np.random.seed(0)

 DATA_DIM = 224
-
 SIZE_FLOAT32 = 4
 SIZE_INT64 = 8
-
+FULL_SIZE_BYTES = 30106000008
+FULL_IMAGES = 50000
+DATA_DIR_NAME = 'ILSVRC2012'
+IMG_DIR_NAME = 'var'
+TARGET_HASH = '8dc592db6dcc8d521e4d5ba9da5ca7d2'
 img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
 img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))

@@ -70,19 +74,9 @@ def process_image(img_path, mode, color_jitter, rotate):
    return img


-def download_unzip():
-    int8_download = 'int8/download'
-
-    target_name = 'data'
-
-    cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' +
-                                      int8_download)
-
-    target_folder = os.path.join(cache_folder, target_name)
-
+def download_concat(cache_folder, zip_path):
+    """Download both archive parts into cache_folder and join them.
+
+    The parts are concatenated into zip_path only when zip_path does not
+    already exist.
+    """
    data_urls = []
    data_md5s = []
-
    data_urls.append(
        'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partaa'
    )
@@ -91,49 +85,80 @@ def download_unzip():
        'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab'
    )
    data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5')
-
    file_names = []
-
+    print("Downloading full ImageNet Validation dataset ...")
    for i in range(0, len(data_urls)):
        download(data_urls[i], cache_folder, data_md5s[i])
-        file_names.append(data_urls[i].split('/')[-1])
-
-    zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz')
-
+        file_name = os.path.join(cache_folder, data_urls[i].split('/')[-1])
+        file_names.append(file_name)
+        print("Downloaded part {0}\n".format(file_name))
    if not os.path.exists(zip_path):
-        cat_command = 'cat'
-        for file_name in file_names:
-            cat_command += ' ' + os.path.join(cache_folder, file_name)
-        cat_command += ' > ' + zip_path
-        os.system(cat_command)
-        print('Data is downloaded at {0}\n').format(zip_path)
-
-    if not os.path.exists(target_folder):
-        cmd = 'mkdir {0} && tar xf {1} -C {0}'.format(target_folder, zip_path)
-        os.system(cmd)
-        print('Data is unzipped at {0}\n'.format(target_folder))
-
-    data_dir = os.path.join(target_folder, 'ILSVRC2012')
-    print('ILSVRC2012 full val set at {0}\n'.format(data_dir))
-    return data_dir
+        # The parts are slices of a .tar.gz file: copy them as bytes, in
+        # bounded chunks, so nothing is decoded as text or held in memory whole.
+        with open(zip_path, "wb") as outfile:
+            for fname in file_names:
+                with open(fname, "rb") as infile:
+                    for chunk in iter(lambda: infile.read(1 << 20), b""):
+                        outfile.write(chunk)
+
+
+def extract(zip_path, extract_folder):
+    """Unpack the archive at zip_path into extract_folder unless already done.
+
+    Extraction is skipped when the IMG_DIR_NAME directory under
+    extract_folder/DATA_DIR_NAME exists and holds exactly FULL_IMAGES entries.
+    """
+    data_dir = os.path.join(extract_folder, DATA_DIR_NAME)
+    img_dir = os.path.join(data_dir, IMG_DIR_NAME)
+    print("Extracting...\n")
+
+    if not (os.path.exists(img_dir) and
+            len(os.listdir(img_dir)) == FULL_IMAGES):
+        # The context manager closes the archive even if extractall() raises.
+        with tarfile.open(zip_path) as tar:
+            tar.extractall(path=extract_folder)
+    print('Extracted. Full Imagenet Validation dataset is located at {0}\n'.
+          format(data_dir))
+
+
+def print_processbar(done, total):
+    """Redraw a progress bar of `total` cells, `done` of them filled.
+
+    The bar is written to stdout after a carriage return, followed by the
+    completed percentage, and stdout is flushed.
+    """
+    # Callers compute these values with `/`, which yields floats on Python 3;
+    # multiplying a str by a float raises TypeError, so use whole cells.
+    done = int(done)
+    total = int(total)
+    done_filled = done * '='
+    empty_filled = (total - done) * ' '
+    percentage_done = done * 100 / total
+    sys.stdout.write("\r[%s%s]%d%%" %
+                     (done_filled, empty_filled, percentage_done))
+    sys.stdout.flush()
+
+
+def check_integrity(filename, target_hash):
+    """Return True when the MD5 hex digest of `filename` equals `target_hash`.
+
+    The file is hashed in chunk_size-byte reads; a progress bar of
+    total_parts cells, scaled to FULL_SIZE_BYTES, is redrawn along the way.
+    """
+    print('\nThe binary file exists. Checking file integrity...\n')
+    md = hashlib.md5()
+    count = 0
+    total_parts = 50
+    chunk_size = 8192
+    # Integer division keeps the modulo test below exact on Python 3.
+    onepart = FULL_SIZE_BYTES // chunk_size // total_parts
+    # Hash raw bytes: text mode would try to decode the binary content.
+    with open(filename, 'rb') as ifs:
+        while True:
+            buf = ifs.read(chunk_size)
+            if count % onepart == 0:
+                done = count // onepart
+                print_processbar(done, total_parts)
+            count = count + 1
+            if not buf:
+                break
+            md.update(buf)
+    return md.hexdigest() == target_hash


-def reader():
-    data_dir = download_unzip()
-    file_list = os.path.join(data_dir, 'val_list.txt')
-    output_file = os.path.join(data_dir, 'int8_full_val.bin')
+def convert(file_list, data_dir, output_file):
+    print('Converting 50000 images to binary file ...\n')
    with open(file_list) as flist:
        lines = [line.strip() for line in flist]
        num_images = len(lines)
-        if not os.path.exists(output_file):
-            print(
-                'Preprocessing to binary file...<num_images><all images><all labels>...\n'
-            )
-            with open(output_file, "w+b") as of:
+        with open(output_file, "w+b") as ofs:
            #save num_images(int64_t) to file
-                of.seek(0)
+            ofs.seek(0)
            num = np.array(int(num_images)).astype('int64')
-                of.write(num.tobytes())
+            ofs.write(num.tobytes())
+            per_parts = 1000
+            full_parts = FULL_IMAGES / per_parts
+            print_processbar(0, full_parts)
            for idx, line in enumerate(lines):
                img_path, label = line.split()
                img_path = os.path.join(data_dir, img_path)
@@ -144,19 +169,54 @@ def reader():
                img = process_image(
                    img_path, 'val', color_jitter=False, rotate=False)
                np_img = np.array(img)
-                    of.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3
-                            * idx)
-                    of.write(np_img.astype('float32').tobytes())
+                ofs.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3 *
+                         idx)
+                ofs.write(np_img.astype('float32').tobytes())
+                ofs.flush()

                #save label(int64_t) to file
                label_int = (int)(label)
                np_label = np.array(label_int)
-                    of.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3
-                            * num_images + idx * SIZE_INT64)
-                    of.write(np_label.astype('int64').tobytes())
-
-        print('The preprocessed binary file path {}\n'.format(output_file))
+                ofs.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3 *
+                         num_images + idx * SIZE_INT64)
+                ofs.write(np_label.astype('int64').tobytes())
+                ofs.flush()
+                if (idx + 1) % per_parts == 0:
+                    done = (idx + 1) / per_parts
+                    print_processbar(done, full_parts)
+    print("Conversion finished.")
+
+
+def run_convert():
+    """Download, extract and convert the dataset until the binary file is valid.
+
+    The output file is accepted when it exists, has FULL_SIZE_BYTES bytes and
+    passes check_integrity against TARGET_HASH. Otherwise it is removed and
+    rebuilt, at most try_limit times.
+
+    Raises:
+        RuntimeError: if the file is still invalid after try_limit attempts.
+    """
+    print('Start to download and convert 50000 images to binary file...')
+    cache_folder = os.path.expanduser('~/.cache/paddle/dataset/int8/download')
+    extract_folder = os.path.join(cache_folder, 'full_data')
+    data_dir = os.path.join(extract_folder, DATA_DIR_NAME)
+    file_list = os.path.join(data_dir, 'val_list.txt')
+    zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz')
+    output_file = os.path.join(cache_folder, 'int8_full_val.bin')
+    retry = 0
+    try_limit = 3
+
+    # The cheap existence and size checks run before the full MD5 pass.
+    while not (os.path.exists(output_file) and
+               os.path.getsize(output_file) == FULL_SIZE_BYTES and
+               check_integrity(output_file, TARGET_HASH)):
+        if os.path.exists(output_file):
+            # The message has no placeholder, so it is written verbatim.
+            sys.stderr.write(
+                "\n\nThe existing binary file is broken. Start to generate new one...\n\n"
+            )
+            os.remove(output_file)
+        if retry < try_limit:
+            retry = retry + 1
+        else:
+            raise RuntimeError(
+                "Can not convert the dataset to binary file with try limit {0}".
+                format(try_limit))
+        download_concat(cache_folder, zip_path)
+        extract(zip_path, extract_folder)
+        convert(file_list, data_dir, output_file)
+    print("\nSuccess! The binary file can be found at {0}".format(output_file))


 if __name__ == '__main__':
-    reader()
+    run_convert()
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -11,7 +11,7 @@ function(inference_download INSTALL_DIR URL FILENAME)
      ${EXTERNAL_PROJECT_LOG_ARGS}
      PREFIX                ${INSTALL_DIR}
      URL                   ${URL}/${FILENAME}
-      DOWNLOAD_COMMAND      wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME}
+      DOWNLOAD_COMMAND      wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME}
      DOWNLOAD_DIR          ${INSTALL_DIR}
      DOWNLOAD_NO_PROGRESS  1
      CONFIGURE_COMMAND     ""
@@ -30,7 +30,7 @@ function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
      ${EXTERNAL_PROJECT_NAME}
      ${EXTERNAL_PROJECT_LOG_ARGS}
      PREFIX                ${INSTALL_DIR}
-      DOWNLOAD_COMMAND      wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
+      DOWNLOAD_COMMAND      wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
                            ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
      DOWNLOAD_DIR          ${INSTALL_DIR}
      DOWNLOAD_NO_PROGRESS  1

--- a/paddle/fluid/operators/dgc_clip_by_norm_op.h
+++ b/paddle/fluid/operators/dgc_clip_by_norm_op.h
@@ -24,19 +24,22 @@ class DGCClipByNormKernel : public ClipByNormKernel<DeviceContext, T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto rampup_begin_step = context.Attr<float>("rampup_begin_step");
-    if (static_cast<int>(rampup_begin_step) >= 0) {
-      auto current_step_tensor =
-          context.Input<framework::Tensor>("current_step");
+    if (static_cast<int>(rampup_begin_step) < 0) {
+      return;
+    }
+
+    auto current_step_tensor = context.Input<framework::Tensor>("current_step");
    auto* current_step = current_step_tensor->data<T>();

-      if (static_cast<int>(*current_step) <
-          static_cast<int>(rampup_begin_step)) {
+    VLOG(10) << "current_step:" << *current_step
+             << ", rampup_begin_step:" << rampup_begin_step;
+
+    if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
      VLOG(10) << "current_step:" << *current_step
               << " < rampup_begin_step:" << rampup_begin_step
               << " so does't use dgc_clip_by_norm";
      return;
    }
-    }

    return ClipByNormKernel<DeviceContext, T>::Compute(context);
  };

--- a/python/paddle/fluid/dygraph/layer_object_helper.py
+++ b/python/paddle/fluid/dygraph/layer_object_helper.py
@@ -65,7 +65,7 @@ class LayerObjectHelper(LayerHelperBase):
    def _input(self, inputs_in):
        inputs = self._multiple_input(inputs_in)
        if len(inputs) != 1:
-            raise "{0} layer only takes one input".format(self.layer_type)
+            # A bare string cannot be raised (that is itself a TypeError);
+            # raise ValueError and name the layer like the sibling checks do.
+            raise ValueError("{0} layer only takes one input in {1}".format(
+                self.layer_type, self.name))
        return inputs[0]

    def _multiple_param_attr(self, length, param_attr_in=None):
@@ -74,7 +74,8 @@ class LayerObjectHelper(LayerHelperBase):
            param_attr = [param_attr]

        if len(param_attr) != 1 and len(param_attr) != length:
-            raise ValueError("parameter number mismatch")
+            raise ValueError("parameter number mismatch in {}".format(
+                self.name))
        elif len(param_attr) == 1 and length != 1:
            tmp = [None] * length
            for i in six.moves.range(length):
@@ -91,6 +92,10 @@ class LayerObjectHelper(LayerHelperBase):

        Returns input, param_attr
        """
+        param_attr_in = ParamAttr._to_attr(param_attr_in)
+        if isinstance(param_attr_in, bool):
+            raise ValueError('Param_attr should not be False in {}'.format(
+                self.name))
        inputs = inputs_in if (inputs_in is not None) else []
        inputs = self._multiple_input(inputs)
        param_attrs = self._multiple_param_attr(len(inputs), param_attr_in)
@@ -112,8 +117,8 @@ class LayerObjectHelper(LayerHelperBase):
            if dtype is None:
                dtype = each.dtype
            elif dtype != each.dtype:
-                raise ValueError("Data Type mismatch: %d to %d" %
-                                 (dtype, each.dtype))
+                raise ValueError("Data Type mismatch: %d to %d in %s" %
+                                 (dtype, each.dtype, self.name))
        return dtype

    def get_parameter(self, name):
@@ -126,7 +131,8 @@ class LayerObjectHelper(LayerHelperBase):
        """
        param = self.main_program.global_block().var(name)
        if not isinstance(param, Parameter):
-            raise ValueError("no Parameter name %s found" % name)
+            raise ValueError("no Parameter name %s found in %s" %
+                             (name, self.name))
        return param

    def append_bias_op(self,
@@ -184,7 +190,8 @@ class LayerObjectHelper(LayerHelperBase):
        if isinstance(act, six.string_types):
            act = {'type': act}
        else:
-            raise TypeError(str(act) + " should be unicode or str")
+            raise TypeError(
+                str(act) + " should be unicode or str in %s ", self.name)

        if (use_cudnn is not None) and use_cudnn:
            act['use_cudnn'] = use_cudnn
@@ -211,5 +218,6 @@ class LayerObjectHelper(LayerHelperBase):
        """
        param = param
        if not isinstance(param, cls):
-            raise TypeError("The input {0} parameter of method {1} must be {2}",
-                            param, self.layer_type, cls.__name__)
+            raise TypeError(
+                "The input {0} parameter of method {1} must be {2}, in layer {3}",
+                param, self.layer_type, cls.__name__, self.name)
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -20,7 +20,7 @@ import numpy as np
 from .. import core
 from ..layers import utils
 from . import layers
-from ..framework import Variable, OpProtoHolder
+from ..framework import Variable, OpProtoHolder, Parameter
 from ..layers import layer_function_generator
 from ..param_attr import ParamAttr
 from ..initializer import Normal, Constant, NumpyArrayInitializer
@@ -460,44 +460,67 @@ class FC(layers.Layer):
        self._param_attr = param_attr
        self._bias_attr = bias_attr
        self._act = act
+        self.__w = list()
+
+    @property
+    def _w(self, i=0):
+        # Attribute access on a property never supplies `i`, so this always
+        # reads self.__w[0].
+        return self.__w[i]
+
+    @_w.setter
+    def _w(self, value, i=0):
+        # Property assignment passes only `value`, so `i` stays 0: this
+        # overwrites the first stored weight and cannot grow the list.
+        # NOTE(review): assigning before any weight has been appended would
+        # raise IndexError -- confirm that is intended.
+        assert isinstance(value, Parameter)
+        self.__w[i] = value

    def _build_once(self, input):
-        input_shape = input.shape
+        i = 0
+        for inp, param in self._helper.iter_inputs_and_params(input,
+                                                              self._param_attr):
+            input_shape = inp.shape
+
            param_shape = [
-            reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:], 1)
+                reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
+                       1)
            ] + [self._size]
-        self._w = self.create_parameter(
-            attr=self._param_attr,
+            self.__w.append(
+                self.add_parameter(
+                    '_w%d' % i,
+                    self.create_parameter(
+                        attr=param,
                        shape=param_shape,
                        dtype=self._dtype,
-            is_bias=False)
+                        is_bias=False)))
+            i += 1

-        if self._bias_attr:
        size = list([self._size])
        self._b = self.create_parameter(
-                attr=self._bias_attr,
-                shape=size,
-                dtype=self._dtype,
-                is_bias=True)
-        else:
-            self._b = None
+            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)

    def forward(self, input):
+        mul_results = list()
+        i = 0
+        for inp, param in self._helper.iter_inputs_and_params(input,
+                                                              self._param_attr):
            tmp = self._helper.create_variable_for_type_inference(self._dtype)
            self._helper.append_op(
                type="mul",
-            inputs={"X": input,
-                    "Y": self._w},
+                inputs={"X": inp,
+                        "Y": self.__w[i]},
                outputs={"Out": tmp},
                attrs={
                    "x_num_col_dims": self._num_flatten_dims,
                    "y_num_col_dims": 1
                })
+            i += 1
+            mul_results.append(tmp)

-        pre_bias = self._helper.create_variable_for_type_inference(self._dtype)
+        if len(mul_results) == 1:
+            pre_bias = mul_results[0]
+        else:
+            pre_bias = self._helper.create_variable_for_type_inference(
+                self._dtype)
            self._helper.append_op(
                type="sum",
-            inputs={"X": [tmp]},
+                inputs={"X": mul_results},
                outputs={"Out": pre_bias},
                attrs={"use_mkldnn": False})


--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -493,7 +493,8 @@ class Variable(object):
        self._ivar._run_backward()

    def gradient(self):
-        return np.array(self._ivar._grad_value())
+        # Copy the gradient variable to a CPUPlace first, then wrap its tensor
+        # in a numpy array. NOTE(review): the `True` argument is presumably a
+        # "blocking" flag -- confirm against the _copy_to binding.
+        new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
+        return np.array(new_ivar.value().get_tensor())

    def clear_gradient(self):
        self._ivar._clear_gradient()

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -832,7 +832,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
            type=x.type, name=name, dtype=x.dtype, persistable=False)

        helper.append_op(
-            type="clip_by_norm",
+            type="dgc_clip_by_norm",
            inputs={"X": x,
                    "current_step": self._global_step_var},
            attrs={
@@ -845,7 +845,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
    def _append_clip_norm(self, grad_var, clip_norm):
        with grad_var.block.program._backward_role_guard():
            return self._clip_by_norm(
-                x=grad_var, max_norm=clip_norm, name=grad_var.name + "@DGC")
+                x=grad_var, max_norm=clip_norm, name=grad_var.name)

    def _dgc_op(self, param_var, clip_var, grad_var, u_var, v_var, k_var,
                encoded_var):

--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
@@ -304,7 +304,7 @@ use_py_reader = False
 sync = False

 # how many batches we use
-batch_num = 2
+batch_num = 50

 np.random.seed = 1
 src_word_np = np.random.randint(
@@ -1076,19 +1076,19 @@ class TestDygraphTransformer(unittest.TestCase):
                        static_param_updated[static_param_name_list[k -
                                                                    4]] = out[k]

-        self.assertTrue(np.allclose(static_avg_cost_value, dy_avg_cost.numpy()))
-        self.assertTrue(np.allclose(static_sum_cost_value, dy_sum_cost.numpy()))
        self.assertTrue(
-            np.allclose(
-                static_predict_value, dy_predict.numpy(), atol=1e-5))
+            np.array_equal(static_avg_cost_value, dy_avg_cost.numpy()))
        self.assertTrue(
-            np.allclose(static_token_num_value, dy_token_num.numpy()))
+            np.array_equal(static_sum_cost_value, dy_sum_cost.numpy()))
+        self.assertTrue(
+            np.array_equal(static_predict_value, dy_predict.numpy()))
+        self.assertTrue(
+            np.array_equal(static_token_num_value, dy_token_num.numpy()))
+
        for key, value in six.iteritems(static_param_init):
-            self.assertTrue(np.allclose(value, dy_param_init[key]))
+            self.assertTrue(np.array_equal(value, dy_param_init[key]))
        for key, value in six.iteritems(static_param_updated):
-            self.assertTrue(
-                np.allclose(
-                    value, dy_param_updated[key], atol=1e-4))
+            self.assertTrue(np.array_equal(value, dy_param_updated[key]))


 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -76,6 +76,41 @@ class LayerTest(unittest.TestCase):


 class TestLayer(LayerTest):
+    def test_fc(self):
+        """nn.FC must match layers.fc in both static and dygraph mode."""
+        inp = np.ones([3, 32, 32], dtype='float32')
+        # Reference result: two stacked layers.fc calls in a static graph.
+        with self.static_graph():
+            t = layers.data(
+                name='data',
+                shape=[3, 32, 32],
+                dtype='float32',
+                append_batch_size=False)
+            ret = layers.fc(t, size=4, bias_attr=False, num_flatten_dims=1)
+            ret2 = layers.fc(ret, size=4)
+            static_ret = self.get_static_graph_result(
+                feed={'data': inp}, fetch_list=[ret2])[0]
+        # Same network built from nn.FC layer objects in a static graph.
+        with self.static_graph():
+            t = layers.data(
+                name='data',
+                shape=[3, 32, 32],
+                dtype='float32',
+                append_batch_size=False)
+            fc1 = nn.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1)
+            fc2 = nn.FC('fc2', size=4)
+            ret = fc1(t)
+            ret2 = fc2(ret)
+            static_ret2 = self.get_static_graph_result(
+                feed={'data': inp}, fetch_list=[ret2])[0]
+        # Same nn.FC layers executed in dynamic graph mode.
+        with self.dynamic_graph():
+            t = base.to_variable(inp)
+            fc1 = nn.FC('fc1', size=4, bias_attr=False, num_flatten_dims=1)
+            fc2 = nn.FC('fc2', size=4)
+            ret = fc1(t)
+            dy_ret = fc2(ret)
+
+        self.assertTrue(np.array_equal(static_ret, static_ret2))
+        self.assertTrue(np.array_equal(static_ret, dy_ret._numpy()))
+
+
    def test_layer_norm(self):
        inp = np.ones([3, 32, 32], dtype='float32')
        with self.static_graph():