diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9ad69738eb2ac21d6ff2624f11d17a38410d5c1f..b9d53cb50f2811c889423e79c53a3903ae578a08 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -47,34 +47,37 @@ find_package(Threads REQUIRED)
 
 include(simd)
 
-################################ Configurations #######################################
+################################ Exposed Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
-option(WITH_AMD_GPU     "Compile PaddlePaddle with AMD GPU"             OFF)
+option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
+option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
+option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
 option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
+option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
+option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
+option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
+option(ON_INFER         "Turn on inference optimization."               OFF)
+option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
+################################ Internal Configurations #######################################
+option(WITH_AMD_GPU     "Compile PaddlePaddle with AMD GPU"             OFF)
 option(WITH_NGRAPH      "Compile PaddlePaddle with nGraph support."     OFF)
-option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
-option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        OFF)
-option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
 option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler and gperftools"        OFF)
 option(WITH_JEMALLOC    "Compile PaddlePaddle with jemalloc"            OFF)
 option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
-option(WITH_DISTRIBUTE  "Compile with distributed support"              OFF)
 option(WITH_PSLIB       "Compile with pslib support"                    OFF)
 option(WITH_CONTRIB     "Compile the third-party contributation"        OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 # TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
-option(WITH_ANAKIN      "Compile with Anakin library"                   OFF)
 option(ANAKIN_BUILD_FAT_BIN "Build anakin cuda fat-bin lib for all device plantform, ignored when WITH_ANAKIN=OFF" OFF)
 option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plantform. ignored when WITH_ANAKIN=OFF" ON)
 option(WITH_GRPC     "Use grpc as the default rpc framework"            ${WITH_DISTRIBUTE})
-option(WITH_BRPC_RDMA     "Use brpc rdma as the rpc protocal"           OFF)
-option(ON_INFER         "Turn on inference optimization."               OFF)
-option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
-option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
+option(WITH_INFERENCE_API_TEST   "Test fluid inference C++ high-level api interface"  OFF)
+option(WITH_HIGH_LEVEL_API_TEST   "Test fluid python high-level api interface"  OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
 option(WITH_FAST_MATH   "Make use of fast math library, might affect the precision to some extent" ON)
+option(WITH_DGC   "Use DGC(Deep Gradient Compression) or not" ON)
 
 # PY_VERSION
 if(NOT PY_VERSION)
@@ -194,9 +197,14 @@ if(WITH_GPU)
     include(anakin_subgraph)
 endif()
 
-if(WITH_GPU AND NOT WIN32)
+if(WIN32 OR APPLE OR NOT WITH_GPU OR ON_INFER)
+    set(WITH_DGC OFF)
+endif()
+
+if(WITH_DGC)
     message(STATUS "add dgc lib.")
     include(external/dgc)
+    add_definitions(-DPADDLE_WITH_DGC)
 endif()
 
 if(WITH_MKL OR WITH_MKLML)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 69da9b98198de358348621ecdb444f2f81c7757f..09eb437aede4364f8aa285d5296f21cd8460fca1 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -221,6 +221,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
             -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
             -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
             -DCMAKE_INSTALL_LIBDIR=lib
+            -DBUILD_SHARED_LIBS=OFF
         CMAKE_CACHE_ARGS
             -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
             -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index b7c32f80db0dcb826f3f67ffb55da1c715785add..a7dce4dfdb530b13bea9df128694f0946714ccff 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -131,15 +131,6 @@ elseif (NOT CBLAS_FOUND OR WIN32)
             )
 endif ()
 
-if (WITH_GPU AND NOT WIN32)
-    set(dgc_dir "${FLUID_INSTALL_DIR}/third_party/install/dgc")
-    copy(dgc_lib
-            SRCS ${DGC_INSTALL_DIR}/lib ${DGC_INSTALL_DIR}/include
-            DSTS ${dgc_dir} ${dgc_dir}
-            DEPS dgc)
-endif()
-
-
 if (WITH_MKLDNN)
     set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
     copy(mkldnn_lib
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index d71d792b4e0147afc0ec09808d2004cb4b48ba46..7e3c8e6454a87bdcf94b6581487aadd510ff93f7 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -13,6 +13,7 @@ paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, d
 paddle.fluid.cuda_places (ArgSpec(args=['device_ids'], varargs=None, keywords=None, defaults=(None,)), ('document', '7d9a51fc9cf3c5245b5227080a8064c3'))
 paddle.fluid.cpu_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', '4c0cd83f0b401fc2ff84c70974e5d210'))
 paddle.fluid.cuda_pinned_places (ArgSpec(args=['device_count'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd0c3ebd813c39958c92b78e3eef7e912'))
+paddle.fluid.in_dygraph_mode (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'f06314a1cb30c96b5808dde2219c2dae'))
 paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03'))
 paddle.fluid.Executor.infer_from_dataset (ArgSpec(args=['self', 'program', 'dataset', 'scope', 'thread', 'debug', 'fetch_list', 'fetch_info', 'print_period'], varargs=None, keywords=None, defaults=(None, None, None, 0, False, None, None, 100)), ('document', '9c7decb955b9c4f718114179c8985581'))
@@ -117,6 +118,8 @@ paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name
 paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '66a622db727551761ce4eb73eaa7f6a4'))
 paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd50ac552b5d131468ed466d08bb2d38c'))
 paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'fcd8301a0ce15f219c7a4bcd0c1e8eca'))
+paddle.fluid.layers.reduce_all (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '646ca4d4a2cc16084f59de44b6927eca'))
+paddle.fluid.layers.reduce_any (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'f36661060aeeaf6c6b1331e41b3726fa'))
 paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3'))
 paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce'))
 paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6'))
@@ -124,7 +127,7 @@ paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed
 paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6'))
 paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2'))
 paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571'))
-paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '6e428384ce6a77207fa2c70d9f011990'))
+paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '35c6a241bcc1a1fc89508860d82ad62b'))
 paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'b4cbe1ac451005df6dad12e9ffdccca9'))
 paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd3570c02f71bcd78e60b3f31dc8f5b32'))
 paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 'aaba49c038ba927f0a8e45c0c9a686ab'))
@@ -142,7 +145,7 @@ paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'par
 paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '3f536aafba30d793287b52d231baff1b'))
 paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, True, False)), ('document', 'bce1b75e3d95b75cacd1099655cbb3c3'))
 paddle.fluid.layers.smooth_l1 (ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c6b175d253c55baf4b9c0eca9b1dda88'))
-paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None), ('document', '6148b6a555cbfb62fdcd030d8982c18c'))
+paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None), ('document', 'c87f620c15573442be7c84b50223b567'))
 paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', '3f6c828594720c9b2da89c464be94478'))
 paddle.fluid.layers.reshape (ArgSpec(args=['x', 'shape', 'actual_shape', 'act', 'inplace', 'name'], varargs=None, keywords=None, defaults=(None, None, False, None)), ('document', '323c019f257e55ddea4a824a362de62f'))
 paddle.fluid.layers.squeeze (ArgSpec(args=['input', 'axes', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3229d06517f794e86ca3da14c38b1465'))
@@ -155,10 +158,10 @@ paddle.fluid.layers.label_smooth (ArgSpec(args=['label', 'prior_dist', 'epsilon'
 paddle.fluid.layers.roi_pool (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale'], varargs=None, keywords=None, defaults=(1, 1, 1.0)), ('document', 'c317aa595deb31649083c8faa91cdb97'))
 paddle.fluid.layers.roi_align (ArgSpec(args=['input', 'rois', 'pooled_height', 'pooled_width', 'spatial_scale', 'sampling_ratio', 'name'], varargs=None, keywords=None, defaults=(1, 1, 1.0, -1, None)), ('document', '12c5bbb8b38c42e623fbc47611d766e1'))
 paddle.fluid.layers.dice_loss (ArgSpec(args=['input', 'label', 'epsilon'], varargs=None, keywords=None, defaults=(1e-05,)), ('document', '1ba0508d573f65feecf3564dce22aa1d'))
-paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', '7a1966d7c3a48f1fc0881cdaf5d83b0b'))
+paddle.fluid.layers.image_resize (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'resample', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, 'BILINEAR', None, True, 1)), ('document', 'd1b08c11bb9277386fcf6ae70b6622d1'))
 paddle.fluid.layers.image_resize_short (ArgSpec(args=['input', 'out_short_len', 'resample'], varargs=None, keywords=None, defaults=('BILINEAR',)), ('document', '06211aefc50c5a3e940d7204d859cdf7'))
-paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', 'e4fb4ed511b2293b8f04f7e872afbfd7'))
-paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', '735fa9758a6d7ff3b47d7b827f961c1d'))
+paddle.fluid.layers.resize_bilinear (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners', 'align_mode'], varargs=None, keywords=None, defaults=(None, None, None, None, True, 1)), ('document', 'c45591fbc4f64a178fbca219e1546a58'))
+paddle.fluid.layers.resize_nearest (ArgSpec(args=['input', 'out_shape', 'scale', 'name', 'actual_shape', 'align_corners'], varargs=None, keywords=None, defaults=(None, None, None, None, True)), ('document', 'ae6d73cdc7f3a138d8a338ecdb33c1ae'))
 paddle.fluid.layers.gather (ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None), ('document', '98f1c86716b9b7f4dda83f20e2adeee2'))
 paddle.fluid.layers.scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '65f8e9d8ddfd0b412f940579c4faa342'))
 paddle.fluid.layers.sequence_scatter (ArgSpec(args=['input', 'index', 'updates', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '15b522457dfef103f0c20ca9d397678b'))
@@ -203,6 +206,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'sha
 paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'a418e3ccb5e2ac21bd60f5cc221d5860'))
 paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '01dbb91e7c74cb11336cd531013de51a'))
 paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '17db0f814eb7bb5a3fac1ca6e60e16d8'))
+paddle.fluid.layers.rank (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'ee1386c42ecc8f424fe3fb21862fefc2'))
 paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cdcf20c494c92060d10feb9374532f42'))
 paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0eae3f726a4afe590757552fa3ced012'))
 paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b0daaa3fa4a0aa62f9b58c43d959eb25'))
@@ -235,8 +239,9 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg
 paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec'))
 paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
 paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
-paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', 'ad669cdf83e72a69ebc5ed79e36486de'))
+paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '731b21c62a4add60a33bd76d802ffc5c'))
 paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
+paddle.fluid.layers.continuous_value_model (ArgSpec(args=['input', 'cvm', 'use_cvm'], varargs=None, keywords=None, defaults=(True,)), ('document', 'a07a44c2bacdcd09c1f5f35a96a0514e'))
 paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
 paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
 paddle.fluid.layers.read_file (ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None), ('document', 'b0a1c2fc51c27a106da28f3308c41f5e'))
@@ -271,6 +276,8 @@ paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, de
 paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8'))
 paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292'))
 paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb'))
+paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '495e21e9a848c2d075a102802fc67756'))
+paddle.fluid.layers.zeros_like (ArgSpec(args=['x', 'out'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c7e4cfffc93ae89c8f6f53b6d650f923'))
 paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -280,7 +287,11 @@ paddle.fluid.layers.increment (ArgSpec(args=['x', 'value', 'in_place'], varargs=
 paddle.fluid.layers.array_write (ArgSpec(args=['x', 'i', 'array'], varargs=None, keywords=None, defaults=(None,)), ('document', '40b6d15f4c86b2b09df340d7778ad713'))
 paddle.fluid.layers.create_array (ArgSpec(args=['dtype'], varargs=None, keywords=None, defaults=None), ('document', '2d4f20087080ba5105b55205ad5c5b6a'))
 paddle.fluid.layers.less_than (ArgSpec(args=['x', 'y', 'force_cpu', 'cond'], varargs=None, keywords=None, defaults=(None, None)), ('document', '067bbc799c66289ca8b8924c26b6673f'))
+paddle.fluid.layers.less_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd6b173ae1a149e0bdfe7b8bf69285957'))
+paddle.fluid.layers.greater_than (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '2c9bd414caa6c615539018d27001b44c'))
+paddle.fluid.layers.greater_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '62c667d24e7b07e166b47a53b61b2ff4'))
 paddle.fluid.layers.equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '80c29b1dc64718f0116de90d1ac88a77'))
+paddle.fluid.layers.not_equal (ArgSpec(args=['x', 'y', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', '56148fb1024687a08e96af79bdc5c929'))
 paddle.fluid.layers.array_read (ArgSpec(args=['array', 'i'], varargs=None, keywords=None, defaults=None), ('document', 'dd68bead34dfbaf6b0a163fc1cc3c385'))
 paddle.fluid.layers.array_length (ArgSpec(args=['array'], varargs=None, keywords=None, defaults=None), ('document', 'ffb8b9578ec66db565b223d313aa82a2'))
 paddle.fluid.layers.IfElse.__init__ (ArgSpec(args=['self', 'cond', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -296,12 +307,12 @@ paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs
 paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655'))
 paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f'))
 paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7'))
-paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
-paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', '72530f299d6451a567cf4a12dc3fb1ff'))
+paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'df6ceab6e6c9bd31e97914d7e7538137'))
+paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7'))
+paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '903387ec11f3d0bf46821d31a68cffa5'))
+paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '252890d4c3199a7623ab8667e13fd837'))
+paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '7a0000520f179f35239956a5ba55119f'))
 paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a'))
 paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732'))
 paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519'))
@@ -313,6 +324,7 @@ paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=Non
 paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
 paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
 paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9e27491c39ac74d0b1ffe506aec0ebb'))
+paddle.fluid.layers.rsqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c445467ebe58b3c0d7f0bba7795b6f56'))
 paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd'))
 paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
 paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
@@ -325,13 +337,13 @@ paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywor
 paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '48dfb45d773dbc30126c3a7f777de5ee'))
 paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '459c5781e9d1dd88283b7c5769d7872a'))
 paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '80846bcd4bd457207457a6d5411f4148'))
-paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', '308b619af849caa82bbc31e897f5e641'))
+paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', 'a8c4e972b7d6742c838a37abf407ed9a'))
 paddle.fluid.layers.hard_shrink (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c142f5884f3255e0d6075c286bbd531e'))
 paddle.fluid.layers.cumsum (ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '944d7c03057f5fc88bc78acd4d82f926'))
 paddle.fluid.layers.thresholded_relu (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '90566ea449ea4c681435546e2f70610a'))
 paddle.fluid.layers.prior_box (ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)), ('document', '14cac0ee643fa6e026ad82aeeee75bd8'))
 paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)), ('document', 'a0d762bb08de9ce93bc780aa57cd5cd9'))
-paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'a6ab47a2fe681e52fabb7057ddf0efdd'))
+paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'fe9afaee481dd09f28866df22756466f'))
 paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '3ddb9b966f193900193a95a3df77c3c1'))
 paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c0b334f917828f95056f6ebe10907b1c'))
 paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'c33093a82a46e3091e789e5572588db1'))
@@ -346,7 +358,7 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes',
 paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1'))
 paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e'))
 paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b'))
-paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '57fa96922e42db8f064c3fb77f2255e8'))
+paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '4d170807a13d33925d1049d2892832bf'))
 paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340'))
 paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
 paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
@@ -360,8 +372,7 @@ paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_st
 paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', '882634f420f626642f0874481263da40'))
 paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'c717d9d1d78a53c809d01b8bc56f3cae'))
 paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28'))
-paddle.fluid.layers.append_LARS (ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None), ('document', 'd24fa1e7d62ac8a534fc6a86002f84f8'))
-paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '9588c64c26ffaef3c466e404a6af9d9b'))
+paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', 'f8b2727bccf0f368c997d7cf05847e49'))
 paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', '2ef3f5ca5cd71ea4217c418e5a7a0565'))
 paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index e4e9861e37a4334220d5e39a5b44afafd668b7c3..0291b6f66a9e8cb6a3c16530084d3e3e7a6c39c1 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -72,7 +72,6 @@ bool DataFeed::PickOneFile(std::string* filename) {
   }
   VLOG(3) << "file_idx_=" << *file_idx_;
   *filename = filelist_[(*file_idx_)++];
-  // LOG(ERROR) << "pick file:" << *filename;
   return true;
 }
 
@@ -242,6 +241,11 @@ void InMemoryDataFeed<T>::SetTrainerNum(int trainer_num) {
   trainer_num_ = trainer_num;
 }
 
+template <typename T>
+void InMemoryDataFeed<T>::SetFleetSendBatchSize(int64_t size) {
+  fleet_send_batch_size_ = size;
+}
+
 template <typename T>
 void InMemoryDataFeed<T>::PutInsToChannel(const std::string& ins_str) {
 #ifdef _LINUX
@@ -361,8 +365,13 @@ void InMemoryDataFeed<T>::GlobalShuffle() {
   VLOG(3) << "GlobalShuffle() begin, thread_id=" << thread_id_;
   auto fleet_ptr = FleetWrapper::GetInstance();
   std::vector<std::vector<T*>> send_vec(trainer_num_);
+  std::vector<int> send_index(trainer_num_);
+  uint64_t reserve_len = fleet_send_batch_size_ / trainer_num_;
   for (auto& vec : send_vec) {
-    vec.reserve(fleet_send_batch_size_);
+    vec.reserve(reserve_len);
+  }
+  for (int i = 0; i < trainer_num_; ++i) {
+    send_index[i] = i;
   }
   std::vector<std::future<int32_t>> total_status;
   auto interval = GetMemoryDataInterval();
@@ -375,7 +384,10 @@ void InMemoryDataFeed<T>::GlobalShuffle() {
     int64_t node_id = random_num % trainer_num_;
     send_vec[node_id].push_back(&((*memory_data_)[i]));
     if (i % fleet_send_batch_size_ == 0 && i != 0) {
-      for (int j = 0; j < send_vec.size(); ++j) {
+      // shuffle the sequence of sending to avoid network timeout error
+      std::random_shuffle(send_index.begin(), send_index.end());
+      for (int index = 0; index < send_index.size(); ++index) {
+        int j = send_index[index];
         std::string send_str;
         SerializeIns(send_vec[j], &send_str);
         VLOG(3) << "send str_length=" << send_str.length()
@@ -388,7 +400,10 @@ void InMemoryDataFeed<T>::GlobalShuffle() {
       }
     }
   }
-  for (int j = 0; j < send_vec.size(); ++j) {
+  // shuffle the sequence of sending to avoid network timeout error
+  std::random_shuffle(send_index.begin(), send_index.end());
+  for (int index = 0; index < send_index.size(); ++index) {
+    int j = send_index[index];
     if (send_vec[j].size() != 0) {
       std::string send_str;
       SerializeIns(send_vec[j], &send_str);
@@ -450,6 +465,17 @@ void MultiSlotDataFeed::Init(
     if (slot.is_used()) {
       use_slots_.push_back(all_slots_[i]);
       use_slots_is_dense_.push_back(slot.is_dense());
+      std::vector<int> local_shape;
+      if (slot.is_dense()) {
+        // for batch size holder if is_dense
+        if (slot.shape(0) > 0) {
+          local_shape.push_back(0);
+        }
+      }
+      for (size_t i = 0; i < slot.shape_size(); ++i) {
+        local_shape.push_back(slot.shape(i));
+      }
+      use_slots_shape_.push_back(local_shape);
     }
   }
   feed_vec_.resize(use_slots_.size());
@@ -505,7 +531,7 @@ bool MultiSlotDataFeed::CheckFile(const char* filename) {
     char* endptr = const_cast<char*>(str);
     int len = line.length();
     for (size_t i = 0; i < all_slots_.size(); ++i) {
-      int num = strtol(endptr, &endptr, 10);
+      auto num = strtol(endptr, &endptr, 10);
       if (num < 0) {
         VLOG(0) << "error: the number of ids is a negative number: " << num;
         VLOG(0) << "please check line<" << instance_cout << "> in file<"
@@ -736,8 +762,8 @@ void MultiSlotDataFeed::PutToFeedVec(
     LoD data_lod{offset};
     feed_vec_[i]->set_lod(data_lod);
     if (use_slots_is_dense_[i]) {
-      int dim = total_instance / batch_size_;
-      feed_vec_[i]->Resize({batch_size_, dim});
+      use_slots_shape_[i][0] = batch_size_;
+      feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
     }
   }
 #endif
@@ -769,6 +795,16 @@ void MultiSlotInMemoryDataFeed::Init(
     if (slot.is_used()) {
       use_slots_.push_back(all_slots_[i]);
       use_slots_is_dense_.push_back(slot.is_dense());
+      std::vector<int> local_shape;
+      if (slot.is_dense()) {
+        if (slot.shape(0) > 0) {
+          local_shape.push_back(0);
+        }
+      }
+      for (size_t i = 0; i < slot.shape_size(); ++i) {
+        local_shape.push_back(slot.shape(i));
+      }
+      use_slots_shape_.push_back(local_shape);
     }
   }
   feed_vec_.resize(use_slots_.size());
@@ -924,8 +960,8 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
     LoD data_lod{offset};
     feed_vec_[i]->set_lod(data_lod);
     if (use_slots_is_dense_[i]) {
-      int dim = total_instance / batch_size_;
-      feed_vec_[i]->Resize({batch_size_, dim});
+      use_slots_shape_[i][0] = batch_size_;
+      feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
     }
   }
 #endif
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
index 8ea09b65ddd569e8ca8e24ba3b2416666d0eec92..d098c7858a98c644bd3cad78d3cf1e3b35ca026b 100644
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@@ -94,6 +94,8 @@ class DataFeed {
   virtual void SetThreadNum(int thread_num) {}
   // This function will do nothing at default
   virtual void SetTrainerNum(int trainer_num) {}
+  // This function will do nothing at default
+  virtual void SetFleetSendBatchSize(int64_t size) {}
   virtual void SetFileListMutex(std::mutex* mutex) {
     mutex_for_pick_file_ = mutex;
   }
@@ -140,6 +142,7 @@ class DataFeed {
   // object)
   std::vector<std::string> all_slots_;
   std::vector<std::string> all_slots_type_;
+  std::vector<std::vector<int>> use_slots_shape_;
   std::vector<int>
       use_slots_index_;  // -1: not used; >=0: the index of use_slots_
 
@@ -212,6 +215,7 @@ class InMemoryDataFeed : public PrivateQueueDataFeed<T> {
   virtual void SetThreadId(int thread_id);
   virtual void SetThreadNum(int thread_num);
   virtual void SetTrainerNum(int trainer_num);
+  virtual void SetFleetSendBatchSize(int64_t size);
   virtual void PutInsToChannel(const std::string& ins_str);
   virtual void FillMemoryDataToChannel();
   virtual void FillChannelToMemoryData();
diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto
index 77911306299b77748a2ad9437d49680748885003..03996e0e20a1729ee300a5ad37abc325876930b7 100644
--- a/paddle/fluid/framework/data_feed.proto
+++ b/paddle/fluid/framework/data_feed.proto
@@ -19,6 +19,7 @@ message Slot {
   required string type = 2;
   optional bool is_dense = 3 [ default = false ];
   optional bool is_used = 4 [ default = false ];
+  repeated int32 shape = 5; // we can define N-D Tensor
 }
 
 message MultiSlotDesc { repeated Slot slots = 1; }
diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc
index 600fc74710023c340a7b43053a38e1d82a11c976..a3b7b1e454ecec9da766b9b156c31b1317bb9d35 100644
--- a/paddle/fluid/framework/data_set.cc
+++ b/paddle/fluid/framework/data_set.cc
@@ -64,6 +64,17 @@ void DatasetImpl<T>::SetTrainerNum(int trainer_num) {
   }
 }
 
+// if you run distributed, and want to do global shuffle,
+// set this before global shuffle.
+// be sure you call CreateReaders before SetFleetSendBatchSize
+template <typename T>
+void DatasetImpl<T>::SetFleetSendBatchSize(int64_t size) {
+  fleet_send_batch_size_ = size;
+  for (auto reader : readers_) {
+    reader->SetFleetSendBatchSize(size);
+  }
+}
+
 template <typename T>
 void DatasetImpl<T>::SetHdfsConfig(const std::string& fs_name,
                                    const std::string& fs_ugi) {
diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h
index 6fd3fcad28fa045326032200b7f26a18862454f4..bbe0f937abfa635b126062059abfcfb70adb996e 100644
--- a/paddle/fluid/framework/data_set.h
+++ b/paddle/fluid/framework/data_set.h
@@ -47,6 +47,8 @@ class Dataset {
   virtual void SetThreadNum(int thread_num) = 0;
   // set workers' num
   virtual void SetTrainerNum(int trainer_num) = 0;
+  // set fleet send batch size
+  virtual void SetFleetSendBatchSize(int64_t size) = 0;
   // set fs name and ugi
   virtual void SetHdfsConfig(const std::string& fs_name,
                              const std::string& fs_ugi) = 0;
@@ -59,6 +61,8 @@ class Dataset {
   virtual int GetThreadNum() = 0;
   // get worker num
   virtual int GetTrainerNum() = 0;
+  // get fleet send batch size
+  virtual int64_t GetFleetSendBatchSize() = 0;
   // get hdfs config
   virtual std::pair<std::string, std::string> GetHdfsConfig() = 0;
   // get data fedd desc
@@ -98,6 +102,7 @@ class DatasetImpl : public Dataset {
   virtual void SetFileList(const std::vector<std::string>& filelist);
   virtual void SetThreadNum(int thread_num);
   virtual void SetTrainerNum(int trainer_num);
+  virtual void SetFleetSendBatchSize(int64_t size);
   virtual void SetHdfsConfig(const std::string& fs_name,
                              const std::string& fs_ugi);
   virtual void SetDataFeedDesc(const std::string& data_feed_desc_str);
@@ -105,6 +110,7 @@ class DatasetImpl : public Dataset {
   virtual const std::vector<std::string>& GetFileList() { return filelist_; }
   virtual int GetThreadNum() { return thread_num_; }
   virtual int GetTrainerNum() { return trainer_num_; }
+  virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; }
   virtual std::pair<std::string, std::string> GetHdfsConfig() {
     return std::make_pair(fs_name_, fs_ugi_);
   }
@@ -137,6 +143,7 @@ class DatasetImpl : public Dataset {
   std::string fs_name_;
   std::string fs_ugi_;
   unsigned int rand_seed;
+  int64_t fleet_send_batch_size_;
 };
 
 // use std::vector<MultiSlotType> as data type
diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt
index 2c1f3ae638cf95c3ab49219909fe3b1f22137099..51231b981bf2dae7b038e808f52c78917a993928 100644
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -24,15 +24,19 @@ if(WITH_DISTRIBUTE)
     endif()
 endif()
 
+set(all_reduce_deps all_reduce_op_handle)
 if(WITH_GPU)
-    set(dgc_deps "")
-    if(NOT WIN32)
-        set(dgc_deps dgc)
-    endif()
     nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
-            dynload_cuda variable_visitor ${dgc_deps})
+            dynload_cuda variable_visitor)
     nv_library(fused_all_reduce_op_handle SRCS fused_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
             dynload_cuda variable_visitor)
+
+    if(WITH_DGC)
+        nv_library(sparse_all_reduce_op_handle SRCS sparse_all_reduce_op_handle.cc DEPS op_handle_base scope 
+            lod_tensor ddim memory dynload_cuda variable_visitor dgc all_reduce_op_handle)
+        set(all_reduce_deps sparse_all_reduce_op_handle)
+    endif()
+
     if(WITH_DISTRIBUTE)
         nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope
             ddim dynload_cuda selected_rows_functor sendrecvop_rpc)
@@ -80,7 +84,7 @@ cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS grap
 cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)
 
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
-        scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
+        scale_loss_grad_op_handle rpc_op_handle fetch_barrier_op_handle ${all_reduce_deps} reduce_op_handle broadcast_op_handle fused_broadcast_op_handle)
 
 cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph graph_helper fused_all_reduce_op_handle)
 
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc
index ed75b48090b27a9c430afe067467a1a39d711938..c9f06c64e447bfbcfeadbb29a5682c8b5b5085a0 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.cc
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -17,11 +17,6 @@
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
 #include "paddle/fluid/framework/operator.h"
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-#include "dgc/dgc.h"
-#endif
-
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -40,14 +35,11 @@ namespace details {
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
                                      const std::vector<platform::Place> &places,
-                                     const platform::NCCLContextMap *ctxs,
-                                     bool is_encoded, int nranks)
+                                     const platform::NCCLContextMap *ctxs)
     : OpHandleBase(node),
       local_scopes_(local_scopes),
       places_(places),
-      nccl_ctxs_(ctxs),
-      is_encoded_(is_encoded),
-      nranks_(nranks) {
+      nccl_ctxs_(ctxs) {
   if (nccl_ctxs_) {
     for (auto &p : places_) {
       this->SetDeviceContext(p, nccl_ctxs_->DevCtx(p));
@@ -62,92 +54,8 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-void AllReduceOpHandle::RunImplEncoded() {
-  platform::RecordEvent record_event(Name());
-
-  WaitInputVarGenerated();
-
-  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
-  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), places_.size(),
-      "The NoDummyInputSize should be equal to the number of places.");
-  PADDLE_ENFORCE_EQ(
-      in_var_handles.size(), out_var_handles.size(),
-      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
-
-  std::vector<const LoDTensor *> ins;
-  std::vector<LoDTensor *> outs;
-  int k = -1;
-  for (size_t i = 0; i < local_scopes_.size(); ++i) {
-    auto &local_scope =
-        local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
-    auto original_name =
-        paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
-    auto encode_var_name = original_name + g_dgc_encoded;
-    auto *in_var = local_scope->FindVar(encode_var_name);
-    PADDLE_ENFORCE_NOT_NULL(in_var);
-    auto &in = in_var->Get<LoDTensor>();
-    ins.emplace_back(&in);
-
-    auto *out = local_scope->FindVar(out_var_handles[i]->name())
-                    ->GetMutable<LoDTensor>();
-    outs.emplace_back(out);
-
-    if (k < 0) {
-      k = GetKValue(in_var_handles[i]->name());
-    }
-  }
-
-  PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
-  PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
-  PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
-
-  int dtype = -1;
-  size_t in_numel = 0;
-  size_t out_numel = 0;
-  PADDLE_ENFORCE(nranks_ > 1);
-  std::vector<std::function<void()>> all_reduce_calls;
-
-  for (size_t i = 0; i < local_scopes_.size(); ++i) {
-    auto &place = places_[i];
-    auto &in = *ins[i];
-    void *in_tensor_buf = const_cast<void *>(in.data<void>());
-
-    auto &out = *outs[i];
-    float *out_tensor_buf = out.data<float>();
-
-    dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
-    in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
-    PADDLE_ENFORCE(in_numel % 2 == 0);
-    PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
-    out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
-
-    int dev_id = boost::get<platform::CUDAPlace>(place).device;
-    auto &nccl_ctx = nccl_ctxs_->at(dev_id);
-    auto stream = nccl_ctx.stream();
-    auto comm = nccl_ctx.comm_;
-
-    auto &allocator =
-        platform::DeviceTemporaryAllocator::Instance().Get(place, stream);
-    int encode_size = 2 * k * sizeof(int);
-    // dgc use ncclAllGather to get all the encoded data
-    // so the buffer need nranks.
-    int buf_size = nranks_ * encode_size;
-    auto tmp_ious_data = allocator.Allocate(buf_size);
-    void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
-
-    VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
-             << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size
-             << ", k:" << k << ", place:" << place << ", dtype:" << dtype;
-
-    all_reduce_calls.emplace_back([=] {
-      PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce(
-          in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm,
-          stream));
-    });
-  }
-
+void AllReduceOpHandle::RunAllReduceFuncs(
+    const std::vector<std::function<void()>> &all_reduce_calls) {
   this->RunAndRecordEvent([&] {
     if (all_reduce_calls.size() == 1UL) {
       // Do not use NCCLGroup when manage NCCL by per thread per device
@@ -178,68 +86,9 @@ void AllReduceOpHandle::RunImplEncoded() {
     }
   }
 }
-
-int AllReduceOpHandle::GetKValue(const std::string &grad_name) {
-  auto original_name = paddle::framework::GradOriginalVarName(grad_name);
-  auto var_name = original_name + g_dgc_k;
-  PADDLE_ENFORCE(local_scopes_.size() > 0);
-
-  auto *scope = local_scopes_[0];
-  auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
-  auto var = local_scope->FindVar(var_name);
-  PADDLE_ENFORCE_NOT_NULL(var);
-  auto tensor = var->Get<LoDTensor>().data<float>();
-  return *tensor;
-}
-#endif
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-bool AllReduceOpHandle::IsEncoded() {
-  if (!is_encoded_) {
-    return false;
-  }
-  auto counter_name = g_dgc_counter_name;
-  auto step_name = g_dgc_rampup_begin_step;
-  PADDLE_ENFORCE(local_scopes_.size() > 0);
-
-  auto *scope = local_scopes_[0];
-  auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
-  auto count_var = local_scope->FindVar(counter_name);
-  auto step_var = local_scope->FindVar(step_name);
-  if (count_var == nullptr || step_var == nullptr) {
-    PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
-                 step_var);
-  }
-
-  float count = *count_var->Get<LoDTensor>().data<float>();
-  float step = *step_var->Get<LoDTensor>().data<float>();
-  if (static_cast<int>(count) < static_cast<int>(step)) {
-    VLOG(10) << "in all_reduce currentstep:" << count
-             << " < rampup_begin_step:" << step
-             << " so not use sparse all reduce";
-    return false;
-  }
-
-  return true;
-}
-#else
-bool AllReduceOpHandle::IsEncoded() { return false; }
 #endif
 
 void AllReduceOpHandle::RunImpl() {
-  if (!IsEncoded()) {
-    RunImplNormal();
-    return;
-  }
-
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  RunImplEncoded();
-#else
-  PADDLE_THROW("Not compiled with CUDA");
-#endif
-}
-
-void AllReduceOpHandle::RunImplNormal() {
   platform::RecordEvent record_event(Name());
 
   WaitInputVarGenerated();
@@ -300,27 +149,7 @@ void AllReduceOpHandle::RunImplNormal() {
             comm, stream));
       });
     }
-    this->RunAndRecordEvent([&] {
-      if (all_reduce_calls.size() == 1UL) {
-        // Do not use NCCLGroup when manage NCCL by per thread per device
-        all_reduce_calls[0]();
-      } else {
-        platform::NCCLGroupGuard guard;
-        for (auto &call : all_reduce_calls) {
-          call();
-        }
-      }
-    });
-
-    if (FLAGS_sync_nccl_allreduce) {
-      for (auto &p : places_) {
-        int dev_id = boost::get<platform::CUDAPlace>(p).device;
-        auto &nccl_ctx = nccl_ctxs_->at(dev_id);
-        auto stream = nccl_ctx.stream();
-        cudaStreamSynchronize(stream);
-      }
-    }
-
+    RunAllReduceFuncs(all_reduce_calls);
 #else
     PADDLE_THROW("Not compiled with CUDA");
 #endif
diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h
index ca75186f6ceed3e48fe9326e85738d91bde0ca70..3effd0a8517212fdcffc754ba8ab96028f03eaac 100644
--- a/paddle/fluid/framework/details/all_reduce_op_handle.h
+++ b/paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -28,19 +28,12 @@ namespace paddle {
 namespace framework {
 namespace details {
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-constexpr char g_dgc_counter_name[] = "__g_dgc_counter__";
-constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__";
-constexpr char g_dgc_encoded[] = "__dgc_encoded__";
-constexpr char g_dgc_k[] = "__dgc_k__";
-#endif
-
-struct AllReduceOpHandle : public OpHandleBase {
+class AllReduceOpHandle : public OpHandleBase {
+ public:
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
-                    const platform::NCCLContextMap *ctxs,
-                    bool is_encoded = false, int nranks = -1);
+                    const platform::NCCLContextMap *ctxs);
 #else
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places);
@@ -54,18 +47,13 @@ struct AllReduceOpHandle : public OpHandleBase {
  protected:
   void RunImpl() override;
 
- private:
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
 #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-  void RunImplEncoded();
+  void RunAllReduceFuncs(
+      const std::vector<std::function<void()>> &all_reduce_calls);
   const platform::NCCLContextMap *nccl_ctxs_;
-  bool is_encoded_{false};
-  int nranks_{-1};
-  int GetKValue(const std::string &grad_name);
 #endif
-  void RunImplNormal();
-  bool IsEncoded();
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
index 8e8258ffb124e5008954a455264f5c0bc5cabc37..58ec427859e9f0ec4d29cc419f5bfe382e245852 100644
--- a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
+++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.cc
@@ -12,17 +12,18 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h"
 #include <algorithm>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
-
 #include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
 #include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/op_registry.h"
 
-DEFINE_uint32(fuse_parameter_memory_size, 0,  // 0 KB
+DEFINE_uint64(fuse_parameter_memory_size, 0,  // 0 KB
               "fuse_parameter_memory_size is up limited memory size "
               "of one group parameters' gradient which is the input "
               "of communication calling(e.g NCCLAllReduce). "
@@ -40,355 +41,365 @@ DEFINE_int32(
 namespace paddle {
 namespace framework {
 namespace details {
+// SetFuseParameterGroupsSize and SetFuseParameterMemorySize are used in unit
+// test, because it is invalid that seting 'FLAGS_fuse_parameter_memory_size'
+// and 'FLAGS_fuse_parameter_groups_size' in unit test.
+void SetFuseParameterGroupsSize(int group_size) {
+  FLAGS_fuse_parameter_groups_size = group_size;
+}
 
-static const char kUnKnow[] = "@UNKNOW@";
-static framework::proto::VarType::Type kDefaultDtype =
-    framework::proto::VarType::Type::VarType_Type_BOOL;
+int GetFuseParameterGroupsSize() { return FLAGS_fuse_parameter_groups_size; }
 
-class AllocContinuousSpaceForGradPass : public ir::Pass {
- protected:
-  void ApplyImpl(ir::Graph *graph) const override {
-    ir::Graph &result = *graph;
+void SetFuseParameterMemorySize(uint64_t memory_size) {
+  FLAGS_fuse_parameter_memory_size = memory_size;
+}
 
-    auto &places = Get<const std::vector<platform::Place>>(kPlaces);
-    auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
+uint64_t GetFuseParameterMemorySize() {
+  return FLAGS_fuse_parameter_memory_size;
+}
 
-    ResetAttribute<ParamsAndGrads>(kParamsAndGrads, &result);
-    ResetAttribute<GroupGradsAndParams>(kGroupGradsAndParams, &result);
+static const char kUnKnow[] = "@UNKNOW@";
+static framework::proto::VarType::Type kDefaultDtype =
+    framework::proto::VarType::Type::VarType_Type_BOOL;
 
-    // NOTE: The operator nodes should be in topology order.
-    std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
-    auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
-    for (auto &node : topo_nodes) {
-      RecordParamsAndGrads(node, &params_grads);
-    }
+void AllocContinuousSpaceForGradPass::ApplyImpl(ir::Graph *graph) const {
+  ir::Graph &result = *graph;
 
-    if (params_grads.size() == 0) {
-      VLOG(10) << "Doesn't find gradients";
-      return;
-    }
+  auto &places = Get<const std::vector<platform::Place>>(kPlaces);
+  auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
 
-    std::unordered_map<std::string, ir::Node *> vars;
-    for (ir::Node *node : result.Nodes()) {
-      if (node->IsVar() && node->Var()) {
-        // Note: The graph may have the same name node. For example, parameter
-        // is the input of operator and it also is the output of optimizer;
-        vars.emplace(node->Var()->Name(), node);
-      }
-    }
+  ResetAttribute<ParamsAndGrads>(kParamsAndGrads, &result);
+  ResetAttribute<GroupGradsAndParams>(kGroupGradsAndParams, &result);
 
-    auto &group_grads_params =
-        result.Get<GroupGradsAndParams>(kGroupGradsAndParams);
+  // NOTE: The operator nodes should be in topology order.
+  std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
+  auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
+  for (auto &node : topo_nodes) {
+    RecordParamsAndGrads(node, &params_grads);
+  }
 
-    // Note: the order of params_grads may be changed by SetGroupGradsAndParams.
-    SetGroupGradsAndParams(vars, params_grads, &group_grads_params);
+  if (params_grads.size() == 0) {
+    VLOG(10) << "Doesn't find gradients";
+    return;
+  }
 
-    params_grads.clear();
-    for (auto &group_p_g : group_grads_params) {
-      params_grads.insert(params_grads.begin(), group_p_g.begin(),
-                          group_p_g.end());
-    }
-    for (auto &p_g : params_grads) {
-      std::swap(p_g.first, p_g.second);
+  std::unordered_map<std::string, ir::Node *> vars;
+  for (ir::Node *node : result.Nodes()) {
+    if (node->IsVar() && node->Var()) {
+      // Note: The graph may have the same name node. For example, parameter
+      // is the input of operator and it also is the output of optimizer;
+      vars.emplace(node->Var()->Name(), node);
     }
+  }
 
-    // Set Gradients as Persistable to prevent this var becoming reusable.
-    auto dtype = kDefaultDtype;
-    for (auto &p_g : params_grads) {
-      // Get gradient var
-      auto iter = vars.find(p_g.second);
-      PADDLE_ENFORCE(iter != vars.end(), "%s is not found.", p_g.second);
-      iter->second->Var()->SetPersistable(true);
-
-      PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType()));
+  auto &group_grads_params =
+      result.Get<GroupGradsAndParams>(kGroupGradsAndParams);
 
-      // Get Dtype
-      auto ele_dtype = iter->second->Var()->GetDataType();
-      if (dtype == kDefaultDtype) {
-        dtype = ele_dtype;
-        PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype,
-                          "The data type should not be bool.");
-      }
-      PADDLE_ENFORCE_EQ(ele_dtype, dtype,
-                        "The data type of input is not consistent.");
-    }
+  // Note: the order of params_grads may be changed by SetGroupGradsAndParams.
+  SetGroupGradsAndParams(vars, params_grads, &group_grads_params);
 
-    // Create a FusedVarsSet to avoid duplicating names for fused_var in other
-    // pass.
-    if (!result.Has(kFusedVars)) {
-      result.Set(kFusedVars, new FusedVars);
-    }
-    // the kFusedGrads is used be fuse_optimizer_op_pass.
-    result.Set(kFusedGrads, new FusedGrads);
-
-    // the fused_var_name should be unique, so it appends
-    // params_grads.begin()->second.
-    auto fused_var_name = std::string(kFusedVarNamePrefix) + "@GRAD@" +
-                          params_grads.begin()->second;
-    result.Get<FusedGrads>(kFusedGrads) = fused_var_name;
-    auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
-    PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0,
-                      "%s is duplicate in FusedVars.", fused_var_name);
-    fused_var_set.insert(fused_var_name);
-
-    InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars,
-                                      fused_var_name, params_grads);
+  params_grads.clear();
+  for (auto &group_p_g : group_grads_params) {
+    params_grads.insert(params_grads.begin(), group_p_g.begin(),
+                        group_p_g.end());
+  }
+  for (auto &p_g : params_grads) {
+    std::swap(p_g.first, p_g.second);
   }
 
-  template <typename AttrType>
-  void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const {
-    if (graph->Has(attr_name)) {
-      VLOG(10) << attr_name << " is reset.";
-      graph->Erase(attr_name);
+  // Set Gradients as Persistable to prevent this var becoming reusable.
+  auto dtype = kDefaultDtype;
+  for (auto &p_g : params_grads) {
+    // Get gradient var
+    auto iter = vars.find(p_g.second);
+    PADDLE_ENFORCE(iter != vars.end(), "%s is not found.", p_g.second);
+    iter->second->Var()->SetPersistable(true);
+
+    PADDLE_ENFORCE(IsSupportedVarType(iter->second->Var()->GetType()));
+
+    // Get Dtype
+    auto ele_dtype = iter->second->Var()->GetDataType();
+    if (dtype == kDefaultDtype) {
+      dtype = ele_dtype;
+      PADDLE_ENFORCE_NE(ele_dtype, kDefaultDtype,
+                        "The data type should not be bool.");
     }
-    graph->Set(attr_name, new AttrType);
+    PADDLE_ENFORCE_EQ(ele_dtype, dtype,
+                      "The data type of input is not consistent.");
   }
 
-  void SetGroupGradsAndParams(
-      const std::unordered_map<std::string, ir::Node *> &var_nodes,
-      const ParamsAndGrads &params_grads,
-      GroupGradsAndParams *group_grads_params) const {
-    SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params);
-    SetGroupAccordingToMemorySize(var_nodes, group_grads_params);
-    SetGroupAccordingToGroupSize(var_nodes, group_grads_params);
+  // Create a FusedVarsSet to avoid duplicating names for fused_var in other
+  // pass.
+  if (!result.Has(kFusedVars)) {
+    result.Set(kFusedVars, new FusedVars);
   }
-
-  void SetGroupAccordingToLayers(
-      const std::unordered_map<std::string, ir::Node *> &var_nodes,
-      const ParamsAndGrads &params_grads,
-      GroupGradsAndParams *group_grads_params) const {
-    std::unordered_map<std::string, std::vector<int>> layer_params;
-
-    for (size_t i = 0; i < params_grads.size(); ++i) {
-      auto pos = params_grads[i].first.find_first_of(".");
-      if (pos == std::string::npos) {
-        layer_params[std::string(kUnKnow)].emplace_back(i);
-      } else {
-        layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i);
-      }
+  // the kFusedGrads is used be fuse_optimizer_op_pass.
+  result.Set(kFusedGrads, new FusedGrads);
+
+  // the fused_var_name should be unique, so it appends
+  // params_grads.begin()->second.
+  auto fused_var_name = std::string(kFusedVarNamePrefix) + "@GRAD@" +
+                        params_grads.begin()->second;
+  result.Get<FusedGrads>(kFusedGrads) = fused_var_name;
+  auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
+  PADDLE_ENFORCE_EQ(fused_var_set.count(fused_var_name), 0,
+                    "%s is duplicate in FusedVars.", fused_var_name);
+  fused_var_set.insert(fused_var_name);
+
+  InitFusedVarsAndAllocSpaceForVars(places, local_scopes, vars, fused_var_name,
+                                    params_grads);
+}
+
+template <typename AttrType>
+void AllocContinuousSpaceForGradPass::ResetAttribute(
+    const std::string &attr_name, ir::Graph *graph) const {
+  if (graph->Has(attr_name)) {
+    VLOG(10) << attr_name << " is reset.";
+    graph->Erase(attr_name);
+  }
+  graph->Set(attr_name, new AttrType);
+}
+
+void AllocContinuousSpaceForGradPass::SetGroupGradsAndParams(
+    const std::unordered_map<std::string, ir::Node *> &var_nodes,
+    const ParamsAndGrads &params_grads,
+    GroupGradsAndParams *group_grads_params) const {
+  SetGroupAccordingToLayers(var_nodes, params_grads, group_grads_params);
+  SetGroupAccordingToMemorySize(var_nodes, group_grads_params);
+  SetGroupAccordingToGroupSize(var_nodes, group_grads_params);
+}
+
+void AllocContinuousSpaceForGradPass::SetGroupAccordingToLayers(
+    const std::unordered_map<std::string, ir::Node *> &var_nodes,
+    const ParamsAndGrads &params_grads,
+    GroupGradsAndParams *group_grads_params) const {
+  std::unordered_map<std::string, std::vector<int>> layer_params;
+
+  for (size_t i = 0; i < params_grads.size(); ++i) {
+    auto pos = params_grads[i].first.find_first_of(".");
+    if (pos == std::string::npos) {
+      layer_params[std::string(kUnKnow)].emplace_back(i);
+    } else {
+      layer_params[params_grads[i].first.substr(0, pos)].emplace_back(i);
     }
+  }
 
-    group_grads_params->reserve(layer_params.size());
-    for (size_t i = 0; i < params_grads.size(); ++i) {
-      auto pos = params_grads[i].first.find_first_of(".");
-      std::string key = kUnKnow;
-      if (pos != std::string::npos) {
-        key = params_grads[i].first.substr(0, pos);
-      }
-      auto iter = layer_params.find(key);
-      if (iter == layer_params.end()) continue;
-
-      group_grads_params->emplace_back();
-      auto &local_group_grads_params = group_grads_params->back();
-      for (auto &idx : iter->second) {
-        local_group_grads_params.emplace_back(
-            std::make_pair(params_grads[idx].second, params_grads[idx].first));
-      }
-      layer_params.erase(iter);
+  group_grads_params->reserve(layer_params.size());
+  for (size_t i = 0; i < params_grads.size(); ++i) {
+    auto pos = params_grads[i].first.find_first_of(".");
+    std::string key = kUnKnow;
+    if (pos != std::string::npos) {
+      key = params_grads[i].first.substr(0, pos);
     }
-
-    VLOG(10) << "SetGroupAccordingToLayers: ";
-    for (size_t i = 0; i < group_grads_params->size(); ++i) {
-      VLOG(10) << "group " << i;
-      std::stringstream out;
-      for (auto &p_g : group_grads_params->at(i)) {
-        out << "(" << p_g.second << ", " << p_g.first << "), ";
-      }
-      VLOG(10) << out.str();
+    auto iter = layer_params.find(key);
+    if (iter == layer_params.end()) continue;
+
+    group_grads_params->emplace_back();
+    auto &local_group_grads_params = group_grads_params->back();
+    for (auto &idx : iter->second) {
+      local_group_grads_params.emplace_back(
+          std::make_pair(params_grads[idx].second, params_grads[idx].first));
     }
+    layer_params.erase(iter);
   }
 
-  void SetGroupAccordingToMemorySize(
-      const std::unordered_map<std::string, ir::Node *> &var_nodes,
-      GroupGradsAndParams *group_grads_params) const {
-    if (FLAGS_fuse_parameter_memory_size == 0) {
-      return;
+  VLOG(10) << "SetGroupAccordingToLayers: ";
+  for (size_t i = 0; i < group_grads_params->size(); ++i) {
+    VLOG(10) << "group " << i;
+    std::stringstream out;
+    for (auto &p_g : group_grads_params->at(i)) {
+      out << "(" << p_g.second << ", " << p_g.first << "), ";
     }
-    size_t group_memory_size =
-        static_cast<size_t>(FLAGS_fuse_parameter_memory_size);
-    GroupGradsAndParams local_group_grads_params;
-
-    size_t j = 0;
+    VLOG(10) << out.str();
+  }
+}
+
+void AllocContinuousSpaceForGradPass::SetGroupAccordingToMemorySize(
+    const std::unordered_map<std::string, ir::Node *> &var_nodes,
+    GroupGradsAndParams *group_grads_params) const {
+  const uint64_t group_memory_size = GetFuseParameterMemorySize();
+  if (group_memory_size == 0) {
+    return;
+  }
+  GroupGradsAndParams local_group_grads_params;
+  size_t j = 0;
+  while (j < group_grads_params->size()) {
+    local_group_grads_params.emplace_back();
+    auto &group_p_g = local_group_grads_params.back();
+    size_t local_group_memory_size = 0;
     while (j < group_grads_params->size()) {
-      local_group_grads_params.emplace_back();
-      auto &group_p_g = local_group_grads_params.back();
-      size_t local_group_memory_size = 0;
-      while (j < group_grads_params->size()) {
-        std::for_each(
-            group_grads_params->at(j).begin(), group_grads_params->at(j).end(),
-            [&local_group_memory_size,
-             &var_nodes](const std::pair<std::string, std::string> &g_p) {
-              auto iter = var_nodes.find(g_p.second);
-              PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.",
-                             g_p.second);
-              auto shape = iter->second->Var()->GetShape();
-              size_t size =
-                  framework::SizeOfType(iter->second->Var()->GetDataType());
-              std::for_each(shape.begin(), shape.end(),
-                            [&size](const int64_t &n) { size *= n; });
-              local_group_memory_size += size;
-            });
-        group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(),
-                         group_grads_params->at(j).end());
-        ++j;
-        if (local_group_memory_size >= group_memory_size) {
-          break;
-        }
-      }
-    }
-
-    std::swap(*group_grads_params, local_group_grads_params);
-
-    VLOG(10) << string::Sprintf(
-        "SetGroupAccordingToMemorySize(memory_size: %d):",
-        FLAGS_fuse_parameter_memory_size);
-    for (size_t i = 0; i < group_grads_params->size(); ++i) {
-      VLOG(10) << "group " << i;
-      std::stringstream out;
-      for (auto &g_p : group_grads_params->at(i)) {
-        auto iter = var_nodes.find(g_p.second);
-        PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second);
-        auto shape = iter->second->Var()->GetShape();
-        size_t size = framework::SizeOfType(iter->second->Var()->GetDataType());
-        std::for_each(shape.begin(), shape.end(),
-                      [&size](const int64_t &n) { size *= n; });
-        out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first);
+      std::for_each(
+          group_grads_params->at(j).begin(), group_grads_params->at(j).end(),
+          [&local_group_memory_size,
+           &var_nodes](const std::pair<std::string, std::string> &g_p) {
+            auto iter = var_nodes.find(g_p.second);
+            PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.",
+                           g_p.second);
+            auto shape = iter->second->Var()->GetShape();
+            size_t size =
+                framework::SizeOfType(iter->second->Var()->GetDataType());
+            std::for_each(shape.begin(), shape.end(),
+                          [&size](const int64_t &n) { size *= n; });
+            local_group_memory_size += size;
+          });
+      group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(),
+                       group_grads_params->at(j).end());
+      ++j;
+      if (local_group_memory_size >= group_memory_size) {
+        break;
       }
-      VLOG(10) << out.str();
     }
   }
 
-  void SetGroupAccordingToGroupSize(
-      const std::unordered_map<std::string, ir::Node *> &var_nodes,
-      GroupGradsAndParams *group_grads_params) const {
-    if (FLAGS_fuse_parameter_groups_size == 1) {
-      return;
-    }
-    size_t group_size = static_cast<size_t>(FLAGS_fuse_parameter_groups_size);
-    if (FLAGS_fuse_parameter_groups_size == -1) {
-      group_size = group_grads_params->size();
-    }
-    PADDLE_ENFORCE_GT(group_size, 1);
-    size_t groups = (group_grads_params->size() + group_size - 1) / group_size;
-    GroupGradsAndParams local_group_grads_params;
-    local_group_grads_params.reserve(groups);
-
-    size_t j = 0;
-    for (size_t i = 0; i < groups; ++i) {
-      local_group_grads_params.emplace_back();
-      auto &group_p_g = local_group_grads_params.back();
-      group_p_g.reserve(group_size);
-      while (j < group_grads_params->size()) {
-        group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(),
-                         group_grads_params->at(j).end());
-        ++j;
-        if (j % group_size == 0) break;
-      }
-    }
-    std::swap(*group_grads_params, local_group_grads_params);
-
-    VLOG(10) << "SetGroupAccordingToGroupSize(group_size: " << group_size
-             << "): ";
-    for (size_t i = 0; i < group_grads_params->size(); ++i) {
-      VLOG(10) << "group " << i;
-      std::stringstream out;
-      for (auto &p_g : group_grads_params->at(i)) {
-        out << "(" << p_g.second << ", " << p_g.first << "), ";
-      }
-      VLOG(10) << out.str();
+  std::swap(*group_grads_params, local_group_grads_params);
+
+  VLOG(10) << string::Sprintf("SetGroupAccordingToMemorySize(memory_size: %d):",
+                              group_memory_size);
+  for (size_t i = 0; i < group_grads_params->size(); ++i) {
+    VLOG(10) << "group " << i;
+    std::stringstream out;
+    for (auto &g_p : group_grads_params->at(i)) {
+      auto iter = var_nodes.find(g_p.second);
+      PADDLE_ENFORCE(iter != var_nodes.end(), "%s is not found.", g_p.second);
+      auto shape = iter->second->Var()->GetShape();
+      size_t size = framework::SizeOfType(iter->second->Var()->GetDataType());
+      std::for_each(shape.begin(), shape.end(),
+                    [&size](const int64_t &n) { size *= n; });
+      out << string::Sprintf("(%s(%d), %s)", g_p.second, size, g_p.first);
     }
+    VLOG(10) << out.str();
   }
+}
 
- private:
-  bool IsSupportedVarType(const proto::VarType::Type &type) const {
-    // Current only support LOD_TENSOR.
-    return type == proto::VarType::LOD_TENSOR;
+void AllocContinuousSpaceForGradPass::SetGroupAccordingToGroupSize(
+    const std::unordered_map<std::string, ir::Node *> &var_nodes,
+    GroupGradsAndParams *group_grads_params) const {
+  if (GetFuseParameterGroupsSize() == 1) {
+    return;
   }
-
-  void RecordParamsAndGrads(ir::Node *node,
-                            ParamsAndGrads *params_grads) const {
-    try {
-      bool is_bk_op =
-          static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
-                                OpProtoAndCheckerMaker::OpRoleAttrName())) &
-                            static_cast<int>(OpRole::kBackward));
-      if (!is_bk_op) return;
-
-      // Currently, we assume that once gradient is generated, it can be
-      // broadcast, and each gradient is only broadcast once.
-      auto backward_vars =
-          boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
-              OpProtoAndCheckerMaker::OpRoleVarAttrName()));
-      PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast<size_t>(0));
-
-      for (size_t i = 0; i < backward_vars.size(); i += 2) {
-        VLOG(10) << "Trainable parameter: " << backward_vars[i]
-                 << ", gradient: " << backward_vars[i + 1];
-
-        params_grads->emplace_back(std::make_pair(
-            backward_vars[i] /*param*/, backward_vars[i + 1] /*grad*/));
-      }
-    } catch (boost::bad_get e) {
+  const int group_size = GetFuseParameterGroupsSize() == -1
+                             ? static_cast<int>(group_grads_params->size())
+                             : GetFuseParameterGroupsSize();
+  PADDLE_ENFORCE_GT(group_size, 1);
+  size_t groups = (group_grads_params->size() + group_size - 1) / group_size;
+  GroupGradsAndParams local_group_grads_params;
+  local_group_grads_params.reserve(groups);
+
+  size_t j = 0;
+  for (size_t i = 0; i < groups; ++i) {
+    local_group_grads_params.emplace_back();
+    auto &group_p_g = local_group_grads_params.back();
+    group_p_g.reserve(group_size);
+    while (j < group_grads_params->size()) {
+      group_p_g.insert(group_p_g.end(), group_grads_params->at(j).begin(),
+                       group_grads_params->at(j).end());
+      ++j;
+      if (j % group_size == 0) break;
     }
   }
-
-  void InitFusedVarsAndAllocSpaceForVars(
-      const std::vector<platform::Place> &places,
-      const std::vector<Scope *> &local_scopes,
-      const std::unordered_map<std::string, ir::Node *> &vars,
-      const std::string &fused_var_name,
-      const ParamsAndGrads &params_grads) const {
-    //  Init Gradients and FusedVars
-    VLOG(10) << "Init FusedVars and Gradients.";
-    for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
-      auto &scope = *it;
-
-      PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
-                     "%s has existed in scope.", fused_var_name);
-      scope->Var(fused_var_name)->GetMutable<LoDTensor>();
-
-      for (auto &p_g : params_grads) {
-        auto iter = vars.find(p_g.second);
-        PADDLE_ENFORCE(iter != vars.end());
-        PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
-        PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(),
-                          proto::VarType::LOD_TENSOR);
-        scope->Var(p_g.second)->GetMutable<LoDTensor>();
-      }
+  std::swap(*group_grads_params, local_group_grads_params);
+
+  VLOG(10) << string::Sprintf("SetGroupAccordingToGroupSize(group_size: %d):",
+                              group_size);
+  for (size_t i = 0; i < group_grads_params->size(); ++i) {
+    VLOG(10) << "group " << i;
+    std::stringstream out;
+    for (auto &p_g : group_grads_params->at(i)) {
+      out << "(" << p_g.second << ", " << p_g.first << "), ";
+    }
+    VLOG(10) << out.str();
+  }
+}
+
+bool AllocContinuousSpaceForGradPass::IsSupportedVarType(
+    const proto::VarType::Type &type) const {
+  // Current only support LOD_TENSOR.
+  return type == proto::VarType::LOD_TENSOR;
+}
+
+void AllocContinuousSpaceForGradPass::RecordParamsAndGrads(
+    ir::Node *node, ParamsAndGrads *params_grads) const {
+  try {
+    bool is_bk_op =
+        static_cast<bool>(boost::get<int>(node->Op()->GetAttr(
+                              OpProtoAndCheckerMaker::OpRoleAttrName())) &
+                          static_cast<int>(OpRole::kBackward));
+    if (!is_bk_op) return;
+
+    // Currently, we assume that once gradient is generated, it can be
+    // broadcast, and each gradient is only broadcast once.
+    auto backward_vars =
+        boost::get<std::vector<std::string>>(node->Op()->GetNullableAttr(
+            OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+    PADDLE_ENFORCE_EQ(backward_vars.size() % 2, static_cast<size_t>(0));
+
+    for (size_t i = 0; i < backward_vars.size(); i += 2) {
+      VLOG(10) << "Trainable parameter: " << backward_vars[i]
+               << ", gradient: " << backward_vars[i + 1];
+
+      params_grads->emplace_back(std::make_pair(backward_vars[i] /*param*/,
+                                                backward_vars[i + 1] /*grad*/));
     }
+  } catch (boost::bad_get e) {
+  }
+}
+
+void AllocContinuousSpaceForGradPass::InitFusedVarsAndAllocSpaceForVars(
+    const std::vector<platform::Place> &places,
+    const std::vector<Scope *> &local_scopes,
+    const std::unordered_map<std::string, ir::Node *> &vars,
+    const std::string &fused_var_name,
+    const ParamsAndGrads &params_grads) const {
+  //  Init Gradients and FusedVars
+  VLOG(10) << "Init FusedVars and Gradients.";
+  for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
+    auto &scope = *it;
+
+    PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
+                   "%s has existed in scope.", fused_var_name);
+    scope->Var(fused_var_name)->GetMutable<LoDTensor>();
 
-    // Alloc continuous space for vars.
-    std::vector<std::string> grads_name;
-    std::vector<std::string> params_name;
-    grads_name.reserve(params_grads.size());
-    params_name.reserve(params_grads.size());
     for (auto &p_g : params_grads) {
-      params_name.emplace_back(p_g.first);
-      grads_name.emplace_back(p_g.second);
-    }
-    framework::ProgramDesc program_desc;
-    AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
-                              program_desc.MutableBlock(0));
-
-    for (size_t i = 0; i < local_scopes.size(); ++i) {
-      for (auto &op_desc : program_desc.Block(0).AllOps()) {
-        auto op = OpRegistry::CreateOp(*op_desc);
-        op->Run(*local_scopes[i], places[i]);
-      }
+      auto iter = vars.find(p_g.second);
+      PADDLE_ENFORCE(iter != vars.end());
+      PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
+      PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(),
+                        proto::VarType::LOD_TENSOR);
+      scope->Var(p_g.second)->GetMutable<LoDTensor>();
     }
   }
 
-  void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
-                                 const std::vector<std::string> &grads_name,
-                                 const std::string &fused_var_name,
-                                 BlockDesc *global_block) const {
-    auto op_desc = global_block->AppendOp();
-    op_desc->SetType("alloc_continuous_space");
-    op_desc->SetInput("Input", params_name);
-    op_desc->SetOutput("Output", grads_name);
-    op_desc->SetOutput("FusedOutput", {fused_var_name});
+  // Alloc continuous space for vars.
+  std::vector<std::string> grads_name;
+  std::vector<std::string> params_name;
+  grads_name.reserve(params_grads.size());
+  params_name.reserve(params_grads.size());
+  for (auto &p_g : params_grads) {
+    params_name.emplace_back(p_g.first);
+    grads_name.emplace_back(p_g.second);
+  }
+  framework::ProgramDesc program_desc;
+  AppendAllocSpaceForVarsOp(params_name, grads_name, fused_var_name,
+                            program_desc.MutableBlock(0));
+
+  for (size_t i = 0; i < local_scopes.size(); ++i) {
+    for (auto &op_desc : program_desc.Block(0).AllOps()) {
+      auto op = OpRegistry::CreateOp(*op_desc);
+      op->Run(*local_scopes[i], places[i]);
+    }
   }
-};
+}
+
+void AllocContinuousSpaceForGradPass::AppendAllocSpaceForVarsOp(
+    const std::vector<std::string> &params_name,
+    const std::vector<std::string> &grads_name,
+    const std::string &fused_var_name, BlockDesc *global_block) const {
+  auto op_desc = global_block->AppendOp();
+  op_desc->SetType("alloc_continuous_space");
+  op_desc->SetInput("Input", params_name);
+  op_desc->SetOutput("Output", grads_name);
+  op_desc->SetOutput("FusedOutput", {fused_var_name});
+}
 
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6d56f17cc4ef7e07500aae8067211a7b9ac04b0
--- /dev/null
+++ b/paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h
@@ -0,0 +1,79 @@
+//   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+void SetFuseParameterGroupsSize(int group_size);
+int GetFuseParameterGroupsSize();
+
+void SetFuseParameterMemorySize(uint64_t memory_size);
+uint64_t GetFuseParameterMemorySize();
+
+class AllocContinuousSpaceForGradPass : public ir::Pass {
+ protected:
+  void ApplyImpl(ir::Graph *graph) const override;
+
+  template <typename AttrType>
+  void ResetAttribute(const std::string &attr_name, ir::Graph *graph) const;
+
+  void SetGroupGradsAndParams(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      const ParamsAndGrads &params_grads,
+      GroupGradsAndParams *group_grads_params) const;
+
+  void SetGroupAccordingToLayers(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      const ParamsAndGrads &params_grads,
+      GroupGradsAndParams *group_grads_params) const;
+
+  void SetGroupAccordingToMemorySize(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      GroupGradsAndParams *group_grads_params) const;
+
+  void SetGroupAccordingToGroupSize(
+      const std::unordered_map<std::string, ir::Node *> &var_nodes,
+      GroupGradsAndParams *group_grads_params) const;
+
+ private:
+  bool IsSupportedVarType(const proto::VarType::Type &type) const;
+
+  void RecordParamsAndGrads(ir::Node *node, ParamsAndGrads *params_grads) const;
+
+  void InitFusedVarsAndAllocSpaceForVars(
+      const std::vector<platform::Place> &places,
+      const std::vector<Scope *> &local_scopes,
+      const std::unordered_map<std::string, ir::Node *> &vars,
+      const std::string &fused_var_name,
+      const ParamsAndGrads &params_grads) const;
+
+  void AppendAllocSpaceForVarsOp(const std::vector<std::string> &params_name,
+                                 const std::vector<std::string> &grads_name,
+                                 const std::string &fused_var_name,
+                                 BlockDesc *global_block) const;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
index e9aad5d264d1745662848d1ba313b573d0974cb7..7f63c07b18f7c6147670656dfc567f8f2ae8429a 100644
--- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc
@@ -64,9 +64,12 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
               node->Op()->GetNullableAttr("epmap"));
           auto height_section = boost::get<std::vector<int64_t>>(
               node->Op()->GetNullableAttr("sections"));
+          auto trainer_id =
+              boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
           send_varname_to_ctx[send_var_name] =
               operators::distributed::RpcContext(send_var_name, send_varnames,
-                                                 epmap, height_section);
+                                                 epmap, height_section,
+                                                 trainer_id);
           VLOG(3) << "find and init an send op: "
                   << send_varname_to_ctx[send_var_name];
         } else if (node->Name() == "recv") {
@@ -75,9 +78,11 @@ void ProcessGraph(std::vector<ir::Graph *> graphs, Scope *scope) {
               node->Op()->GetNullableAttr("recv_varnames"));
           auto epmap = boost::get<std::vector<std::string>>(
               node->Op()->GetNullableAttr("epmap"));
+          auto trainer_id =
+              boost::get<int>(node->Op()->GetNullableAttr("trainer_id"));
           recv_varname_to_ctx[recv_var_name] =
               operators::distributed::RpcContext(recv_var_name, recv_varnames,
-                                                 epmap, {});
+                                                 epmap, {}, trainer_id);
           nodes_to_delete.push_back(node);
           VLOG(3) << "find and remove an recv op: "
                   << recv_varname_to_ctx[recv_var_name];
diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc
index f8bf43bcb48226b4d1317a78ade7179741097378..196603bbff1db79e46ebbe8b18f1092fcbaac7f9 100644
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
@@ -101,8 +101,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
                "mode.";
         strategy_.fuse_all_optimizer_ops_ = false;
       } else {
-        VLOG(10) << "Add alloc_continuous_space_for_grad_pass";
-        AppendPass("alloc_continuous_space_for_grad_pass");
         // NOTE: fuse_all_xx_ops will count the number of xx operator first,
         // if the number is zero, fuse_all_reduce_ops will do nothing.
         // Currently, only one type of optimization algorithm can be fused.
@@ -142,6 +140,19 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
       AppendPass("memory_optimize_pass");
     }
 
+    // runtime_context_cache pass should be the last pass to enable the attr of
+    // all original and fused operators. But no operators can be enabled this
+    // attr if putting it after MultiDevPass.
+    if (strategy_.cache_runtime_context_) {
+      VLOG(10) << "Add runtime_context_cache_pass";
+      AppendPass("runtime_context_cache_pass");
+    }
+
+    if (strategy_.cache_expected_kernel_) {
+      VLOG(10) << "Add expected_kernel_cache_pass";
+      AppendPass("expected_kernel_cache_pass");
+    }
+
     AppendMultiDevPass(strategy_);
 
     if (strategy_.fuse_all_reduce_ops_) {
@@ -243,7 +254,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
   CreatePassesFromStrategy(false);
 
   for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
-    VLOG(3) << "apply " << pass->Type();
+    VLOG(3) << "BuildStrategy::Apply pass:" << pass->Type();
     if (IsMultiDevPass(pass->Type())) {
       pass->Erase(kPlaces);
       pass->SetNotOwned<const std::vector<platform::Place>>(kPlaces, &places);
@@ -328,3 +339,5 @@ USE_PASS(graph_to_program_pass);
 USE_PASS(fuse_adam_op_pass);
 USE_PASS(fuse_sgd_op_pass);
 USE_PASS(fuse_all_reduce_op_pass);
+USE_PASS(runtime_context_cache_pass);
+USE_PASS(expected_kernel_cache_pass);
diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index cc48c51e924039d93b2e1e18bea752611e7bef92..b1601cfbcd5e9c66f1bbecd1f6fe10bc279cea26 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -83,11 +83,11 @@ struct BuildStrategy {
 
   bool sync_batch_norm_{false};
 
-  bool memory_optimize_{true};
-  // TODO(dzhwinter):
-  // make enable_inplace, memory_optimize_
-  // memory_early_delete_ true by default
-  bool enable_inplace_{true};
+  // FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4
+  // to open them by default, we need to solve the fetch variable issue
+  bool memory_optimize_{false};
+
+  bool enable_inplace_{false};
 
   bool enable_sequential_execution_{false};
 
@@ -107,6 +107,9 @@ struct BuildStrategy {
   std::vector<std::string> trainers_endpoints_;
   bool remove_unnecessary_lock_{true};
 
+  bool cache_runtime_context_{false};
+  bool cache_expected_kernel_{true};
+
   // NOTE:
   // Before you add new options, think if it's a general strategy that works
   // with other strategy. If not, the strategy should be created through
diff --git a/paddle/fluid/framework/details/dgc_const_values.h b/paddle/fluid/framework/details/dgc_const_values.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbe50dc91160e1d7d5175daa150ec9c45aa60a6f
--- /dev/null
+++ b/paddle/fluid/framework/details/dgc_const_values.h
@@ -0,0 +1,32 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+constexpr char g_dgc_counter_name[] = "__g_dgc_counter__";
+constexpr char g_dgc_rampup_begin_step[] = "__g_rampup_begin_step__";
+constexpr char g_dgc_u[] = "__dgc_u__";
+constexpr char g_dgc_v[] = "__dgc_v__";
+constexpr char g_dgc_k[] = "__dgc_k__";
+constexpr char g_dgc_encoded[] = "__dgc_encoded__";
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.cc b/paddle/fluid/framework/details/fuse_adam_op_pass.cc
index 0ef75e319244e2ccc63dfa3f93f0cd764cf67633..f95d93fd5575ae538274c4c0322cf661c631849a 100644
--- a/paddle/fluid/framework/details/fuse_adam_op_pass.cc
+++ b/paddle/fluid/framework/details/fuse_adam_op_pass.cc
@@ -24,7 +24,7 @@ namespace details {
 const std::string FuseAdamOpPass::GetOpType() const { return "adam"; }
 
 const std::vector<std::string> FuseAdamOpPass::GetAuxiliaryVarNames() const {
-  return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
+  return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"};
 }
 
 void FuseAdamOpPass::FuseOptimizerOps(
@@ -77,16 +77,16 @@ void FuseAdamOpPass::FuseAdamOps(
   VLOG(10) << "Insert adam to graph ";
   OpDesc adam_desc(adam_ops[0]->Op()->Block());
   adam_desc.SetType("adam");
-  adam_desc.SetInput("Param", {fused_vars_name.at("Param")});
-  adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
+  adam_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
+  adam_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
   adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")});
   adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")});
   // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
-  adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate"));
+  adam_desc.SetInput(kLearningRate, adam_ops[0]->Op()->Input(kLearningRate));
   adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow"));
   adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow"));
 
-  adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
+  adam_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
   adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")});
   adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")});
   adam_desc.SetAttr("beta1", beta1);
diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc
index b49f095d428a017dd1a3bed2788a048af9afa6bb..25aa3019d102293725d836cf1f8e9fce8462408b 100644
--- a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc
+++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc
@@ -29,7 +29,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   auto &local_scopes = Get<const std::vector<Scope *>>(kLocalScopes);
 
   const std::string fuse_op_type = GetOpType();
-  const std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
+  std::vector<std::string> aux_var_names = GetAuxiliaryVarNames();
+  aux_var_names.emplace_back(kParam);
+  aux_var_names.emplace_back(kGrad);
 
   // Step 1: Get the specified op and auxiliary variables.
   std::vector<ir::Node *> topo_nodes = ir::TopologySortOperations(result);
@@ -61,7 +63,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
     result.Set(kFusedVars, new FusedVars);
   }
   std::unordered_map<std::string, std::string> fused_vars_name;
-  fused_vars_name.reserve(aux_var_names.size() + 1);
+  fused_vars_name.reserve(aux_var_names.size());
   auto &fused_var_set = result.Get<FusedVars>(kFusedVars);
   const std::string prefix(kFusedVarNamePrefix);
   // NOTE: the fused_var_name should be unique.
@@ -75,39 +77,103 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const {
   }
 
   // Step 3: Get the fused Gradient's name
-  auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
-  if (!result.Has(kFusedGrads)) {
-    PADDLE_THROW(
-        "The alloc_continuous_space_for_grad_pass should be called before this "
-        "pass.");
-  }
-  auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
-  auto &fused_vars = result.Get<FusedVars>(kFusedVars);
-  auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
-  PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
-  fused_vars_name.emplace("Grad", fused_grad);
-
-  // Step 4: Sort the parameters and auxiliary variables according
-  // to parameters' name to make variables' name correspond correctly.
-  PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads.");
-  PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(),
-                    "The size of params_grads and aux_var_set are not equal.");
-  SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
-
-  // Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
+  bool grad_fused = false;
+  if (result.Has(kParamsAndGrads)) {
+    auto &params_grads = result.Get<ParamsAndGrads>(kParamsAndGrads);
+    PADDLE_ENFORCE_EQ(
+        params_grads.size(), aux_var_set.at(kGrad).size(),
+        "The number of gradients and optimizer ops is not equal.");
+    std::unordered_set<std::string> opt_grad_set(aux_var_set.at(kGrad).begin(),
+                                                 aux_var_set.at(kGrad).end());
+    size_t same_grad_num = 0;
+    for (auto &p_g : params_grads) {
+      if (opt_grad_set.count(p_g.second)) {
+        ++same_grad_num;
+      }
+    }
+
+    // NOTE(zcd): the gradient of kParamsAndGrads may be different with the
+    // kGrad.
+    if (same_grad_num == aux_var_set.at(kGrad).size()) {
+      if (!result.Has(kFusedGrads)) {
+        PADDLE_THROW(
+            "The alloc_continuous_space_for_grad_pass should be called before "
+            "this pass.");
+      }
+      auto &fused_grad = result.Get<FusedGrads>(kFusedGrads);
+      auto &fused_vars = result.Get<FusedVars>(kFusedVars);
+      auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad);
+      PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad.");
+      fused_vars_name[kGrad] = fused_grad;
+
+      // Sort the parameters and auxiliary variables according
+      // to parameters' name to make variables' name correspond correctly.
+      SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops);
+      grad_fused = true;
+    }
+  }
+
+  // Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g.
   // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately.
+  aux_var_names.pop_back();
+  if (!grad_fused) {
+    InitFusedGradsAndAllocSpaceForGrads(
+        places, local_scopes, aux_var_set.at(kParam), aux_var_set.at(kGrad),
+        fused_vars_name.at(kGrad), &result);
+  }
   InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names,
                                     aux_var_set, fused_vars_name);
 
-  // Step 6: Fuse optimizer Ops and Scale Ops
+  // Step 5: Fuse optimizer Ops and Scale Ops
   FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result);
 
-  // Step 7: Remove optimizer Ops
+  // Step 6: Remove optimizer Ops
   for (auto &opt_op : opt_ops) {
     graph->RemoveNode(opt_op);
   }
 }
 
+void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads(
+    const std::vector<platform::Place> &places,
+    const std::vector<Scope *> &local_scopes,
+    const std::vector<std::string> &params,
+    const std::vector<std::string> &grads, const std::string &fused_grad_name,
+    ir::Graph *result) const {
+  // Get Var Nodes
+  std::unordered_map<std::string, ir::Node *> vars;
+  for (ir::Node *node : result->Nodes()) {
+    if (node->IsVar() && node->Var()) {
+      // Note: The graph may have the same name node. For example, parameter
+      // is the input of operator and it also is the output of optimizer;
+      vars.emplace(node->Var()->Name(), node);
+    }
+  }
+  // Init Grads
+  for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) {
+    auto &scope = *it;
+    VLOG(10) << "Init " << fused_grad_name;
+    PADDLE_ENFORCE(scope->FindVar(fused_grad_name) == nullptr,
+                   "%s has existed in scope.", fused_grad_name);
+    scope->Var(fused_grad_name)->GetMutable<LoDTensor>();
+
+    for (auto &grad_var_name : grads) {
+      auto iter = vars.find(grad_var_name);
+      PADDLE_ENFORCE(iter != vars.end());
+      PADDLE_ENFORCE_NOT_NULL(iter->second->Var());
+      PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(),
+                        proto::VarType::LOD_TENSOR);
+      scope->Var(grad_var_name)->GetMutable<LoDTensor>();
+    }
+  }
+  // Define Ops
+  ProgramDesc program_desc;
+  auto *global_block = program_desc.MutableBlock(0);
+  AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block,
+                             false, false);
+  // Run Ops
+  RunInitOps(places, local_scopes, *global_block);
+}
+
 void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
     const std::vector<platform::Place> &places,
     const std::vector<Scope *> &local_scopes,
@@ -115,37 +181,49 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars(
     const std::unordered_map<std::string, std::vector<std::string>>
         &aux_var_set,
     const std::unordered_map<std::string, std::string> &fused_vars_name) const {
-  VLOG(10) << "Init FusedVars.";
-  // Alloc parameters and auxiliary vars in the respective scope.
-  size_t idx = local_scopes.size();
-  for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
-       ++iter, --idx) {
-    auto &scope = *iter;
-    for (auto &var_name : aux_var_names) {
-      auto fused_var_name = fused_vars_name.at(var_name);
-      VLOG(10) << "Init " << fused_var_name;
-      PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
-                     "%s has exist in scope[%d]", fused_var_name, idx);
-      scope->Var(fused_var_name)->GetMutable<LoDTensor>();
-    }
+  // Init Vars
+  for (auto &var_name : aux_var_names) {
+    auto &fused_var_name = fused_vars_name.at(var_name);
+    InitVars(local_scopes, fused_var_name);
   }
-
+  // Define Ops
   ProgramDesc program_desc;
   auto *global_block = program_desc.MutableBlock(0);
   for (auto &var_name : aux_var_names) {
-    AppendAllocContinuousSpace(aux_var_set.at(var_name),
-                               fused_vars_name.at(var_name), true,
-                               global_block);
+    AppendAllocContinuousSpace(
+        aux_var_set.at(var_name), aux_var_set.at(var_name),
+        fused_vars_name.at(var_name), global_block, true);
   }
+  // Run Ops
+  RunInitOps(places, local_scopes, *global_block);
+}
 
+void FuseOptimizerOpPass::RunInitOps(const std::vector<platform::Place> &places,
+                                     const std::vector<Scope *> &local_scopes,
+                                     const BlockDesc &global_block) const {
   for (size_t i = 0; i < local_scopes.size(); ++i) {
-    for (auto &op_desc : global_block->AllOps()) {
+    for (auto &op_desc : global_block.AllOps()) {
       auto op = OpRegistry::CreateOp(*op_desc);
       op->Run(*local_scopes[i], places[i]);
     }
   }
 }
 
+void FuseOptimizerOpPass::InitVars(const std::vector<Scope *> &local_scopes,
+                                   const std::string &fused_var_name) const {
+  VLOG(10) << "Init FusedVars.";
+  // Alloc parameters and auxiliary vars in the respective scope.
+  size_t idx = local_scopes.size();
+  for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend();
+       ++iter, --idx) {
+    auto &scope = *iter;
+    VLOG(10) << "Init " << fused_var_name;
+    PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr,
+                   "%s has exist in scope[%d]", fused_var_name, idx);
+    scope->Var(fused_var_name)->GetMutable<LoDTensor>();
+  }
+}
+
 void FuseOptimizerOpPass::SortParametersAndAuxVars(
     const std::vector<std::pair<std::string, std::string>> &params_grads,
     std::unordered_map<std::string, std::vector<std::string>> *aux_vars_set,
@@ -203,15 +281,16 @@ void FuseOptimizerOpPass::GetSpecifiedOpsAndVars(
 }
 
 void FuseOptimizerOpPass::AppendAllocContinuousSpace(
-    const std::vector<std::string> &args, const std::string &out_arg,
-    bool copy_data, BlockDesc *global_block) const {
+    const std::vector<std::string> &in_args,
+    const std::vector<std::string> &out_args, const std::string &fused_out_arg,
+    BlockDesc *global_block, bool copy_data, bool check_name) const {
   auto op_desc = global_block->AppendOp();
   op_desc->SetType("alloc_continuous_space");
-  op_desc->SetInput("Input", args);
-  op_desc->SetOutput("Output", args);
-  op_desc->SetOutput("FusedOutput", {out_arg});
+  op_desc->SetInput("Input", in_args);
+  op_desc->SetOutput("Output", out_args);
+  op_desc->SetOutput("FusedOutput", {fused_out_arg});
   op_desc->SetAttr("copy_data", copy_data);
-  op_desc->SetAttr("check_name", true);
+  op_desc->SetAttr("check_name", check_name);
 }
 
 void FuseOptimizerOpPass::InserInputAndOutputForOptOps(
diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.h b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h
index 0240f1594d7ef9d855eb6e96e8e8a32ee1d957ba..47efc1693dd31ca88787da3a9d6d06aa7ef65786 100644
--- a/paddle/fluid/framework/details/fuse_optimizer_op_pass.h
+++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h
@@ -27,6 +27,10 @@ namespace paddle {
 namespace framework {
 namespace details {
 
+constexpr char kGrad[] = "Grad";
+constexpr char kParam[] = "Param";
+constexpr char kLearningRate[] = "LearningRate";
+
 class FuseOptimizerOpPass : public ir::Pass {
  protected:
   void ApplyImpl(ir::Graph *graph) const override;
@@ -56,9 +60,18 @@ class FuseOptimizerOpPass : public ir::Pass {
       std::unordered_map<std::string, std::vector<std::string>> *aux_args_name)
       const;
 
-  void AppendAllocContinuousSpace(const std::vector<std::string> &args,
-                                  const std::string &out_arg, bool copy_data,
-                                  BlockDesc *global_block) const;
+  void AppendAllocContinuousSpace(const std::vector<std::string> &in_args,
+                                  const std::vector<std::string> &out_args,
+                                  const std::string &fused_out_arg,
+                                  BlockDesc *global_block, bool copy_data,
+                                  bool check_name = true) const;
+
+  void InitFusedGradsAndAllocSpaceForGrads(
+      const std::vector<platform::Place> &places,
+      const std::vector<Scope *> &local_scopes,
+      const std::vector<std::string> &params,
+      const std::vector<std::string> &grads, const std::string &fused_grad_name,
+      ir::Graph *result) const;
 
   void InitFusedVarsAndAllocSpaceForVars(
       const std::vector<platform::Place> &places,
@@ -68,6 +81,13 @@ class FuseOptimizerOpPass : public ir::Pass {
           &aux_var_set,
       const std::unordered_map<std::string, std::string> &fused_vars_name)
       const;
+
+  void RunInitOps(const std::vector<platform::Place> &places,
+                  const std::vector<Scope *> &local_scopes,
+                  const BlockDesc &global_block) const;
+
+  void InitVars(const std::vector<Scope *> &local_scopes,
+                const std::string &fused_var_name) const;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc
index f91c21e3cc869de1a6d67146eb99f27a2ca5497c..2219f3209f77de5cb34abfb9edb8bdea6a8eebb0 100644
--- a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc
+++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc
@@ -24,7 +24,7 @@ namespace details {
 const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; }
 
 const std::vector<std::string> FuseSgdOpPass::GetAuxiliaryVarNames() const {
-  return {"Param"};
+  return {};
 }
 
 void FuseSgdOpPass::FuseOptimizerOps(
@@ -50,12 +50,12 @@ void FuseSgdOpPass::FuseSgdOps(
   // Add fused scale
   OpDesc Sgd_desc(sgd_ops[0]->Op()->Block());
   Sgd_desc.SetType("sgd");
-  Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")});
-  Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")});
-  Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")});
+  Sgd_desc.SetInput(kParam, {fused_vars_name.at(kParam)});
+  Sgd_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)});
+  Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)});
 
   // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal.
-  Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate"));
+  Sgd_desc.SetInput(kLearningRate, sgd_ops[0]->Op()->Input(kLearningRate));
 
   // NOTE: multi_devices_pass requires that every op should have a role.
   Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role);
diff --git a/paddle/fluid/framework/details/inplace_op_pass.cc b/paddle/fluid/framework/details/inplace_op_pass.cc
index 79150f719e379ca4e2b87d2e7db1b2daeee9aa67..84c9e4a379a5e07dc3a8e85409c804eebc390c73 100644
--- a/paddle/fluid/framework/details/inplace_op_pass.cc
+++ b/paddle/fluid/framework/details/inplace_op_pass.cc
@@ -305,6 +305,12 @@ void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
 
     VLOG(4) << "Try to inplace " << in_var_name << " with " << out_var_name;
 
+    if (var_nodes_[in_var_name].back() != in_node) {
+      VLOG(4) << "SKIP since " << in_var_name
+              << " is also used as output by other ops";
+      continue;
+    }
+
     bool can_replace = true;
     if (in_var_name == out_var_name) {
       can_replace = false;
@@ -527,6 +533,9 @@ void GraphView::Build(ir::Graph* g) {
   };
   for (auto& node : g->Nodes()) {
     if (!node->IsOp()) continue;
+    // avoid optimize the variable used in sub-blocks
+    if (OpHasSubBlock(node->Op())) update_skip_set(node);
+
     if (node->Name() == "send") update_skip_set(node);
     if (node->Name() == "recv") update_skip_set(node);
     if (node->Name() == "prefetch") update_skip_set(node);
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index f213e07b555ca9fc4b73a2f91412063f4e7f47d4..e9aab179d24945ee4a0067df7030192dedf56d58 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -34,6 +34,10 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
+#if defined(PADDLE_WITH_DGC)
+#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
+#endif
+
 namespace paddle {
 namespace framework {
 namespace details {
@@ -438,12 +442,22 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
   auto append_allreduce_op = [&](
       const std::vector<Scope *> &scopes,
       const std::vector<platform::Place> &places) -> OpHandleBase * {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_DGC)
+    if (is_encoded) {
+      result->Get<GraphOps>(kGraphOps).emplace_back(new SparseAllReduceOpHandle(
+          result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+          scopes, places, nccl_ctxs_, is_encoded,
+          static_cast<int>(strategy_.trainers_endpoints_.size()) *
+              places_.size()));
+    } else {
+      result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
+          result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+          scopes, places, nccl_ctxs_));
+    }
+#elif defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
     result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
         result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
-        scopes, places, nccl_ctxs_, is_encoded,
-        static_cast<int>(strategy_.trainers_endpoints_.size()) *
-            places_.size()));
+        scopes, places, nccl_ctxs_));
 #else
     result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
         result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
@@ -561,7 +575,11 @@ void AllReduceSSAGraphBuilder::InsertCollectiveOp(
     CreateReduceOp(result, g_name, 0);
     CreateBroadcastOp(result, g_name, 0);
   } else {
+#if defined(PADDLE_WITH_DGC)
+    CreateAllReduceOp(result, g_name, IsEncoded(p_name));
+#else
     CreateAllReduceOp(result, g_name);
+#endif
   }
 }
 
@@ -965,8 +983,9 @@ int DistSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
   return op_dev_id;
 }
 
-bool DistSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
-  auto u_name = p_name + "__dgc_u__";
+#if defined(PADDLE_WITH_DGC)
+bool AllReduceSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
+  auto u_name = p_name + g_dgc_u;
   auto it = all_vars_.find(u_name);
   if (it == all_vars_.end()) {
     VLOG(10) << "can't find u_name, so it's not encoded:" << u_name;
@@ -975,6 +994,11 @@ bool DistSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
 
   return true;
 }
+#else
+bool AllReduceSSAGraphBuilder::IsEncoded(const std::string &p_name) const {
+  return false;
+}
+#endif
 
 void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
                                              const std::string &p_name,
@@ -992,11 +1016,7 @@ void DistSSAGraphBuilder::InsertCollectiveOp(ir::Graph *result,
         CreateReduceOp(result, g_name, 0);
         CreateBroadcastOp(result, g_name, 0);
       } else {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-        CreateAllReduceOp(result, g_name, IsEncoded(p_name));
-#else
-        PADDLE_ENFORCE(false, "Compiled withoud cuda!");
-#endif
+        CreateAllReduceOp(result, g_name);
       }
       break;
     default:
diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h
index 7cc68dd2d5a422cfa1ac3a4bfdd48545a6e5691d..0c4b3b0b8c963e99da5886f25e0df8146ce3695c 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h
@@ -113,6 +113,8 @@ class AllReduceSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
                                   const std::string &g_name) const;
 
   virtual void InsertPostprocessOps(ir::Graph *result) const {}
+
+  bool IsEncoded(const std::string &p_name) const;
 };
 
 class AsyncSSAGraphBuilder : public MultiDevSSAGraphBuilderBase {
@@ -203,8 +205,6 @@ class DistSSAGraphBuilder : public BalanceVarSSAGraphBuilder {
 
   mutable std::vector<std::unordered_set<std::string>> bcast_var_name_set_;
   mutable bool need_broadcast_var_{false};
-
-  bool IsEncoded(const std::string &p_name) const;
 };
 
 std::unordered_set<std::string> &MultiDevSSAGraphBuilder();
diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h
index a9a4fb08a2ca4689e8b6a6f10f83d065332ac192..18de595983f52e56dba4f5069257f354132db51b 100644
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@@ -233,6 +233,12 @@ struct OpInfoFiller<T, kNoNeedBufferVarsInference> {
   }
 };
 
+// A fake OpInfoFiller of void
+template <>
+struct OpInfoFiller<void, kUnknown> {
+  void operator()(const char* op_type, OpInfo* info) const {}
+};
+
 }  // namespace details
 
 }  // namespace framework
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
index 137e0dd7708dcc77c3a927979cfb357249f1fdc9..1bd27263f7dad5f733c553c202444ba7cacd2510 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc
@@ -106,7 +106,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
   VLOG(1) << "set num_threads: " << strategy_.num_threads_
           << " to run the operators of the graph on each device.";
   for (size_t i = 0; i < places.size(); ++i) {
-    executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
+    executors_.emplace_back(new details::FastThreadedSSAGraphExecutor(
         strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get()));
   }
 }
diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
index 1e421f2a3a51363fe368859f7a34593c8c894077..faf071b05306a49c0049421bc72e4981c0bfc84c 100644
--- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h
@@ -14,12 +14,12 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
 #include <vector>
-
 #include "ThreadPool.h"
+#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/ir/graph.h"
 
 namespace paddle {
@@ -48,7 +48,8 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<platform::Place> places_;
   std::vector<std::unique_ptr<ir::Graph>> graphs_;
 
-  std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;
+  std::vector<std::unique_ptr<details::FastThreadedSSAGraphExecutor>>
+      executors_;
   ExceptionHolder exception_holder_;
 };
 
diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1bdd33fd5357c839aed008d03a9a99848c66101b
--- /dev/null
+++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc
@@ -0,0 +1,188 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
+#include <algorithm>
+#include "dgc/dgc.h"
+#include "paddle/fluid/framework/details/container_cast.h"
+#include "paddle/fluid/framework/details/reduce_and_gather.h"
+#include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/platform/profiler.h"
+
+DECLARE_bool(sync_nccl_allreduce);
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+SparseAllReduceOpHandle::SparseAllReduceOpHandle(
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    const platform::NCCLContextMap *ctxs, bool is_encoded, int nranks)
+    : AllReduceOpHandle(node, local_scopes, places, ctxs),
+      is_encoded_(is_encoded),
+      nranks_(nranks) {
+  // TODO(gongwb) :polish them!
+  if (is_encoded) {
+    VLOG(1) << "Use dgc allreduce mode";
+  }
+}
+
+void SparseAllReduceOpHandle::RunImplEncoded() {
+  platform::RecordEvent record_event(Name());
+
+  WaitInputVarGenerated();
+
+  auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
+  auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), places_.size(),
+      "The NoDummyInputSize should be equal to the number of places.");
+  PADDLE_ENFORCE_EQ(
+      in_var_handles.size(), out_var_handles.size(),
+      "The NoDummyInputSize and NoDummyOutputSize should be equal.");
+
+  std::vector<const LoDTensor *> ins;
+  std::vector<LoDTensor *> outs;
+  int k = -1;
+  for (size_t i = 0; i < local_scopes_.size(); ++i) {
+    auto &local_scope =
+        local_scopes_[i]->FindVar(kLocalExecScopeName)->Get<Scope *>();
+    auto original_name =
+        paddle::framework::GradOriginalVarName(in_var_handles[i]->name());
+    auto encode_var_name = original_name + g_dgc_encoded;
+    auto *in_var = local_scope->FindVar(encode_var_name);
+    PADDLE_ENFORCE_NOT_NULL(in_var, "%s should not be null", encode_var_name);
+    auto &in = in_var->Get<LoDTensor>();
+    ins.emplace_back(&in);
+
+    auto *out = local_scope->FindVar(out_var_handles[i]->name())
+                    ->GetMutable<LoDTensor>();
+    outs.emplace_back(out);
+
+    if (k < 0) {
+      k = GetKValue(in_var_handles[i]->name());
+    }
+  }
+
+  PADDLE_ENFORCE(platform::is_gpu_place(ins[0]->place()));
+  PADDLE_ENFORCE(platform::is_gpu_place(outs[0]->place()));
+  PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr.");
+
+  int dtype = -1;
+  size_t in_numel = 0;
+  size_t out_numel = 0;
+  PADDLE_ENFORCE(nranks_ > 1);
+  std::vector<std::function<void()>> all_reduce_calls;
+
+  for (size_t i = 0; i < local_scopes_.size(); ++i) {
+    auto &place = places_[i];
+    auto &in = *ins[i];
+    void *in_tensor_buf = const_cast<void *>(in.data<void>());
+
+    auto &out = *outs[i];
+    float *out_tensor_buf = out.data<float>();
+
+    dtype = (dtype == -1) ? platform::ToNCCLDataType(in.type()) : dtype;
+    in_numel = (in_numel == 0) ? static_cast<size_t>(in.numel()) : in_numel;
+    PADDLE_ENFORCE(in_numel % 2 == 0);
+    PADDLE_ENFORCE(in_numel / 2 == static_cast<size_t>(k));
+    out_numel = (out_numel == 0) ? static_cast<size_t>(out.numel()) : out_numel;
+
+    int dev_id = boost::get<platform::CUDAPlace>(place).device;
+    auto &nccl_ctx = nccl_ctxs_->at(dev_id);
+    auto stream = nccl_ctx.stream();
+    auto comm = nccl_ctx.comm_;
+
+    auto &allocator =
+        platform::DeviceTemporaryAllocator::Instance().Get(place, stream);
+    int encode_size = 2 * k * sizeof(int);
+    // dgc use ncclAllGather to get all the encoded data
+    // so the buffer need nranks.
+    int buf_size = nranks_ * encode_size;
+    auto tmp_ious_data = allocator.Allocate(buf_size);
+    void *gather_buff = reinterpret_cast<void *>(tmp_ious_data->ptr());
+
+    VLOG(10) << "in_numel:" << in_numel << ", out_numel:" << out_numel
+             << ", nranks:" << nranks_ << ", gather_buf size:" << buf_size
+             << ", k:" << k << ", place:" << place << ", dtype:" << dtype;
+
+    all_reduce_calls.emplace_back([=] {
+      PADDLE_ENFORCE(paddle::communication::dgc::sparseAllGReduce(
+          in_tensor_buf, gather_buff, k, out_tensor_buf, out_numel, comm,
+          stream));
+    });
+  }
+
+  RunAllReduceFuncs(all_reduce_calls);
+}
+
+int SparseAllReduceOpHandle::GetKValue(const std::string &grad_name) {
+  auto original_name = paddle::framework::GradOriginalVarName(grad_name);
+  auto var_name = original_name + g_dgc_k;
+  PADDLE_ENFORCE(local_scopes_.size() > 0);
+
+  auto *scope = local_scopes_[0];
+  auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  auto var = local_scope->FindVar(var_name);
+  PADDLE_ENFORCE_NOT_NULL(var);
+  auto tensor = var->Get<LoDTensor>().data<float>();
+  return *tensor;
+}
+
+bool SparseAllReduceOpHandle::IsEncoded() {
+  if (!is_encoded_) {
+    return false;
+  }
+  auto counter_name = g_dgc_counter_name;
+  auto step_name = g_dgc_rampup_begin_step;
+  PADDLE_ENFORCE(local_scopes_.size() > 0);
+
+  auto *scope = local_scopes_[0];
+  auto &local_scope = scope->FindVar(kLocalExecScopeName)->Get<Scope *>();
+  auto count_var = local_scope->FindVar(counter_name);
+  auto step_var = local_scope->FindVar(step_name);
+  if (count_var == nullptr || step_var == nullptr) {
+    PADDLE_THROW("not find count_var:%s or step_var:%s", counter_name,
+                 step_var);
+  }
+
+  float count = *count_var->Get<LoDTensor>().data<float>();
+  float step = *step_var->Get<LoDTensor>().data<float>();
+  if (static_cast<int>(count) < static_cast<int>(step)) {
+    VLOG(10) << "in all_reduce currentstep:" << count
+             << " < rampup_begin_step:" << step
+             << " so not use sparse all reduce";
+    return false;
+  }
+
+  return true;
+}
+
+void SparseAllReduceOpHandle::RunImpl() {
+  if (!IsEncoded()) {
+    AllReduceOpHandle::RunImpl();
+    return;
+  }
+
+  RunImplEncoded();
+}
+
+std::string SparseAllReduceOpHandle::Name() const {
+  return "sparse_all_reduce";
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed6be65a2c8009fc417f4230b8169a4847e89440
--- /dev/null
+++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.h
@@ -0,0 +1,52 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/dgc_const_values.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+class SparseAllReduceOpHandle : public AllReduceOpHandle {
+ public:
+  SparseAllReduceOpHandle(ir::Node *node,
+                          const std::vector<Scope *> &local_scopes,
+                          const std::vector<platform::Place> &places,
+                          const platform::NCCLContextMap *ctxs,
+                          bool is_encoded = false, int nranks = -1);
+  std::string Name() const override;
+
+ protected:
+  void RunImpl() override;
+  int GetKValue(const std::string &grad_name);
+  bool IsEncoded();
+  void RunImplEncoded();
+
+ private:
+  bool is_encoded_{false};
+  int nranks_{-1};
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc
index 4ca7842fa261a1b8178438d35ca5d626146663d4..386ffd84c57063e950cd8b0d57304c66190be4c4 100644
--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -21,40 +21,40 @@ namespace framework {
 
 void DownpourWorker::Initialize(const TrainerDesc& desc) {
   param_ = desc.downpour_param();
-  for (size_t i = 0; i < param_.sparse_table_size(); ++i) {
+  for (int i = 0; i < param_.sparse_table_size(); ++i) {
     uint64_t table_id =
         static_cast<uint64_t>(param_.sparse_table(i).table_id());
     TableParameter table = param_.sparse_table(i);
     sparse_key_names_[table_id].resize(table.sparse_key_name_size());
-    for (size_t j = 0; j < table.sparse_key_name_size(); ++j) {
+    for (int j = 0; j < table.sparse_key_name_size(); ++j) {
       sparse_key_names_[table_id][j] = table.sparse_key_name(j);
     }
     sparse_value_names_[table_id].resize(table.sparse_value_name_size());
-    for (size_t j = 0; j < table.sparse_value_name_size(); ++j) {
+    for (int j = 0; j < table.sparse_value_name_size(); ++j) {
       sparse_value_names_[table_id][j] = table.sparse_value_name(j);
     }
     sparse_grad_names_[table_id].resize(table.sparse_grad_name_size());
-    for (size_t j = 0; j < table.sparse_grad_name_size(); ++j) {
+    for (int j = 0; j < table.sparse_grad_name_size(); ++j) {
       sparse_grad_names_[table_id][j] = table.sparse_grad_name(j);
     }
     label_var_name_[table_id] = table.label_var_name();
   }
 
-  for (size_t i = 0; i < param_.dense_table_size(); ++i) {
+  for (int i = 0; i < param_.dense_table_size(); ++i) {
     uint64_t table_id = static_cast<uint64_t>(param_.dense_table(i).table_id());
     auto table = param_.dense_table(i);
     dense_value_names_[table_id].resize(table.dense_value_name_size());
-    for (size_t j = 0; j < table.dense_value_name_size(); ++j) {
+    for (int j = 0; j < table.dense_value_name_size(); ++j) {
       dense_value_names_[table_id][j] = table.dense_value_name(j);
     }
     dense_grad_names_[table_id].resize(table.dense_grad_name_size());
-    for (size_t j = 0; j < table.dense_grad_name_size(); ++j) {
+    for (int j = 0; j < table.dense_grad_name_size(); ++j) {
       dense_grad_names_[table_id][j] = table.dense_grad_name(j);
     }
   }
 
   skip_ops_.resize(param_.skip_ops_size());
-  for (size_t i = 0; i < param_.skip_ops_size(); ++i) {
+  for (int i = 0; i < param_.skip_ops_size(); ++i) {
     skip_ops_[i] = param_.skip_ops(i);
   }
 
@@ -83,14 +83,14 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) {
   LoDTensor* tensor = var->GetMutable<LoDTensor>();
   int64_t* label_ptr = tensor->data<int64_t>();
 
-  int global_index = 0;
+  size_t global_index = 0;
   for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
     VLOG(3) << "sparse_key_names_[" << i
             << "]: " << sparse_key_names_[table_id][i];
     Variable* fea_var = thread_scope_->FindVar(sparse_key_names_[table_id][i]);
     LoDTensor* tensor = fea_var->GetMutable<LoDTensor>();
     int64_t* ids = tensor->data<int64_t>();
-    int fea_idx = 0;
+    size_t fea_idx = 0;
     // tensor->lod()[0].size() == batch_size + 1
     for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) {
       for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) {
@@ -138,7 +138,7 @@ void DownpourWorker::FillSparseValue(size_t table_idx) {
     auto& tensor_lod = tensor->lod()[0];
     LoD data_lod{tensor_lod};
     tensor_emb->set_lod(data_lod);
-    for (auto index = 0u; index < len; ++index) {
+    for (int index = 0; index < len; ++index) {
       if (ids[index] == 0u) {
         memcpy(ptr + table.emb_dim() * index, init_value.data() + 2,
                sizeof(float) * table.emb_dim());
@@ -192,7 +192,7 @@ void DownpourWorker::TrainFilesWithProfiler() {
     read_time += timeline.ElapsedSec();
     total_time += timeline.ElapsedSec();
     VLOG(3) << "program config size: " << param_.program_config_size();
-    for (size_t i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
+    for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
          ++i) {
       uint64_t tid = static_cast<uint64_t>(
           param_.program_config(0).pull_sparse_table_id(i));
@@ -244,8 +244,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
     }
 
     if (need_to_push_sparse_) {
-      for (size_t i = 0;
-           i < param_.program_config(0).push_sparse_table_id_size(); ++i) {
+      for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
+           ++i) {
         uint64_t tid = static_cast<uint64_t>(
             param_.program_config(0).push_sparse_table_id(i));
         TableParameter table;
@@ -268,8 +268,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
 
     if (need_to_push_dense_) {
       timeline.Start();
-      for (size_t i = 0;
-           i < param_.program_config(0).push_dense_table_id_size(); ++i) {
+      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
+           ++i) {
         uint64_t tid = static_cast<uint64_t>(
             param_.program_config(0).push_dense_table_id(i));
         fleet_ptr_->PushDenseVarsAsync(
@@ -315,8 +315,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
     }
 
     if (need_to_push_dense_) {
-      for (size_t i = 0;
-           i < param_.program_config(0).push_dense_table_id_size(); ++i) {
+      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
+           ++i) {
         uint64_t tid = static_cast<uint64_t>(
             param_.program_config(0).push_dense_table_id(i));
         pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);
@@ -362,7 +362,7 @@ void DownpourWorker::TrainFiles() {
   int cur_batch;
   while ((cur_batch = device_reader_->Next()) > 0) {
     // pull sparse here
-    for (size_t i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
+    for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
          ++i) {
       uint64_t tid = static_cast<uint64_t>(
           param_.program_config(0).pull_sparse_table_id(i));
@@ -397,8 +397,8 @@ void DownpourWorker::TrainFiles() {
 
     if (need_to_push_sparse_) {
       // push gradients here
-      for (size_t i = 0;
-           i < param_.program_config(0).push_sparse_table_id_size(); ++i) {
+      for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
+           ++i) {
         uint64_t tid = static_cast<uint64_t>(
             param_.program_config(0).push_sparse_table_id(i));
         TableParameter table;
@@ -416,8 +416,8 @@ void DownpourWorker::TrainFiles() {
     }
 
     if (need_to_push_dense_) {
-      for (size_t i = 0;
-           i < param_.program_config(0).push_dense_table_id_size(); ++i) {
+      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
+           ++i) {
         uint64_t tid = static_cast<uint64_t>(
             param_.program_config(0).push_dense_table_id(i));
         fleet_ptr_->PushDenseVarsAsync(
@@ -461,8 +461,8 @@ void DownpourWorker::TrainFiles() {
     }
 
     if (need_to_push_dense_) {
-      for (size_t i = 0;
-           i < param_.program_config(0).push_dense_table_id_size(); ++i) {
+      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
+           ++i) {
         uint64_t tid = static_cast<uint64_t>(
             param_.program_config(0).push_dense_table_id(i));
         pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);
diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt
index 7d363d1afdc8ac72741e6e4fea02fb96fe9347fa..12fc454fd262cdcf30f64757a6199c6a9331e1a2 100644
--- a/paddle/fluid/framework/fleet/CMakeLists.txt
+++ b/paddle/fluid/framework/fleet/CMakeLists.txt
@@ -3,3 +3,5 @@ if(WITH_PSLIB)
 else()
     cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_PSLIB)
+
+cc_library(nccl_wrapper SRCS nccl_wrapper.cc DEPS framework_proto variable_helper scope)
diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc
index 8147c7746192a91bb82c2aa754c5664def4c142f..394ff24c466622956b18b3012c146f6f9ddd838e 100644
--- a/paddle/fluid/framework/fleet/fleet_wrapper.cc
+++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc
@@ -237,6 +237,7 @@ void FleetWrapper::PushDenseParamSync(
   std::vector<paddle::ps::Region> regions;
   for (auto& t : var_names) {
     Variable* var = scope.FindVar(t);
+    CHECK(var != nullptr) << "var[" << t << "] not found";
     LoDTensor* tensor = var->GetMutable<LoDTensor>();
     float* g = tensor->mutable_data<float>(place);
     paddle::ps::Region reg(g, tensor->numel());
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.cc b/paddle/fluid/framework/fleet/nccl_wrapper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..051f4b013c6eeb55f733910c14cae178f2f8b416
--- /dev/null
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/fleet/nccl_wrapper.h"
+#include <utility>
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+
+std::shared_ptr<NCCLWrapper> NCCLWrapper::s_instance_ = NULL;
+bool NCCLWrapper::is_initialized_ = false;
+
+void NCCLWrapper::InitNCCL() {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE(platform::dynload::ncclCommInitRank(
+      &(nccl_info_.comm_), nccl_info_.global_ranks_, nccl_info_.nccl_id_,
+      nccl_info_.my_global_rank_));
+#endif
+  return;
+}
+
+void NCCLWrapper::SetNCCLId(const NCCLInfo& nccl_info) {
+#ifdef PADDLE_WITH_CUDA
+  nccl_info_.nccl_id_ = nccl_info.nccl_id_;
+#endif
+  return;
+}
+
+NCCLInfo NCCLWrapper::GetNCCLId() {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE(platform::dynload::ncclGetUniqueId(&(nccl_info_.nccl_id_)));
+#endif
+  return nccl_info_;
+}
+
+void NCCLWrapper::SetRankInfo(const int local_rank, const int global_rank,
+                              const int ranks) {
+#ifdef PADDLE_WITH_CUDA
+  nccl_info_.local_rank_ = local_rank;
+  nccl_info_.my_global_rank_ = global_rank;
+  nccl_info_.global_ranks_ = ranks;
+  PADDLE_ENFORCE(cudaSetDevice(local_rank));
+  PADDLE_ENFORCE(cudaStreamCreate(&(nccl_info_.stream_)));
+#endif
+  return;
+}
+
+void NCCLWrapper::SyncVar(const int root_rank, const Scope& scope,
+                          const std::vector<std::string>& var_names) {
+#ifdef PADDLE_WITH_CUDA
+  for (auto& name : var_names) {
+    auto var = scope.FindVar(name);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int32_t total_size = tensor->numel();
+    PADDLE_ENFORCE(platform::dynload::ncclBcast(
+        reinterpret_cast<void*>(tensor->data<float>()), total_size, ncclFloat,
+        root_rank, nccl_info_.comm_, nccl_info_.stream_));
+    cudaStreamSynchronize(nccl_info_.stream_);
+  }
+#endif
+  return;
+}
+
+}  // end namespace framework
+}  // end namespace paddle
diff --git a/paddle/fluid/framework/fleet/nccl_wrapper.h b/paddle/fluid/framework/fleet/nccl_wrapper.h
new file mode 100644
index 0000000000000000000000000000000000000000..f29aa225419f4f62c302c8a93c8a9b89c29d15f5
--- /dev/null
+++ b/paddle/fluid/framework/fleet/nccl_wrapper.h
@@ -0,0 +1,83 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <atomic>
+#include <ctime>
+#include <map>
+#include <memory>
+#include <random>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/framework/variable_helper.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/dynload/nccl.h"
+#endif
+#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
+
+namespace paddle {
+namespace framework {
+
+class NCCLInfo {
+ public:
+  NCCLInfo() {}
+  virtual ~NCCLInfo() {}
+
+ public:
+  int local_rank_;
+  int global_ranks_;
+  int my_global_rank_;
+#ifdef PADDLE_WITH_CUDA
+  ncclUniqueId nccl_id_;
+  ncclComm_t comm_;
+  cudaStream_t stream_;
+#endif
+};
+
+class NCCLWrapper {
+ public:
+  virtual ~NCCLWrapper() {}
+  NCCLWrapper() {}
+
+  void InitNCCL();
+  void SetNCCLId(const NCCLInfo& nccl_info);
+  NCCLInfo GetNCCLId();
+  void SetRankInfo(const int local_rank, const int global_rank,
+                   const int ranks);
+  void SyncVar(const int root_rank, const Scope& scope,
+               const std::vector<std::string>& var_names);
+
+  static std::shared_ptr<NCCLWrapper> GetInstance() {
+    if (NULL == s_instance_) {
+      s_instance_.reset(new paddle::framework::NCCLWrapper());
+    }
+    return s_instance_;
+  }
+
+ public:
+  NCCLInfo nccl_info_;
+
+ private:
+  static std::shared_ptr<NCCLWrapper> s_instance_;
+
+ protected:
+  static bool is_initialized_;
+  DISABLE_COPY_AND_ASSIGN(NCCLWrapper);
+};
+
+}  // end namespace framework
+}  // end namespace paddle
diff --git a/paddle/fluid/framework/io/shell.cc b/paddle/fluid/framework/io/shell.cc
index bcfa4f44ff1c6561cbbd60b76f75de1c8461a88a..ab671cb5690df51c1cff141906c40cc9e74584fa 100644
--- a/paddle/fluid/framework/io/shell.cc
+++ b/paddle/fluid/framework/io/shell.cc
@@ -126,7 +126,7 @@ static int shell_popen_fork_internal(const char* real_cmd, bool do_read,
   }
 
   close_open_fds_internal();
-  if (execl("/bin/sh", "sh", "-c", real_cmd, NULL) < 0) {
+  if (execl("/bin/bash", "bash", "-c", real_cmd, NULL) < 0) {
     return -1;
   }
   exit(127);
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index ba1d7379c56d953a0f37d03deed6c47e46cbf129..16fc1721eb6f5d2517ad45289f2415ef41749df2 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -68,6 +68,7 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
 pass_library(sync_batch_norm_pass base)
 pass_library(runtime_context_cache_pass base)
+pass_library(expected_kernel_cache_pass base)
 pass_library(quant_conv2d_dequant_fuse_pass inference)
 pass_library(fillconstant_elementwisemul_fuse inference)
 
diff --git a/paddle/fluid/framework/ir/expected_kernel_cache_pass.cc b/paddle/fluid/framework/ir/expected_kernel_cache_pass.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4a99d4c1a9c0f0bd973097d281e380341fe88515
--- /dev/null
+++ b/paddle/fluid/framework/ir/expected_kernel_cache_pass.cc
@@ -0,0 +1,37 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/ir/expected_kernel_cache_pass.h"
+#include <memory>
+#include "paddle/fluid/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void ExpectedKernelCachePass::ApplyImpl(ir::Graph* graph) const {
+  VLOG(3) << "Applies Expected Kernel Cache strategy.";
+  for (const Node* n : graph->Nodes()) {
+    if (n->IsOp() && n->Op()) {
+      n->Op()->SetAttr(kEnableCacheExpectedKernel, true);
+    }
+  }
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(expected_kernel_cache_pass,
+              paddle::framework::ir::ExpectedKernelCachePass);
diff --git a/paddle/fluid/framework/ir/expected_kernel_cache_pass.h b/paddle/fluid/framework/ir/expected_kernel_cache_pass.h
new file mode 100644
index 0000000000000000000000000000000000000000..bf0907d3feb7bccd163363da65505e0af3fb9bf6
--- /dev/null
+++ b/paddle/fluid/framework/ir/expected_kernel_cache_pass.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class ExpectedKernelCachePass : public Pass {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc
index 28a37f331c100695f0ffec7288db84f4493d68a0..12ce99c8788625e2aae6e07abdea565bb2c2ebb9 100644
--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -31,10 +31,10 @@ namespace paddle {
 namespace framework {
 namespace ir {
 namespace {
-void SortHelper(
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
-    ir::Node *node, std::unordered_set<ir::Node *> *visited,
-    std::vector<ir::Node *> *ret) {
+void SortHelper(const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>,
+                               ir::NodeComp> &adj_list,
+                ir::Node *node, std::unordered_set<ir::Node *> *visited,
+                std::vector<ir::Node *> *ret) {
   visited->insert(node);
 
   for (auto adj : adj_list.at(node)) {
@@ -50,7 +50,8 @@ void SortHelper(
 
 bool HasCircleHelper(
     ir::Node *node,
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+        &adj_list,
     std::unordered_set<ir::Node *> *visited,
     std::unordered_set<ir::Node *> *in_trace,
     std::vector<std::vector<ir::Node *>> *circles) {
@@ -84,7 +85,8 @@ bool HasCircleHelper(
 }
 
 bool HasCircleInternal(
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+        &adj_list,
     std::vector<std::vector<ir::Node *>> *circles) {
   std::unordered_set<ir::Node *> visited;
   std::unordered_set<ir::Node *> in_trace;
@@ -107,8 +109,8 @@ bool FindCircleSubGraph(const Graph &graph,
 }
 
 std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
-  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
-      BuildOperationAdjList(graph);
+  std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+      adj_list = BuildOperationAdjList(graph);
   PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr));
   std::unordered_set<ir::Node *> visited;
   std::vector<ir::Node *> ret;
@@ -117,34 +119,30 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
       SortHelper(adj_list, adj.first, &visited, &ret);
     }
   }
+
   return ret;
 }
 
 // Build operator inlink edge table.
-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
-    const Graph &graph) {
-  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
+std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+BuildOperationAdjList(const Graph &graph) {
+  std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+      adj_list;
 
   for (auto &n : graph.Nodes()) {
     if (!n->IsOp()) continue;
     if (adj_list.find(n) == adj_list.end()) {
-      adj_list[n] = std::unordered_set<ir::Node *>();
+      adj_list[n] = std::set<ir::Node *, ir::NodeComp>();
     }
-    std::vector<ir::Node *> nodes;
     for (auto &var : n->inputs) {
       for (auto &adj_n : var->inputs) {
         PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
         VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
                 << " -> " << n->Name() << reinterpret_cast<void *>(n)
                 << "  via " << var->Name() << reinterpret_cast<void *>(var);
-        nodes.push_back(adj_n);
+        adj_list[n].insert(adj_n);
       }
     }
-    std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) {
-      return node1->id() > node2->id();
-    });
-    adj_list[n].insert(std::make_move_iterator(nodes.begin()),
-                       std::make_move_iterator(nodes.end()));
   }
   return adj_list;
 }
diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h
index 214de9ec7d85aee6021b18866295777e317aa79d..849a9c3be6904f3f9c3669d8fc9d750154863031 100644
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <map>
 #include <memory>
+#include <set>
 #include <vector>
 
 #include "paddle/fluid/framework/ir/graph.h"
@@ -25,6 +26,13 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+// Compare nodes via node id.
+struct NodeComp {
+  bool operator()(ir::Node *const &node1, ir::Node *const &node2) const {
+    return node1->id() < node2->id();
+  }
+};
+
 // Test if the graph contains circle.
 bool HasCircle(const Graph &graph);
 
@@ -57,8 +65,8 @@ std::vector<Node *> TopologyVarientSort(const Graph &graph, SortKind sort_kind);
 void CleanIndividualNodes(Graph *graph);
 
 // Build an adjacency list of operations for the `graph`.
-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
-    const Graph &graph);
+std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+BuildOperationAdjList(const Graph &graph);
 
 template <typename T>
 std::vector<T *> FilterByNodeWrapper(const Graph &graph) {
diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
index c7cf9b0dc342bbfaa80b622d7dcd0f6348f78d42..566b654f237cbd71e1983c971374ee13d7b36805 100644
--- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
+++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc
@@ -23,7 +23,7 @@ namespace ir {
 void RuntimeContextCachePass::ApplyImpl(ir::Graph* graph) const {
   VLOG(3) << "Applies Runtime Context Cache strategy.";
   for (const Node* n : graph->Nodes()) {
-    if (n->IsOp()) {
+    if (n->IsOp() && n->Op()) {
       n->Op()->SetAttr(kEnableCacheRuntimeContext, true);
     }
   }
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index e6f5b15af8cd440a9304235acfe62787c5f1b134..1ea93b7638a85e67bcc85a0c0e130d636938d6c5 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -241,6 +241,7 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
   outputs_ = outputs;
   attrs_ = attrs;
   need_update_ = true;
+  block_ = nullptr;
 }
 
 OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 168f287a455c644695b6eaff426ce31ded8d38a5..1723a9a78a0da6e3eac7f823f79fe802a916e5b3 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -880,7 +880,16 @@ std::vector<KernelConfig>* OperatorWithKernel::GetKernelConfig(
 
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                  const platform::Place& place) const {
-  if (!HasAttr(kEnableCacheRuntimeContext)) {
+  // To reduce the elapsed time of HasAttr, we use bool variable to record the
+  // result of HasAttr.
+  if (!enable_cache_runtime_context && HasAttr(kEnableCacheRuntimeContext))
+    enable_cache_runtime_context = true;
+  if (!enable_cache_expected_kernel && HasAttr(kEnableCacheExpectedKernel))
+    enable_cache_expected_kernel = true;
+  if (!all_kernels_must_compute_runtime_shape &&
+      HasAttr(kAllKernelsMustComputeRuntimeShape))
+    all_kernels_must_compute_runtime_shape = true;
+  if (!enable_cache_runtime_context) {
     RuntimeContext ctx(Inputs(), Outputs(), scope);
     RunImpl(scope, place, &ctx);
   } else {
@@ -899,60 +908,33 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   auto* dev_ctx = pool.Get(place);
 
-  // check if op[type] has kernel registered.
-  auto& all_op_kernels = AllOpKernels();
-  auto kernels_iter = all_op_kernels.find(type_);
-  if (kernels_iter == all_op_kernels.end()) {
-    PADDLE_THROW(
-        "There are no kernels which are registered in the %s operator.", type_);
+  if (!enable_cache_expected_kernel || !kernel_type_) {
+    ChooseKernel(*runtime_ctx, scope, place);
   }
 
-  OpKernelMap& kernels = kernels_iter->second;
-
-  auto expected_kernel_key = this->GetExpectedKernelType(
-      ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx, nullptr));
-  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
-
-  auto kernel_iter = kernels.find(expected_kernel_key);
-#ifdef PADDLE_WITH_MKLDNN
-  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
-  if (kernel_iter == kernels.end() &&
-      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
-    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
-    expected_kernel_key.library_type_ = LibraryType::kPlain;
-    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
-    kernel_iter = kernels.find(expected_kernel_key);
-  }
-#endif
-  if (kernel_iter == kernels.end()) {
-    PADDLE_THROW("op %s does not have kernel for %s", type_,
-                 KernelTypeToString(expected_kernel_key));
-  }
-
-  std::vector<KernelConfig>* kernel_configs =
-      GetKernelConfig(expected_kernel_key);
+  std::vector<KernelConfig>* kernel_configs = GetKernelConfig(*kernel_type_);
 
   // do data transformScope &transfer_scope;
   std::vector<std::string> transfered_inplace_vars;
-  auto* transfer_scope = PrepareData(scope, expected_kernel_key,
-                                     &transfered_inplace_vars, runtime_ctx);
+  auto* transfer_scope =
+      PrepareData(scope, *kernel_type_, &transfered_inplace_vars, runtime_ctx);
 
   // exec scope is the scope that kernel actually executed on.
   const Scope& exec_scope =
       (transfer_scope == nullptr ? scope : *transfer_scope);
 
-  if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
-    dev_ctx = pool.Get(expected_kernel_key.place_);
+  if (!(kernel_type_->place_ == dev_ctx->GetPlace())) {
+    dev_ctx = pool.Get(kernel_type_->place_);
   }
 
-  if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) {
+  if (!all_kernels_must_compute_runtime_shape) {
     RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx);
     this->InferShape(&infer_shape_ctx);
   }
   // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext
   // not Scope. Imperative mode only pass inputs and get outputs.
-  kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx,
-                                       *runtime_ctx, kernel_configs));
+  (*kernel_func_)(ExecutionContext(*this, exec_scope, *dev_ctx, *runtime_ctx,
+                                   kernel_configs));
 
   if (!transfered_inplace_vars.empty()) {
     // there is inplace variable has been transfered.
@@ -978,6 +960,46 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   }
 }
 
+void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
+                                      const Scope& scope,
+                                      const platform::Place& place) const {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(place);
+
+  // check if op[type] has kernel registered.
+  auto& all_op_kernels = AllOpKernels();
+  auto kernels_iter = all_op_kernels.find(type_);
+  if (kernels_iter == all_op_kernels.end()) {
+    PADDLE_THROW(
+        "There are no kernels which are registered in the %s operator.", type_);
+  }
+
+  OpKernelMap& kernels = kernels_iter->second;
+
+  auto expected_kernel_key = this->GetExpectedKernelType(
+      ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr));
+  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+  auto kernel_iter = kernels.find(expected_kernel_key);
+#ifdef PADDLE_WITH_MKLDNN
+  // workaround for missing MKLDNN kernel when FLAGS_use_mkldnn env var is set
+  if (kernel_iter == kernels.end() &&
+      expected_kernel_key.library_type_ == LibraryType::kMKLDNN) {
+    VLOG(3) << "missing MKLDNN kernel: fallbacking to PLAIN one";
+    expected_kernel_key.library_type_ = LibraryType::kPlain;
+    expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
+    kernel_iter = kernels.find(expected_kernel_key);
+  }
+#endif
+  if (kernel_iter == kernels.end()) {
+    PADDLE_THROW("op %s does not have kernel for %s", type_,
+                 KernelTypeToString(expected_kernel_key));
+  }
+
+  kernel_type_.reset(new OpKernelType(expected_kernel_key));
+  kernel_func_.reset(new OpKernelFunc(kernel_iter->second));
+}
+
 void OperatorWithKernel::TransferInplaceVarsBack(
     const Scope& scope, const std::vector<std::string>& inplace_vars,
     const Scope& transfer_scope) const {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index a02e53dcf764368601646a900833ac650c5bb31a..489b66099658d522fe1f1adaad763b66bdd22c91 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -70,6 +70,12 @@ constexpr char kNewGradSuffix[] = "@NEWGRAD@";
 /// this Op's execution to save the elapsed time.
 constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@";
 
+/// If an Op has attribtue kEnableCacheExpectedKernel, it means that in a same
+/// name scope and same place, since the expected kerenl of this Op does not
+/// change in the execution, it could be recorded only at the first iteration of
+/// this Op's execution to save the elapsed time.
+constexpr char kEnableCacheExpectedKernel[] = "@ENABLE_CACHE_EXPECTED_KERNEL@";
+
 /// If an Op has this attribute, all its kernels should calculate output
 /// variable's shape in the corresponding Compute() function. And
 /// OperatorWithKernel::RunImpl() would skip call this Op's InferShape()
@@ -491,10 +497,18 @@ class OperatorWithKernel : public OperatorBase {
                                const std::vector<std::string>& inplace_vars,
                                const Scope& exec_scope) const;
 
+  void ChooseKernel(const RuntimeContext& ctx, const Scope& scope,
+                    const platform::Place& place) const;
+
  protected:
   mutable OpKernelConfigsMap kernel_configs_map_;
+  mutable std::unique_ptr<OpKernelType> kernel_type_;
+  mutable std::unique_ptr<OpKernelFunc> kernel_func_;
   mutable std::unique_ptr<RuntimeContext> runtime_ctx_;
   mutable const Scope* pre_scope_ = nullptr;
+  mutable bool enable_cache_runtime_context = false;
+  mutable bool enable_cache_expected_kernel = false;
+  mutable bool all_kernels_must_compute_runtime_shape = false;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 4245caf1689c76d72b410c742488c55562c8b998..c4bf2b7e8c017b22f917c9f9bd40e75b8cde08b2 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -221,7 +221,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     PADDLE_ENFORCE(!member_->use_cuda_,
                    "gpu mode does not support async_mode_ now!");
     graphs.push_back(graph);
-    for (int i = 1; i < places.size(); ++i) {
+    for (size_t i = 1; i < places.size(); ++i) {
       auto *tmp_graph = new ir::Graph(graph->OriginProgram());
       async_graphs_.emplace_back(tmp_graph);
       graphs.push_back(tmp_graph);
@@ -315,7 +315,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name,
                                  {member_->local_scopes_[0]}, 1,
                                  member_->use_cuda_, member_->nccl_ctxs_.get());
-    for (int i = 1; i < member_->places_.size(); ++i) {
+    for (size_t i = 1; i < member_->places_.size(); ++i) {
       graphs[i] =
           build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name,
                                {member_->local_scopes_[i]}, 1,
diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto
index 389c1a870fb54ad28806ad49632323b1c93676f4..4fc05ccf5c9be37e80b4ae7263166ad76eb6d6a7 100644
--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -76,7 +76,7 @@ message PullDenseWorkerParameter {
 
 message TableParameter {
   // dense table only
-  optional int64 table_id = 1;
+  optional uint64 table_id = 1;
   repeated string dense_value_name = 2;
   repeated string dense_grad_name = 3;
   repeated int32 push_dense_wait_times = 5;
diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h
index 2e9c64d3e6854bf70c0aee06128b9f1b7c8c7439..66e6ac81623a1cd1c79981c1e4a97d974e9c2426 100644
--- a/paddle/fluid/framework/var_type_inference.h
+++ b/paddle/fluid/framework/var_type_inference.h
@@ -45,12 +45,16 @@ class InferVarTypeContext {
 
   virtual bool HasInput(const std::string& name) const {
     PADDLE_ENFORCE_NOT_NULL(op_);
-    return op_->Inputs().count(name) > 0;
+    auto& inputs = op_->Inputs();
+    auto input = inputs.find(name);
+    return input != inputs.end() && !input->second.empty();
   }
 
   virtual bool HasOutput(const std::string& name) const {
     PADDLE_ENFORCE_NOT_NULL(op_);
-    return op_->Outputs().count(name) > 0;
+    auto& outputs = op_->Outputs();
+    auto output = outputs.find(name);
+    return output != outputs.end() && !output->second.empty();
   }
 
   virtual const std::vector<std::string>& Input(const std::string& name) const {
diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h
index 72c548d5e92dec3ec2638904f508c2777ee327c6..316095b4fb8ed8a574bb25847fa29bd45d9b50f0 100644
--- a/paddle/fluid/imperative/layer.h
+++ b/paddle/fluid/imperative/layer.h
@@ -464,7 +464,11 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext
 
   void SetType(const std::string& name,
                framework::proto::VarType::Type type) override {
-    var_set_[name]->SetType(type);
+    if (name == "kLookupTablePath") {
+      VLOG(2) << "SUPER UGLY FIX, remove this when move imperative mode in C++";
+    } else {
+      var_set_[name]->SetType(type);
+    }
   }
 
   framework::proto::VarType::Type GetDataType(
diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc
index 04364e68342810f6babee1df0c5eb3b476de022a..f96c83936df590e5bd3abe89b7e7c2a6ddf92d01 100644
--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
@@ -33,8 +33,7 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep,
   // creating socket fd
   if ((server_fd = socket(AF_INET, SOCK_STREAM, 0)) == 0)
     PADDLE_THROW("create server fd failed");
-  if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt,
-                 sizeof(opt)))
+  if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)))
     PADDLE_THROW("set socket opt failed");
 
   address.sin_family = AF_INET;
diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h
index 51c86190439d1739a99bc91a712f663383bb9371..b4f44e56405a51082e60afd69fb6f011dab44b86 100644
--- a/paddle/fluid/imperative/nccl_context.h
+++ b/paddle/fluid/imperative/nccl_context.h
@@ -14,7 +14,7 @@
 #pragma once
 
 // network header files
-#ifndef _WIN32
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include <arpa/inet.h>
 #include <netinet/in.h>
 #include <stdlib.h>
@@ -26,7 +26,7 @@
 
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/platform/device_context.h"
-#ifdef PADDLE_WITH_CUDA
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index fb433ff2a2bd113358152248120d0d2be94bd927..5e0be5d445eae9d6d857ab0d6c5816807b4af523 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -49,11 +49,6 @@ set(SHARED_INFERENCE_SRCS
     ${mkldnn_quantizer_src}
     ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc)
 
-# FIXME(gongwb): hidden libdgc.a
-if(WITH_GPU AND NOT WIN32)
-    set(fluid_modules ${fluid_modules} dgc)
-endif()
-
 if(WIN32)
   sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array
               analysis_config ${mkldnn_quantizer_cfg} paddle_pass_builder)
diff --git a/paddle/fluid/inference/anakin/CMakeLists.txt b/paddle/fluid/inference/anakin/CMakeLists.txt
index e8fb56590563f49f920bfe71d160ec822cb3ca30..9ffe70471c42ca95953ebce10c9246c777d7b2a2 100644
--- a/paddle/fluid/inference/anakin/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/CMakeLists.txt
@@ -1,5 +1,5 @@
-cc_library(anakin_engine SRCS engine.cc DEPS framework_proto)
-cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto)
+cc_library(anakin_engine SRCS engine.cc DEPS framework_proto boost)
+cc_library(anakin_op_teller SRCS op_teller.cc DEPS framework_proto boost)
 target_link_libraries(anakin_engine anakin anakin_saber_common)
 cc_test(test_anakin_engine SRCS test_anakin_engine.cc DEPS anakin_engine)
 add_subdirectory(convert)
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 0109b4a4fa7617880f642f5a39639bca38475515..b54ea269ff250f02b6331807237e10ee65b0b0b4 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -231,6 +231,7 @@ void AnalysisConfig::Update() {
       pass_builder()->InsertPass(3, "tensorrt_subgraph_pass");
     }
     pass_builder()->DeletePass("runtime_context_cache_pass");
+    pass_builder()->DeletePass("expected_kernel_cache_pass");
   }
 
   if (use_mkldnn_) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 6942604b0723f8665f0e8b058d48a5356a1a01f4..fcab1ab186127e40701da9420426b4ef27c7f95d 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -259,6 +259,9 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
       return false;
     }
 
+    PADDLE_ENFORCE_NOT_NULL(input_ptr);
+    PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
+
     if (platform::is_cpu_place(place_)) {
       // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
       std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
@@ -829,6 +832,45 @@ std::string AnalysisPredictor::GetSerializedProgram() const {
   return inference_program_->Proto()->SerializeAsString();
 }
 
+// Add SaveOptimModel
+void AnalysisPredictor::SaveOptimModel(const std::string &dir) {
+  // save model
+  std::string model_name = dir + "/model";
+  std::ofstream outfile;
+  outfile.open(model_name, std::ios::out | std::ios::binary);
+  std::string inference_prog_desc = GetSerializedProgram();
+  outfile << inference_prog_desc;
+  // save params
+  framework::ProgramDesc save_program;
+  auto *save_block = save_program.MutableBlock(0);
+
+  const framework::ProgramDesc &main_program = program();
+  const framework::BlockDesc &global_block = main_program.Block(0);
+  std::vector<std::string> save_var_list;
+  for (framework::VarDesc *var : global_block.AllVars()) {
+    if (IsPersistable(var)) {
+      framework::VarDesc *new_var = save_block->Var(var->Name());
+      new_var->SetShape(var->GetShape());
+      new_var->SetDataType(var->GetDataType());
+      new_var->SetType(var->GetType());
+      new_var->SetLoDLevel(var->GetLoDLevel());
+      new_var->SetPersistable(true);
+
+      save_var_list.push_back(new_var->Name());
+    }
+  }
+  std::sort(save_var_list.begin(), save_var_list.end());
+  auto *op = save_block->AppendOp();
+  op->SetType("save_combine");
+  op->SetInput("X", save_var_list);
+  op->SetAttr("file_path", dir + "/params");
+  op->CheckAttrs();
+
+  platform::CPUPlace place;
+  framework::Executor exe(place);
+  exe.Run(save_program, scope(), 0, true, true);
+}
+
 template <>
 std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<AnalysisConfig>(
     const AnalysisConfig &config) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index e4c537f426650f16ced32d3cb61b944a78c35b43..b5e134ced70f8bf9ef0267bee08ec9836aeb5338 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -86,6 +86,10 @@ class AnalysisPredictor : public PaddlePredictor {
 
   bool MkldnnQuantize();
 
+  // save program to  model
+  // save parameters to params
+  void SaveOptimModel(const std::string &dir);
+
  protected:
   // For memory optimization.
   bool need_collect_var_shapes_for_memory_optim();
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index 0429a287c74f9db5257181151d90b77da86c694c..6bc892638c28ca0b5bab82936bf9700289bed6b2 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -196,6 +196,9 @@ TEST(AnalysisPredictor, Clone) {
   }
 }
 
+// This function is not released yet, will fail on some machine.
+// TODO(Superjomn) Turn on it latter.
+/*
 TEST(AnalysisPredictor, memory_optim) {
   AnalysisConfig config(FLAGS_dirname);
   config.DisableGpu();
@@ -246,6 +249,7 @@ TEST(AnalysisPredictor, memory_optim) {
 
   inference::CompareResult(output, output1);
 }
+*/
 
 #ifdef PADDLE_WITH_MKLDNN
 class MkldnnQuantizerTest : public testing::Test {
diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc
index 7d57b6ec74468dbdb0519f85140629a0ac01c18d..fc2d7b48c2a1f89232dcb96d1899667230e2ddda 100644
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -54,6 +54,7 @@ PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
     memory_owned_ = other.memory_owned_;
   } else {
     Resize(other.length());
+    PADDLE_ENFORCE(!(other.length() > 0 && other.data() == nullptr));
     memcpy(data_, other.data(), other.length());
     length_ = other.length();
     memory_owned_ = true;
diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc
index 54f40563c3662af24e794422be4d3262d86c76a7..56996c5cff88f5b4a9094291a09996f8b8d70a23 100644
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -169,6 +169,7 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
   std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
   // Hot fix the bug that result diff in multi-thread.
   // TODO(Superjomn) re-implement a real clone here.
+  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<NativePaddlePredictor *>(cls.get()));
   if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
     LOG(ERROR) << "fail to call Init";
     return nullptr;
@@ -210,6 +211,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
       return false;
     }
 
+    PADDLE_ENFORCE_NOT_NULL(input_ptr);
+    PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
     if (platform::is_cpu_place(place_)) {
       // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
       std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
@@ -316,6 +319,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
   }
 
   std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
+  PADDLE_ENFORCE_NOT_NULL(
+      dynamic_cast<NativePaddlePredictor *>(predictor.get()));
   if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
     return nullptr;
   }
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index 1a87ac378a232f153a994c6b11ff37f8f5419a36..9b0873aecb545067180723c363a38bed1552fb2a 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -99,6 +99,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
         "conv_elementwise_add_fuse_pass",       //
 #endif                                          //
         "transpose_flatten_concat_fuse_pass",
+        "expected_kernel_cache_pass",  //
   });
 
   use_gpu_ = true;
@@ -122,8 +123,8 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
       // will enhance this pass later.
       "runtime_context_cache_pass",     //
       "attention_lstm_fuse_pass",       //
-      "seqpool_concat_fuse_pass",       //
       "seqconv_eltadd_relu_fuse_pass",  //
+      // "seqpool_concat_fuse_pass",    //
       // "embedding_fc_lstm_fuse_pass", //
       "fc_lstm_fuse_pass",             //
       "mul_lstm_fuse_pass",            //
@@ -136,6 +137,7 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) {
       "conv_bn_fuse_pass",             //
       "conv_eltwiseadd_bn_fuse_pass",  //
       "is_test_pass",                  //
+      "expected_kernel_cache_pass",    //
   });
 
   use_gpu_ = false;
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
index f4977d08c4d051b8a528e122c47948c3c81d153c..d82b88a77a9898a24090614a6db3439fd1acd74f 100644
--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -1,5 +1,5 @@
-nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context)
-nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto)
+nv_library(tensorrt_engine SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context boost)
+nv_library(tensorrt_op_teller SRCS op_teller.cc DEPS framework_proto boost)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
 add_subdirectory(plugin)
diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
index 8ecb0310c9775393631b99681e13cbea7a5b781e..a4641140e06b9c6efb13cdb58eb2a0ee810c71c6 100644
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -30,6 +30,7 @@ function(inference_analysis_api_int8_test target model_dir data_dir filename)
              --infer_data=${data_dir}/data.bin
              --warmup_batch_size=100
              --batch_size=50
+             --paddle_num_threads=${CPU_NUM_THREADS_ON_CI}
 	     --iterations=2)
 endfunction()
 
@@ -120,7 +121,8 @@ set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer")
 download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz")
 inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc 
   EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
-  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 SERIAL)
+  ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 
+       --paddle_num_threads=${CPU_NUM_THREADS_ON_CI} SERIAL)
 
 # ocr
 set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr")
diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
index e10d239a5d1b30e089a110c6155520e3b035860a..c9da5b3ea5581e415f11c8f85e1d6aea757531ab 100644
--- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc
@@ -170,6 +170,15 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->SwitchIrOptim(true);
 }
 
+void SetOptimConfig(AnalysisConfig *cfg) {
+  std::string optimModelPath =
+      FLAGS_infer_model.substr(0, FLAGS_infer_model.find_last_of("/")) +
+      "/saved_optim_model";
+  cfg->SetModel(optimModelPath + "/model", optimModelPath + "/params");
+  cfg->SwitchIrOptim(true);
+  cfg->SwitchSpecifyInputNames();
+}
+
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
   DataRecord data(FLAGS_infer_data, FLAGS_batch_size);
   std::vector<PaddleTensor> input_slots;
@@ -315,5 +324,44 @@ TEST(Analyzer_dam, compare_determine) {
                        input_slots_all);
 }
 
+// Save optim model
+TEST(Analyzer_dam, save_optim_model) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::string optimModelPath =
+      FLAGS_infer_model.substr(0, FLAGS_infer_model.find_last_of("/")) +
+      "/saved_optim_model";
+  mkdir(optimModelPath.c_str(), 0777);
+  auto predictor = CreateTestPredictor(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+      FLAGS_use_analysis);
+  (static_cast<AnalysisPredictor *>(predictor.get()))
+      ->SaveOptimModel(optimModelPath);
+}
+
+void CompareOptimAndOrig(const PaddlePredictor::Config *orig_config,
+                         const PaddlePredictor::Config *optim_config,
+                         const std::vector<std::vector<PaddleTensor>> &inputs) {
+  PrintConfig(orig_config, true);
+  PrintConfig(optim_config, true);
+  std::vector<std::vector<PaddleTensor>> orig_outputs, optim_outputs;
+  TestOneThreadPrediction(orig_config, inputs, &orig_outputs, false);
+  TestOneThreadPrediction(optim_config, inputs, &optim_outputs, false);
+  CompareResult(orig_outputs.back(), optim_outputs.back());
+}
+
+TEST(Analyzer_dam, compare_optim_orig) {
+  AnalysisConfig orig_cfg;
+  AnalysisConfig optim_cfg;
+  SetConfig(&orig_cfg);
+  SetOptimConfig(&optim_cfg);
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareOptimAndOrig(
+      reinterpret_cast<const PaddlePredictor::Config *>(&orig_cfg),
+      reinterpret_cast<const PaddlePredictor::Config *>(&optim_cfg),
+      input_slots_all);
+}
+
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index d4330e6cddf8818ace01be2f13a4c18a192c46e1..588c80aa607c8d79365bbdfbb42a3d3c7667dbb2 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -32,6 +32,17 @@ void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
   SetFakeImageInput(inputs, FLAGS_infer_model);
 }
 
+void SetOptimConfig(AnalysisConfig *cfg) {
+  std::string optimModelPath =
+      FLAGS_infer_model.substr(0, FLAGS_infer_model.find_last_of("/")) +
+      "/saved_optim_model";
+  cfg->SetModel(optimModelPath + "/model", optimModelPath + "/params");
+  cfg->DisableGpu();
+  cfg->SwitchIrOptim();
+  cfg->SwitchSpecifyInputNames();
+  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
+}
+
 // Easy for profiling independently.
 void profile(bool use_mkldnn = false) {
   AnalysisConfig cfg;
@@ -87,13 +98,51 @@ TEST(Analyzer_resnet50, compare_mkldnn) { compare(true /* use_mkldnn */); }
 TEST(Analyzer_resnet50, compare_determine) {
   AnalysisConfig cfg;
   SetConfig(&cfg);
-
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInput(&input_slots_all);
   CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                        input_slots_all);
 }
 
+// Save optim model
+TEST(Analyzer_resnet50, save_optim_model) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  std::string optimModelPath =
+      FLAGS_infer_model.substr(0, FLAGS_infer_model.find_last_of("/")) +
+      "/saved_optim_model";
+  mkdir(optimModelPath.c_str(), 0777);
+  auto predictor = CreateTestPredictor(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
+      FLAGS_use_analysis);
+  (static_cast<AnalysisPredictor *>(predictor.get()))
+      ->SaveOptimModel(optimModelPath);
+}
+
+void CompareOptimAndOrig(const PaddlePredictor::Config *orig_config,
+                         const PaddlePredictor::Config *optim_config,
+                         const std::vector<std::vector<PaddleTensor>> &inputs) {
+  PrintConfig(orig_config, true);
+  PrintConfig(optim_config, true);
+  std::vector<std::vector<PaddleTensor>> orig_outputs, optim_outputs;
+  TestOneThreadPrediction(orig_config, inputs, &orig_outputs, false);
+  TestOneThreadPrediction(optim_config, inputs, &optim_outputs, false);
+  CompareResult(orig_outputs.back(), optim_outputs.back());
+}
+
+TEST(Analyzer_resnet50, compare_optim_orig) {
+  AnalysisConfig orig_cfg;
+  AnalysisConfig optim_cfg;
+  SetConfig(&orig_cfg);
+  SetOptimConfig(&optim_cfg);
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareOptimAndOrig(
+      reinterpret_cast<const PaddlePredictor::Config *>(&orig_cfg),
+      reinterpret_cast<const PaddlePredictor::Config *>(&optim_cfg),
+      input_slots_all);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
index 9f23b9f037bcaeb758312d011067ae29c82e73cd..5ee848c3cfa2117b2adeab5e563c5d07ce1d76ca 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -47,6 +47,7 @@ struct DataRecord {
       num_lines++;
       std::vector<std::string> data;
       split(line, '\t', &data);
+      PADDLE_ENFORCE(data.size() >= 4);
       // load title1 data
       std::vector<int64_t> title1_data;
       split_to_int64(data[0], ' ', &title1_data);
diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
index d6f7f468a6c83bd6c4ac087931d0c6b7cac3cc1c..3cebf8e96984fad0de8d8c6775990f7c6a6cabe5 100644
--- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_tester.cc
@@ -150,6 +150,9 @@ void SetConfig(AnalysisConfig *cfg, bool use_mkldnn = false) {
   if (use_mkldnn) {
     cfg->EnableMKLDNN();
   }
+  // Enable seqpool_concat_fuse_pass, disabled by default since it takes much
+  // time
+  cfg->pass_builder()->InsertPass(2, "seqpool_concat_fuse_pass");
 }
 
 void profile(bool use_mkldnn = false) {
diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
index bd4f1b61973fb0de06dcc288e329c94756d5ed47..a23297f29cf65d891f530850ffd184aa58e10886 100644
--- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
@@ -214,28 +214,23 @@ TEST(Analyzer_Transformer, fuse_statis) {
 }
 
 // Compare result of NativeConfig and AnalysisConfig
-// void compare(bool use_mkldnn = false) {
-//   AnalysisConfig cfg;
-//   SetConfig(&cfg);
-//   if (use_mkldnn) {
-//     cfg.EnableMKLDNN();
-//   }
-//
-//   std::vector<std::vector<PaddleTensor>> input_slots_all;
-//   SetInput(&input_slots_all);
-//   CompareNativeAndAnalysis(
-//       reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-//       input_slots_all);
-// }
-
-// TODO(yihuaxu):
-//    Disable compare and compare_mkldnn temporary, see
-//    https://github.com/paddlePaddle/Paddle/issues/16316 for details.
-// TEST(Analyzer_Transformer, compare) { compare(); }
-// #ifdef PADDLE_WITH_MKLDNN
-// TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */);
-// }
-// #endif
+void compare(bool use_mkldnn = false) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
+}
+
+TEST(Analyzer_Transformer, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif
 
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h
index b0c23fbd534847c8aad244749761e9c072148796..b952b62f13ed6c1b6bd0b90bdc5898e9b8ef6f20 100644
--- a/paddle/fluid/inference/tests/api/config_printer.h
+++ b/paddle/fluid/inference/tests/api/config_printer.h
@@ -51,8 +51,6 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) {
      << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n";
   os << GenSpaces(num_spaces)
      << "specify_input_name: " << config.specify_input_name << "\n";
-  os << GenSpaces(num_spaces)
-     << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n";
   num_spaces--;
   os << GenSpaces(num_spaces) << "}\n";
   return os;
@@ -72,8 +70,8 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) {
   }
   os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
      << "\n";
-  os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim()
-     << "\n";
+  os << GenSpaces(num_spaces)
+     << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n";
   os << GenSpaces(num_spaces)
      << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n";
   os << GenSpaces(num_spaces)
diff --git a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
index 842865933f2b4741aea034b19952d4c59344ba06..826c45311f478fb30fff173578427b875a1260bb 100644
--- a/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
+++ b/paddle/fluid/inference/tests/api/full_ILSVRC2012_val_preprocess.py
@@ -19,10 +19,11 @@ import sys
 import random
 import functools
 import contextlib
-from PIL import Image, ImageEnhance
+from PIL import Image
 import math
-from paddle.dataset.common import download, md5file
+from paddle.dataset.common import download
 import tarfile
+import StringIO
 
 random.seed(0)
 np.random.seed(0)
@@ -32,9 +33,11 @@ SIZE_FLOAT32 = 4
 SIZE_INT64 = 8
 FULL_SIZE_BYTES = 30106000008
 FULL_IMAGES = 50000
-DATA_DIR_NAME = 'ILSVRC2012'
-IMG_DIR_NAME = 'var'
-TARGET_HASH = '8dc592db6dcc8d521e4d5ba9da5ca7d2'
+TARGET_HASH = '22d2e0008dca693916d9595a5ea3ded8'
+FOLDER_NAME = "ILSVRC2012/"
+VALLIST_TAR_NAME = "ILSVRC2012/val_list.txt"
+CHUNK_SIZE = 8192
+
 img_mean = np.array([0.485, 0.456, 0.406]).reshape((3, 1, 1))
 img_std = np.array([0.229, 0.224, 0.225]).reshape((3, 1, 1))
 
@@ -62,8 +65,7 @@ def crop_image(img, target_size, center):
     return img
 
 
-def process_image(img_path, mode, color_jitter, rotate):
-    img = Image.open(img_path)
+def process_image(img):
     img = resize_short(img, target_size=256)
     img = crop_image(img, target_size=DATA_DIM, center=True)
     if img.mode != 'RGB':
@@ -99,26 +101,11 @@ def download_concat(cache_folder, zip_path):
                     outfile.write(infile.read())
 
 
-def extract(zip_path, extract_folder):
-    data_dir = os.path.join(extract_folder, DATA_DIR_NAME)
-    img_dir = os.path.join(data_dir, IMG_DIR_NAME)
-    print("Extracting...\n")
-
-    if not (os.path.exists(img_dir) and
-            len(os.listdir(img_dir)) == FULL_IMAGES):
-        tar = tarfile.open(zip_path)
-        tar.extractall(path=extract_folder)
-        tar.close()
-    print('Extracted. Full Imagenet Validation dataset is located at {0}\n'.
-          format(data_dir))
-
-
-def print_processbar(done, total):
-    done_filled = done * '='
-    empty_filled = (total - done) * ' '
-    percentage_done = done * 100 / total
+def print_processbar(done_percentage):
+    done_filled = done_percentage * '='
+    empty_filled = (100 - done_percentage) * ' '
     sys.stdout.write("\r[%s%s]%d%%" %
-                     (done_filled, empty_filled, percentage_done))
+                     (done_filled, empty_filled, done_percentage))
     sys.stdout.flush()
 
 
@@ -126,15 +113,13 @@ def check_integrity(filename, target_hash):
     print('\nThe binary file exists. Checking file integrity...\n')
     md = hashlib.md5()
     count = 0
-    total_parts = 50
-    chunk_size = 8192
-    onepart = FULL_SIZE_BYTES / chunk_size / total_parts
+    onepart = FULL_SIZE_BYTES / CHUNK_SIZE / 100
     with open(filename) as ifs:
         while True:
-            buf = ifs.read(8192)
+            buf = ifs.read(CHUNK_SIZE)
             if count % onepart == 0:
                 done = count / onepart
-                print_processbar(done, total_parts)
+                print_processbar(done)
             count = count + 1
             if not buf:
                 break
@@ -146,54 +131,61 @@ def check_integrity(filename, target_hash):
         return False
 
 
-def convert(file_list, data_dir, output_file):
+def convert(tar_file, output_file):
     print('Converting 50000 images to binary file ...\n')
-    with open(file_list) as flist:
-        lines = [line.strip() for line in flist]
-        num_images = len(lines)
-        with open(output_file, "w+b") as ofs:
-            #save num_images(int64_t) to file
-            ofs.seek(0)
-            num = np.array(int(num_images)).astype('int64')
-            ofs.write(num.tobytes())
-            per_parts = 1000
-            full_parts = FULL_IMAGES / per_parts
-            print_processbar(0, full_parts)
-            for idx, line in enumerate(lines):
-                img_path, label = line.split()
-                img_path = os.path.join(data_dir, img_path)
-                if not os.path.exists(img_path):
-                    continue
-
-                #save image(float32) to file
-                img = process_image(
-                    img_path, 'val', color_jitter=False, rotate=False)
-                np_img = np.array(img)
-                ofs.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3 *
-                         idx)
-                ofs.write(np_img.astype('float32').tobytes())
-                ofs.flush()
-
-                #save label(int64_t) to file
-                label_int = (int)(label)
-                np_label = np.array(label_int)
-                ofs.seek(SIZE_INT64 + SIZE_FLOAT32 * DATA_DIM * DATA_DIM * 3 *
-                         num_images + idx * SIZE_INT64)
-                ofs.write(np_label.astype('int64').tobytes())
-                ofs.flush()
-                if (idx + 1) % per_parts == 0:
-                    done = (idx + 1) / per_parts
-                    print_processbar(done, full_parts)
+    tar = tarfile.open(name=tar_file, mode='r:gz')
+
+    print_processbar(0)
+
+    dataset = {}
+    for tarInfo in tar:
+        if tarInfo.isfile() and tarInfo.name != VALLIST_TAR_NAME:
+            dataset[tarInfo.name] = tar.extractfile(tarInfo).read()
+
+    with open(output_file, "w+b") as ofs:
+        ofs.seek(0)
+        num = np.array(int(FULL_IMAGES)).astype('int64')
+        ofs.write(num.tobytes())
+
+        per_percentage = FULL_IMAGES / 100
+
+        idx = 0
+        for imagedata in dataset.values():
+            img = Image.open(StringIO.StringIO(imagedata))
+            img = process_image(img)
+            np_img = np.array(img)
+            ofs.write(np_img.astype('float32').tobytes())
+            if idx % per_percentage == 0:
+                print_processbar(idx / per_percentage)
+            idx = idx + 1
+
+        val_info = tar.getmember(VALLIST_TAR_NAME)
+        val_list = tar.extractfile(val_info).read()
+
+        lines = val_list.split('\n')
+        val_dict = {}
+        for line_idx, line in enumerate(lines):
+            if line_idx == FULL_IMAGES:
+                break
+            name, label = line.split()
+            val_dict[name] = label
+
+        for img_name in dataset.keys():
+            remove_len = (len(FOLDER_NAME))
+            img_name_prim = img_name[remove_len:]
+            label = val_dict[img_name_prim]
+            label_int = (int)(label)
+            np_label = np.array(label_int)
+            ofs.write(np_label.astype('int64').tobytes())
+        print_processbar(100)
+    tar.close()
     print("Conversion finished.")
 
 
 def run_convert():
     print('Start to download and convert 50000 images to binary file...')
     cache_folder = os.path.expanduser('~/.cache/paddle/dataset/int8/download')
-    extract_folder = os.path.join(cache_folder, 'full_data')
-    data_dir = os.path.join(extract_folder, DATA_DIR_NAME)
-    file_list = os.path.join(data_dir, 'val_list.txt')
-    zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz')
+    zip_path = os.path.join(cache_folder, 'full_imagenet_val.tar.gz.partaa')
     output_file = os.path.join(cache_folder, 'int8_full_val.bin')
     retry = 0
     try_limit = 3
@@ -213,8 +205,7 @@ def run_convert():
                 "Can not convert the dataset to binary file with try limit {0}".
                 format(try_limit))
         download_concat(cache_folder, zip_path)
-        extract(zip_path, extract_folder)
-        convert(file_list, data_dir, output_file)
+        convert(zip_path, output_file)
     print("\nSuccess! The binary file can be found at {0}".format(output_file))
 
 
diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h
index 5cc54ed299c50b48c83de2742b715b16cf1f8cd0..d13469a8482304d04b99c96e70bac5c8b90e4043 100644
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -55,6 +55,9 @@ DEFINE_bool(record_benchmark, false,
 DEFINE_double(accuracy, 1e-3, "Result Accuracy.");
 DEFINE_double(quantized_accuracy, 1e-2, "Result Quantized Accuracy.");
 DEFINE_bool(zero_copy, false, "Use ZeroCopy to speedup Feed/Fetch.");
+DEFINE_bool(warmup, false,
+            "Use warmup to calculate elapsed_time more accurately. "
+            "To reduce CI time, it sets false in default.");
 
 DECLARE_bool(profile);
 DECLARE_int32(paddle_num_threads);
@@ -367,7 +370,9 @@ void TestOneThreadPrediction(
     const std::vector<std::vector<PaddleTensor>> &inputs,
     std::vector<std::vector<PaddleTensor>> *outputs, bool use_analysis = true) {
   auto predictor = CreateTestPredictor(config, use_analysis);
-  PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
+  if (FLAGS_warmup) {
+    PredictionWarmUp(predictor.get(), inputs, outputs, 1, 0);
+  }
   PredictionRun(predictor.get(), inputs, outputs, 1, 0);
 }
 
@@ -395,7 +400,10 @@ void TestMultiThreadPrediction(
             ->SetMkldnnThreadID(static_cast<int>(tid) + 1);
       }
 #endif
-      PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads, tid);
+      if (FLAGS_warmup) {
+        PredictionWarmUp(predictor.get(), inputs, &outputs_tid, num_threads,
+                         tid);
+      }
       PredictionRun(predictor.get(), inputs, &outputs_tid, num_threads, tid);
     });
   }
diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc
index 98ce225a0476b38c021b0b81489f69d7953ae456..ec10e36c3b3707a88eebe116aaf3de454fc199b5 100644
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -116,7 +116,7 @@ void compare_continuous_input(std::string model_dir, bool use_tensorrt) {
       reinterpret_cast<const PaddlePredictor::Config*>(&analysis_config);
   auto native_pred = CreateTestPredictor(config, false);
   auto analysis_pred = CreateTestPredictor(config, true);
-  for (int i = 0; i < 100; i++) {
+  for (int i = 0; i < 20; i++) {
     std::vector<std::vector<PaddleTensor>> inputs_all;
     if (!FLAGS_prog_filename.empty() && !FLAGS_param_filename.empty()) {
       SetFakeImageInput(&inputs_all, model_dir, true, FLAGS_prog_filename,
@@ -133,11 +133,13 @@ void compare_continuous_input(std::string model_dir, bool use_tensorrt) {
 TEST(TensorRT_mobilenet, compare) {
   std::string model_dir = FLAGS_infer_model + "/mobilenet";
   compare(model_dir, /* use_tensorrt */ true);
+  // Open it when need.
+  // profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
 }
 
-TEST(TensorRT_resnet50, compare) {
+TEST(resnet50, compare_continuous_input) {
   std::string model_dir = FLAGS_infer_model + "/resnet50";
-  compare(model_dir, /* use_tensorrt */ true);
+  compare_continuous_input(model_dir, true);
 }
 
 TEST(TensorRT_resnext50, compare) {
@@ -145,24 +147,6 @@ TEST(TensorRT_resnext50, compare) {
   compare(model_dir, /* use_tensorrt */ true);
 }
 
-TEST(TensorRT_resnext50, profile) {
-  std::string model_dir = FLAGS_infer_model + "/resnext50";
-  // Set FLAGS_record_benchmark to true to record benchmark to file.
-  // FLAGS_record_benchmark=true;
-  FLAGS_model_name = "resnext50";
-  profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
-}
-
-TEST(resnext50, compare_analysis_native) {
-  std::string model_dir = FLAGS_infer_model + "/resnext50";
-  compare(model_dir, false /*use tensorrt*/);
-}
-
-TEST(TensorRT_mobilenet, analysis) {
-  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
-  compare(model_dir, false /* use_tensorrt */);
-}
-
 TEST(AnalysisPredictor, use_gpu) {
   std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
   AnalysisConfig config;
@@ -180,20 +164,5 @@ TEST(AnalysisPredictor, use_gpu) {
   }
 }
 
-TEST(TensorRT_mobilenet, profile) {
-  std::string model_dir = FLAGS_infer_model + "/" + "mobilenet";
-  profile(model_dir, true, false);
-}
-
-TEST(resnet50, compare_continuous_input) {
-  std::string model_dir = FLAGS_infer_model + "/resnet50";
-  compare_continuous_input(model_dir, true);
-}
-
-TEST(resnet50, compare_continuous_input_native) {
-  std::string model_dir = FLAGS_infer_model + "/resnet50";
-  compare_continuous_input(model_dir, false);
-}
-
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake
index fc6de70f5a89331cb8940b34c1c9ff5a164c2894..c93c9ef2f2337124da349517ad13b27acb10b2c1 100644
--- a/paddle/fluid/inference/tests/test.cmake
+++ b/paddle/fluid/inference/tests/test.cmake
@@ -2,6 +2,7 @@ include(ExternalProject)
 set(INFERENCE_URL "http://paddle-inference-dist.bj.bcebos.com" CACHE STRING "inference download url")
 set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
     "A path setting inference demo download directories.")
+set(CPU_NUM_THREADS_ON_CI 4 CACHE STRING "Run multi-threads on CI to reduce CI time.")
 
 function(inference_download INSTALL_DIR URL FILENAME)
   message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec
index f0e3d3e86f24cb68f6e9d41f48c9698b43bca13e..403be1fc2c97a189a541c0c887eaadfe4266a124 100644
--- a/paddle/fluid/op_use_default_grad_op_maker.spec
+++ b/paddle/fluid/op_use_default_grad_op_maker.spec
@@ -1,14 +1,7 @@
-abs
-acos
-asin
-atan
 attention_lstm
-brelu
 conv_shift
-cos
 cos_sim
 dequantize
-elu
 fc
 flatten
 fsp
@@ -21,17 +14,10 @@ fusion_seqconv_eltadd_relu
 fusion_seqexpand_concat_fc
 fusion_seqpool_concat
 fusion_squared_mat_sub
-gelu
 gru
-hard_shrink
 hierarchical_sigmoid
-leaky_relu
-log
-logsigmoid
-lookup_table
 lrn
 lstm_unit
-lstmp
 max_pool2d_with_index
 max_pool3d_with_index
 maxout
@@ -39,7 +25,6 @@ modified_huber_loss
 nce
 pool2d
 pool3d
-pow
 prelu
 quantize
 rank_loss
@@ -51,26 +36,10 @@ reduce_sum
 requantize
 reshape
 rnn_memory_helper
-round
 sequence_softmax
-sin
-softplus
-softshrink
-softsign
-space_to_depth
 spp
-square
-squared_l2_distance
-squared_l2_norm
 squeeze
-stanh
-swish
-tanh_shrink
-teacher_student_sigmoid_loss
 tensor_array_to_tensor
-thresholded_relu
 transpose
-tree_conv
 unpool
 unsqueeze
-warpctc
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index e52e83673fe1c9ad2426e45f233c5e62f5c2f06e..6e8d6f459c51170c0f29542154aa3b1c0fd894f1 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -72,7 +72,7 @@ endif()
 
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
-if (WITH_GPU AND NOT WIN32)
+if (WITH_DGC)
     op_library(dgc_op DEPS dgc)
     file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(dgc);\n")
     set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dgc)
diff --git a/paddle/fluid/operators/activation_cudnn_op.cu.cc b/paddle/fluid/operators/activation_cudnn_op.cu.cc
index a382414d5c473a9c36f92a9af56837da819e96a4..f03355eb441f99b54d78fe90bcb3bea116db58f1 100644
--- a/paddle/fluid/operators/activation_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/activation_cudnn_op.cu.cc
@@ -12,6 +12,9 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/platform/cudnn_desc.h"
@@ -82,6 +85,8 @@ template <typename T>
 struct CudnnReluGradFunctor : public CudnnActivationGradFunctor<T> {
   explicit CudnnReluGradFunctor(const CUDADeviceContext& ctx)
       : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_RELU) {}
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 template <typename T>
@@ -94,6 +99,8 @@ struct CudnnRelu6GradFunctor : public CudnnActivationGradFunctor<T> {
   explicit CudnnRelu6GradFunctor(const CUDADeviceContext& ctx)
       : CudnnActivationGradFunctor<T>(ctx, 6.0, CUDNN_ACTIVATION_CLIPPED_RELU) {
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 template <typename T>
@@ -105,6 +112,8 @@ template <typename T>
 struct CudnnSigmoidGradFunctor : public CudnnActivationGradFunctor<T> {
   explicit CudnnSigmoidGradFunctor(const CUDADeviceContext& ctx)
       : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_SIGMOID) {}
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 template <typename T>
@@ -116,6 +125,8 @@ template <typename T>
 struct CudnnTanhGradFunctor : public CudnnActivationGradFunctor<T> {
   explicit CudnnTanhGradFunctor(const CUDADeviceContext& ctx)
       : CudnnActivationGradFunctor<T>(ctx, 0.0, CUDNN_ACTIVATION_TANH) {}
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 template <typename Functor>
@@ -140,10 +151,13 @@ class CudnnActivationGradKernel
  public:
   using T = typename Functor::ELEMENT_TYPE;
   void Compute(const framework::ExecutionContext& context) const override {
+    static_assert(Functor::FwdDeps() == kDepOut, "Forward deps must be Out.");
+
     const framework::Tensor *X, *Out, *dOut;
     X = Out = dOut = nullptr;
     framework::Tensor* dX = nullptr;
-    ExtractActivationGradTensor(context, &X, &Out, &dOut, &dX);
+    ExtractActivationGradTensor<Functor::FwdDeps()>(context, &X, &Out, &dOut,
+                                                    &dX);
     dX->mutable_data<T>(context.GetPlace());
     auto& dev_ctx = context.template device_context<CUDADeviceContext>();
     Functor functor(dev_ctx);
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index c87e4b22b37027efd1293e74f72598283946e62d..348902c656cec1ea1eeaccc90feefd56d307111d 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -15,7 +15,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/activation_op.h"
 #include <memory>
 #include <string>
+#include <type_traits>
 #include <unordered_map>
+#include <vector>
 #include "paddle/fluid/operators/mkldnn/mkldnn_activation_op.h"
 #include "paddle/fluid/platform/port.h"
 #ifdef PADDLE_WITH_CUDA
@@ -27,6 +29,25 @@ namespace operators {
 
 using paddle::framework::Tensor;
 
+template <typename GradFunctor>
+static constexpr bool CanInplaceAct() {
+  return GradFunctor::FwdDeps() == kDepOut || GradFunctor::FwdDeps() == kNoDeps;
+}
+
+std::unique_ptr<std::unordered_set<std::string>> GetInplaceOpSet() {
+  std::unique_ptr<std::unordered_set<std::string>> ret(
+      new std::unordered_set<std::string>());
+#define INSERT_INTO_INPLACE_OP_SET(op_type, __omitted, fwd_functor, \
+                                   bwd_functor)                     \
+  if (CanInplaceAct<bwd_functor<float>>()) {                        \
+    ret->insert(#op_type);                                          \
+  }
+
+  FOR_EACH_ACTIVATION_OP(INSERT_INTO_INPLACE_OP_SET);
+#undef INSERT_INTO_INPLACE_OP_SET
+  return ret;
+}
+
 #define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT)                    \
   class OP_NAME##OpMaker                                                     \
       : public ::paddle::framework::OpProtoAndCheckerMaker {                 \
@@ -50,26 +71,32 @@ using paddle::framework::Tensor;
     }                                                                        \
   }
 
-#define REGISTER_ACTIVATION_OP_GRAD_MAKER(OP_NAME, KERNEL_TYPE)              \
-  class OP_NAME##GradMaker                                                   \
-      : public ::paddle::framework::SingleGradOpDescMaker {                  \
-   public:                                                                   \
-    using ::paddle::framework::SingleGradOpDescMaker::SingleGradOpDescMaker; \
-                                                                             \
-   protected:                                                                \
-    std::unique_ptr<::paddle::framework::OpDesc> Apply() const override {    \
-      auto* op = new ::paddle::framework::OpDesc();                          \
-      op->SetType(#KERNEL_TYPE "_grad");                                     \
-      op->SetInput("Out", Output("Out"));                                    \
-      op->SetInput(::paddle::framework::GradVarName("Out"),                  \
-                   OutputGrad("Out"));                                       \
-                                                                             \
-      op->SetAttrMap(Attrs());                                               \
-                                                                             \
-      op->SetOutput(::paddle::framework::GradVarName("X"), InputGrad("X"));  \
-      return std::unique_ptr<::paddle::framework::OpDesc>(op);               \
-    }                                                                        \
+template <ActBwdOpFwdDeps kDepValue>
+class ActivationGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType(ForwardOpType() + "_grad");
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+
+    if (static_cast<int>(kDepValue) &
+        static_cast<int>(ActBwdOpFwdDeps::kDepX)) {
+      op->SetInput("X", Input("X"));
+    }
+
+    if (static_cast<int>(kDepValue) &
+        static_cast<int>(ActBwdOpFwdDeps::kDepOut)) {
+      op->SetInput("Out", Output("Out"));
+    }
+
+    return op;
   }
+};
 
 framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx,
                                       const framework::OperatorWithKernel& oper,
@@ -129,14 +156,15 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->ShareDim("Out", framework::GradVarName("X"));
-    ctx->ShareLoD("Out", framework::GradVarName("X"));
+    auto out_grad_name = framework::GradVarName("Out");
+    ctx->ShareDim(out_grad_name, framework::GradVarName("X"));
+    ctx->ShareLoD(out_grad_name, framework::GradVarName("X"));
   }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    return GetKernelType(ctx, *this, "Out");
+    return GetKernelType(ctx, *this, framework::GradVarName("Out"));
   }
 };
 
@@ -199,6 +227,15 @@ $out = \sqrt{x}$
 
 )DOC";
 
+UNUSED constexpr char RsqrtDoc[] = R"DOC(
+Rsqrt Activation Operator.
+
+Please make sure input is legal in case of numeric errors.
+
+$out = \frac{1}{\sqrt{x}}$
+
+)DOC";
+
 UNUSED constexpr char AbsDoc[] = R"DOC(
 Abs Activation Operator.
 
@@ -547,6 +584,7 @@ REGISTER_ACTIVATION_OP_MAKER(Gelu, GeluDoc);
 REGISTER_ACTIVATION_OP_MAKER(Tanh, TanhDoc);
 REGISTER_ACTIVATION_OP_MAKER(TanhShrink, TanhShrinkDoc);
 REGISTER_ACTIVATION_OP_MAKER(Sqrt, SqrtDoc);
+REGISTER_ACTIVATION_OP_MAKER(Rsqrt, RsqrtDoc);
 REGISTER_ACTIVATION_OP_MAKER(Abs, AbsDoc);
 REGISTER_ACTIVATION_OP_MAKER(Ceil, CeilDoc);
 REGISTER_ACTIVATION_OP_MAKER(Floor, FloorDoc);
@@ -559,78 +597,27 @@ REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc);
 REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc);
 REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
 
-REGISTER_ACTIVATION_OP_GRAD_MAKER(Sigmoid, sigmoid);
-REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu, relu);
-REGISTER_ACTIVATION_OP_GRAD_MAKER(Gelu, gelu);
-REGISTER_ACTIVATION_OP_GRAD_MAKER(Exp, exp);
-REGISTER_ACTIVATION_OP_GRAD_MAKER(Tanh, tanh);
-REGISTER_ACTIVATION_OP_GRAD_MAKER(Ceil, ceil);
-REGISTER_ACTIVATION_OP_GRAD_MAKER(Floor, floor);
-REGISTER_ACTIVATION_OP_GRAD_MAKER(Sqrt, sqrt);
-REGISTER_ACTIVATION_OP_GRAD_MAKER(SoftRelu, soft_relu);
-REGISTER_ACTIVATION_OP_GRAD_MAKER(Relu6, relu6);
-REGISTER_ACTIVATION_OP_GRAD_MAKER(Reciprocal, reciprocal);
-REGISTER_ACTIVATION_OP_GRAD_MAKER(HardSigmoid, hard_sigmoid);
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 
-#define FOR_EACH_INPLACE_OP_FUNCTOR(__macro) \
-  __macro(Sigmoid, sigmoid);                 \
-  __macro(Relu, relu);                       \
-  __macro(Exp, exp);                         \
-  __macro(Tanh, tanh);                       \
-  __macro(Ceil, ceil);                       \
-  __macro(Floor, floor);                     \
-  __macro(Sqrt, sqrt);                       \
-  __macro(SoftRelu, soft_relu);              \
-  __macro(Relu6, relu6);                     \
-  __macro(Reciprocal, reciprocal);           \
-  __macro(HardSigmoid, hard_sigmoid);
-
-#define FOR_EACH_OP_FUNCTOR(__macro) \
-  __macro(LogSigmoid, logsigmoid);   \
-  __macro(SoftShrink, softshrink);   \
-  __macro(Abs, abs);                 \
-  __macro(Cos, cos);                 \
-  __macro(Acos, acos);               \
-  __macro(Sin, sin);                 \
-  __macro(Asin, asin);               \
-  __macro(Atan, atan);               \
-  __macro(Round, round);             \
-  __macro(Log, log);                 \
-  __macro(Square, square);           \
-  __macro(Gelu, gelu);               \
-  __macro(BRelu, brelu);             \
-  __macro(Pow, pow);                 \
-  __macro(STanh, stanh);             \
-  __macro(Softplus, softplus);       \
-  __macro(Softsign, softsign);       \
-  __macro(LeakyRelu, leaky_relu);    \
-  __macro(TanhShrink, tanh_shrink);  \
-  __macro(ELU, elu);                 \
-  __macro(HardShrink, hard_shrink);  \
-  __macro(Swish, swish);             \
-  __macro(ThresholdedRelu, thresholded_relu);
-
-#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE)                   \
-  REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp,            \
-                    ::paddle::operators::OP_NAME##OpMaker,                     \
-                    ::paddle::operators::ActivationOpInferVarType,             \
-                    ::paddle::operators::OP_NAME##GradMaker,                   \
-                    ::paddle::framework::SingleOpInplaceInToOut);              \
-  REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad, \
-                    ::paddle::framework::SingleOpInplaceInToOut)
-
-#define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE)                    \
-  REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp,     \
-                    ::paddle::operators::OP_NAME##OpMaker,              \
-                    ::paddle::operators::ActivationOpInferVarType,      \
-                    ::paddle::framework::DefaultGradOpDescMaker<true>); \
-  REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
-
-#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)   \
+#define REGISTER_ACTIVATION_OP(KERNEL_TYPE, OP_NAME, functor, grad_functor) \
+  REGISTER_OPERATOR(                                                        \
+      KERNEL_TYPE, ops::ActivationOp, ops::OP_NAME##OpMaker,                \
+      ops::ActivationOpInferVarType,                                        \
+      ops::ActivationGradOpDescMaker<ops::grad_functor<float>::FwdDeps()>,  \
+      std::conditional<ops::CanInplaceAct<ops::grad_functor<float>>(),      \
+                       ::paddle::framework::SingleOpInplaceInToOut,         \
+                       void>::type);                                        \
+  REGISTER_OPERATOR(                                                        \
+      KERNEL_TYPE##_grad, ops::ActivationOpGrad,                            \
+      std::conditional<ops::CanInplaceAct<ops::grad_functor<float>>(),      \
+                       ::paddle::framework::SingleOpInplaceInToOut,         \
+                       void>::type)
+
+#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, op_name, functor,        \
+                                       grad_functor)                      \
   REGISTER_OP_CPU_KERNEL(                                                 \
       act_type, ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
                                       ops::functor<float>>,               \
@@ -643,6 +630,5 @@ namespace ops = paddle::operators;
       ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
                                 ops::grad_functor<double>>);
 
-FOR_EACH_OP_FUNCTOR(REGISTER_ACTIVATION_OP);
-FOR_EACH_INPLACE_OP_FUNCTOR(REGISTER_INPLACE_ACTIVATION_OP);
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
+FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
+FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CPU_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu
index d3a7ceed466a9b5e4d773f1531d198adff97eac2..9c7a8d8971cba4090db1bbc32c7eabf2285e7eff 100644
--- a/paddle/fluid/operators/activation_op.cu
+++ b/paddle/fluid/operators/activation_op.cu
@@ -15,7 +15,8 @@ limitations under the License. */
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor)    \
+#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor,         \
+                                        grad_functor)                       \
   REGISTER_OP_CUDA_KERNEL(                                                  \
       act_type,                                                             \
       ops::ActivationKernel<plat::CUDADeviceContext, ops::functor<float>>,  \
@@ -30,4 +31,4 @@ namespace plat = paddle::platform;
       ops::ActivationGradKernel<plat::CUDADeviceContext,                    \
                                 ops::grad_functor<plat::float16>>);
 
-FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
+FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL);
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index ff7e623f6f383ed2a8b8a40b3186d9c439ff1d86..1732f61582f79365d6872e15b9df1ee8f053903c 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -12,6 +12,7 @@ limitations under the License. */
 #pragma once
 #include <glog/logging.h>
 #include <algorithm>
+#include <memory>
 #include <string>
 #include <unordered_set>
 #include <utility>
@@ -35,21 +36,29 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-/* Use ugly global variable, for the using in python layer side
-   Please refer to the layer_helper.py and get the details.
- */
-static std::unordered_set<std::string> InplaceOpSet = {
-    "sigmoid", "exp",        "relu",  "tanh",      "sqrt",        "ceil",
-    "floor",   "reciprocal", "relu6", "soft_relu", "hard_sigmoid"};
+enum ActBwdOpFwdDeps {
+  kNoDeps = 0x00,  // Do not need any forward input/output
+  kDepX = 0x01,    // Only need forward input X
+  kDepOut = 0x02,  // Only need forward output Out
+
+  // Never add kDepXOut, because Out can be always calculated
+  // by forward input X in backward part.
+  // FIXME(zjl): but in MKLDNN abs, X and Out are all needed...
+  // Developers should not rely on this enum value!
+  kDepXOut = 0x03
+};
+
+std::unique_ptr<std::unordered_set<std::string>> GetInplaceOpSet();
 
 static bool IsInplace(const std::string& op) {
-  bool inplace = InplaceOpSet.count(op);
+  static auto InplaceOpSet = GetInplaceOpSet();
+  bool inplace = InplaceOpSet->count(op);
   // for op_grad
   const int kGradSuffixLen = 4;
   if (op.size() > kGradSuffixLen &&
       op.compare(op.size() - kGradSuffixLen - 1, kGradSuffixLen, "grad")) {
     inplace =
-        InplaceOpSet.count(op.substr(0, op.size() - (kGradSuffixLen + 1)));
+        InplaceOpSet->count(op.substr(0, op.size() - (kGradSuffixLen + 1)));
   }
   return inplace;
 }
@@ -85,16 +94,21 @@ inline void ExtractActivationTensor(const framework::ExecutionContext& context,
                  context.op().Output("Out"));
 }
 
+template <ActBwdOpFwdDeps kDepValue>
 inline void ExtractActivationGradTensor(
     const framework::ExecutionContext& context, const framework::Tensor** X,
     const framework::Tensor** Out, const framework::Tensor** dOut,
     framework::Tensor** dX) {
-  auto out_var = context.InputVar("Out");
   auto out_grad_var = context.InputVar(framework::GradVarName("Out"));
   auto x_grad_var = context.OutputVar(framework::GradVarName("X"));
-  PADDLE_ENFORCE(out_var != nullptr,
-                 "Cannot get input Variable Out, variable name = %s",
-                 context.op().Input("Out"));
+  const framework::Variable* out_var = nullptr;
+
+  if (static_cast<int>(kDepValue) & static_cast<int>(kDepOut)) {
+    out_var = context.InputVar("Out");
+    PADDLE_ENFORCE(out_var != nullptr,
+                   "Cannot get input Variable Out, variable name = %s",
+                   context.op().Input("Out"));
+  }
   PADDLE_ENFORCE(out_grad_var != nullptr,
                  "Cannot get input Variable %s, variable name = %s",
                  framework::GradVarName("Out"),
@@ -105,23 +119,36 @@ inline void ExtractActivationGradTensor(
                  context.op().Output(framework::GradVarName("X")));
 
   if (CanBeUsedBySelectedRows.count(context.op().Type())) {
-    *Out = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var);
     *dOut = paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(
         *out_grad_var);
     *dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(
         x_grad_var);
+
+    if (out_var) {
+      *Out =
+          paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var);
+    } else {
+      *Out = *dOut;  // fake out
+    }
+
   } else {
     *Out = context.Input<framework::Tensor>("Out");
     *dOut = context.Input<framework::Tensor>(framework::GradVarName("Out"));
     *dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    if (out_var) {
+      *Out = &(out_var->Get<framework::LoDTensor>());
+    } else {
+      *Out = *dOut;  // fake out
+    }
   }
+
   PADDLE_ENFORCE(*dX != nullptr,
                  "Cannot get output tensor %s, variable name = %s",
                  framework::GradVarName("X"),
                  context.op().Output(framework::GradVarName("X")));
 
-  bool inplace = IsInplace(context.op().Type());
-  if (!inplace) {
+  if (static_cast<int>(kDepValue) & static_cast<int>(kDepX)) {
     auto x_var = context.InputVar("X");
     PADDLE_ENFORCE(x_var != nullptr,
                    "Cannot get input tensor X, variable name = %s",
@@ -172,7 +199,8 @@ class ActivationGradKernel
     const framework::Tensor *X, *Out, *dOut;
     framework::Tensor* dX = nullptr;
     X = Out = dOut = nullptr;
-    ExtractActivationGradTensor(context, &X, &Out, &dOut, &dX);
+    ExtractActivationGradTensor<Functor::FwdDeps()>(context, &X, &Out, &dOut,
+                                                    &dX);
     dX->mutable_data<T>(context.GetPlace());
     auto dout = framework::EigenVector<T>::Flatten(detail::Ref(dOut));
     auto out = framework::EigenVector<T>::Flatten(detail::Ref(Out));
@@ -222,6 +250,8 @@ struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * out * (static_cast<T>(1) - out);
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 // Originally: logsigmoid(x) = -log (1 + exp(-x))
@@ -258,6 +288,8 @@ struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) =
         dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 // exp(x) = e^x
@@ -276,6 +308,8 @@ struct ExpGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * out;
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 // relu(x) = max(x, 0)
@@ -294,6 +328,8 @@ struct ReluGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * (out > static_cast<T>(0)).template cast<T>();
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 // gelu(x) = 0.5 * x *  (1 + erf(x / sqrt(2)))
@@ -338,6 +374,8 @@ struct GeluGradFunctor : BaseActivationFunctor<T> {
                   (-static_cast<T>(0.5) * x.square()).exp();
     dx.device(d) = dout * (first + second);
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
@@ -356,6 +394,8 @@ struct TanhGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * (static_cast<T>(1) - out * out);
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 // tanhshrink(x) = x - tanh(x)
@@ -375,6 +415,8 @@ struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * (x.tanh() * x.tanh());
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 // tanhshrink(x) = x - tanh(x)
@@ -409,6 +451,8 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
     auto temp2 = (x > static_cast<T>(threshold)).template cast<T>().eval();
     dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 // softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
@@ -443,6 +487,8 @@ struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
     auto temp2 = (x < -lambdaT).template cast<T>().eval();
     dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 // sqrt(x) = x^(1/2)
@@ -461,6 +507,28 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = static_cast<T>(0.5) * dout / out;
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
+};
+
+// rsqrt(x) = x^(-1/2)
+template <typename T>
+struct RsqrtFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.rsqrt();
+  }
+};
+
+template <typename T>
+struct RsqrtGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = static_cast<T>(-0.5) * dout * out * out * out;
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 // ceil(x) = ceiling(x)
@@ -479,6 +547,8 @@ struct ZeroGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = static_cast<T>(0) / out;
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; }
 };
 
 // floor(x) = flooring(x)
@@ -522,6 +592,8 @@ struct CosGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = -dout * x.unaryExpr(Sine<T>());
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 // cosine(x) = cos(x)
@@ -541,6 +613,8 @@ struct SinGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * x.unaryExpr(Cosine<T>());
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 // sine(x) = sin(x)
@@ -582,6 +656,8 @@ struct AcosGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) =
         -dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
@@ -614,6 +690,8 @@ struct AsinGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) =
         dout * static_cast<T>(1) / (static_cast<T>(1) - x.square()).sqrt();
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
@@ -645,6 +723,8 @@ struct AtanGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * static_cast<T>(1) / (static_cast<T>(1) + x.square());
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 // round(x) = [x]
@@ -672,6 +752,8 @@ struct AbsGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * x.sign();
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepXOut; }
 };
 
 // reciprocal(x) = 1 / x
@@ -690,6 +772,8 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * static_cast<T>(-1) * out * out;
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 // log(x) = natural logarithm of x
@@ -708,6 +792,8 @@ struct LogGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * (static_cast<T>(1) / x);
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 // square(x) = x^2
@@ -726,6 +812,8 @@ struct SquareGradFunctor : public BaseActivationFunctor<T> {
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * static_cast<T>(2) * x;
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
@@ -760,6 +848,8 @@ struct BReluGradFunctor : public BaseActivationFunctor<T> {
                    ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
                        .template cast<T>();
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 // relu6(x) = min(max(0, x), 6)
@@ -792,6 +882,8 @@ struct Relu6GradFunctor : public BaseActivationFunctor<T> {
         ((out > static_cast<T>(0)) * (out < static_cast<T>(threshold)))
             .template cast<T>();
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 // softplus(x) = log(1 + exp(x))
@@ -821,6 +913,8 @@ struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) =
         dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp()));
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 // softsign(x) = x / (1 + |x|)
@@ -842,6 +936,8 @@ struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) =
         dout * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
@@ -872,6 +968,8 @@ struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
     auto temp = ((out > -tmp) * (out < tmp)).template cast<T>().eval();
     dx.device(d) = dout * (static_cast<T>(1) - (-out).exp()) * temp;
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 template <typename T>
@@ -901,6 +999,8 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
     auto temp2 = (x >= static_cast<T>(0)).template cast<T>().eval();
     dx.device(d) = dout * (temp1 + temp2).template cast<T>();
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
@@ -928,9 +1028,11 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> {
             typename dX>
   void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
     dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>() +
-                   dout * (out + static_cast<T>(alpha)) *
+                   dout * static_cast<T>(alpha) * x.exp() *
                        (x < static_cast<T>(0)).template cast<T>();
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198
@@ -958,6 +1060,8 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
     dx.device(d) = dout * static_cast<T>(factor) *
                    x.pow(static_cast<T>(factor) - static_cast<T>(1));
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
@@ -991,6 +1095,8 @@ struct STanhGradFunctor : public BaseActivationFunctor<T> {
     auto temp = (a * x).tanh() * (a * x).tanh();
     dx.device(d) = dout * a * b * (static_cast<T>(1) - temp);
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
@@ -1020,6 +1126,8 @@ struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
     auto th = static_cast<T>(threshold);
     dx.device(d) = dout * (x > th).template cast<T>();
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 template <typename T>
@@ -1053,6 +1161,8 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
                        .template cast<T>() *
                    static_cast<T>(slope);
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; }
 };
 
 template <typename T>
@@ -1077,49 +1187,55 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
 
   template <typename Device, typename X, typename Out, typename dOut,
             typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+  void operator()(Device d, X x, Out fake_out, dOut dout, dX dx) const {
     auto temp1 = static_cast<T>(1) /
                  (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+    auto out = x * temp1;
     auto temp2 = temp1 * (static_cast<T>(1) - (static_cast<T>(beta) * out));
     dx.device(d) = dout * ((static_cast<T>(beta) * out) + temp2);
   }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
-#define FOR_EACH_KERNEL_FUNCTOR(__macro)                             \
-  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);              \
-  __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);     \
-  __macro(exp, ExpFunctor, ExpGradFunctor);                          \
-  __macro(relu, ReluFunctor, ReluGradFunctor);                       \
-  __macro(gelu, GeluFunctor, GeluGradFunctor);                       \
-  __macro(tanh, TanhFunctor, TanhGradFunctor);                       \
-  __macro(atan, AtanFunctor, AtanGradFunctor);                       \
-  __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
-  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \
-  __macro(abs, AbsFunctor, AbsGradFunctor);                          \
-  __macro(ceil, CeilFunctor, ZeroGradFunctor);                       \
-  __macro(floor, FloorFunctor, ZeroGradFunctor);                     \
-  __macro(cos, CosFunctor, CosGradFunctor);                          \
-  __macro(acos, AcosFunctor, AcosGradFunctor);                       \
-  __macro(sin, SinFunctor, SinGradFunctor);                          \
-  __macro(asin, AsinFunctor, AsinGradFunctor);                       \
-  __macro(round, RoundFunctor, ZeroGradFunctor);                     \
-  __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);     \
-  __macro(log, LogFunctor, LogGradFunctor);                          \
-  __macro(square, SquareFunctor, SquareGradFunctor);                 \
-  __macro(brelu, BReluFunctor, BReluGradFunctor);                    \
-  __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor);          \
-  __macro(pow, PowFunctor, PowGradFunctor);                          \
-  __macro(stanh, STanhFunctor, STanhGradFunctor);                    \
-  __macro(softplus, SoftplusFunctor, SoftplusGradFunctor);           \
-  __macro(softsign, SoftsignFunctor, SoftsignGradFunctor);           \
-  __macro(relu6, Relu6Functor, Relu6GradFunctor);                    \
-  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);       \
-  __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor);    \
-  __macro(elu, ELUFunctor, ELUGradFunctor);                          \
-  __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor);    \
-  __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \
-  __macro(swish, SwishFunctor, SwishGradFunctor);                    \
-  __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor);
+#define FOR_EACH_ACTIVATION_OP(__macro)                                       \
+  __macro(sigmoid, Sigmoid, SigmoidFunctor, SigmoidGradFunctor);              \
+  __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);  \
+  __macro(exp, Exp, ExpFunctor, ExpGradFunctor);                              \
+  __macro(relu, Relu, ReluFunctor, ReluGradFunctor);                          \
+  __macro(gelu, Gelu, GeluFunctor, GeluGradFunctor);                          \
+  __macro(tanh, Tanh, TanhFunctor, TanhGradFunctor);                          \
+  __macro(atan, Atan, AtanFunctor, AtanGradFunctor);                          \
+  __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor);  \
+  __macro(sqrt, Sqrt, SqrtFunctor, SqrtGradFunctor);                          \
+  __macro(rsqrt, Rsqrt, RsqrtFunctor, RsqrtGradFunctor);                      \
+  __macro(abs, Abs, AbsFunctor, AbsGradFunctor);                              \
+  __macro(ceil, Ceil, CeilFunctor, ZeroGradFunctor);                          \
+  __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                       \
+  __macro(cos, Cos, CosFunctor, CosGradFunctor);                              \
+  __macro(acos, Acos, AcosFunctor, AcosGradFunctor);                          \
+  __macro(sin, Sin, SinFunctor, SinGradFunctor);                              \
+  __macro(asin, Asin, AsinFunctor, AsinGradFunctor);                          \
+  __macro(round, Round, RoundFunctor, ZeroGradFunctor);                       \
+  __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);  \
+  __macro(log, Log, LogFunctor, LogGradFunctor);                              \
+  __macro(square, Square, SquareFunctor, SquareGradFunctor);                  \
+  __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor);                      \
+  __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);         \
+  __macro(pow, Pow, PowFunctor, PowGradFunctor);                              \
+  __macro(stanh, STanh, STanhFunctor, STanhGradFunctor);                      \
+  __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);          \
+  __macro(softsign, Softsign, SoftsignFunctor, SoftsignGradFunctor);          \
+  __macro(relu6, Relu6, Relu6Functor, Relu6GradFunctor);                      \
+  __macro(leaky_relu, LeakyRelu, LeakyReluFunctor, LeakyReluGradFunctor);     \
+  __macro(tanh_shrink, TanhShrink, TanhShrinkFunctor, TanhShrinkGradFunctor); \
+  __macro(elu, ELU, ELUFunctor, ELUGradFunctor);                              \
+  __macro(hard_shrink, HardShrink, HardShrinkFunctor, HardShrinkGradFunctor); \
+  __macro(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,                      \
+          HardSigmoidGradFunctor);                                            \
+  __macro(swish, Swish, SwishFunctor, SwishGradFunctor);                      \
+  __macro(thresholded_relu, ThresholdedRelu, ThresholdedReluFunctor,          \
+          ThresholdedReluGradFunctor);
diff --git a/paddle/fluid/operators/affine_channel_op.cc b/paddle/fluid/operators/affine_channel_op.cc
index 268a5b894a95df8e27730879473b457a31e18cd6..27370a3c29a073f3ce6f01fd5aaf28b5ef1ca3a6 100644
--- a/paddle/fluid/operators/affine_channel_op.cc
+++ b/paddle/fluid/operators/affine_channel_op.cc
@@ -79,9 +79,13 @@ class AffineChannelOp : public framework::OperatorWithKernel {
                            : x_dims[x_dims.size() - 1]);
 
     PADDLE_ENFORCE_EQ(scale_dims.size(), 1UL);
-    PADDLE_ENFORCE_EQ(scale_dims[0], C);
     PADDLE_ENFORCE_EQ(b_dims.size(), 1UL);
-    PADDLE_ENFORCE_EQ(b_dims[0], C);
+    if (ctx->IsRuntime() || scale_dims[0] > 0) {
+      PADDLE_ENFORCE_EQ(scale_dims[0], C);
+    }
+    if (ctx->IsRuntime() || b_dims[0] > 0) {
+      PADDLE_ENFORCE_EQ(b_dims[0], C);
+    }
 
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", "Out");
diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h
index 87d23831486e658374d4c011412fdef57be1b994..73df8a38b96c30196a7e39d2cf1e348f2a7722ec 100644
--- a/paddle/fluid/operators/affine_grid_op.h
+++ b/paddle/fluid/operators/affine_grid_op.h
@@ -121,9 +121,11 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
     // TODO(wanghaoshuang): Refine batched matrix multiply
     auto blas = math::GetBlas<DeviceContext, T>(ctx);
     for (int i = 0; i < n; ++i) {
-      Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
+      Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
+          {static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
       Tensor sliced_theta = theta->Slice(i, i + 1).Resize({2, 3});
-      Tensor sliced_out = output->Slice(i, i + 1).Resize({h * w, 2});
+      Tensor sliced_out = output->Slice(i, i + 1).Resize(
+          {static_cast<int64_t>(h) * static_cast<int64_t>(w), 2});
       blas.MatMul(sliced_grid, false, sliced_theta, true, T(1), &sliced_out,
                   T(0));
     }
@@ -161,8 +163,10 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
     // TODO(wanghaoshuang): Refine batched matrix multiply
     auto blas = math::GetBlas<DeviceContext, T>(ctx);
     for (int i = 0; i < n; ++i) {
-      Tensor sliced_grid = grid.Slice(i, i + 1).Resize({h * w, 3});
-      Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize({h * w, 2});
+      Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
+          {static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
+      Tensor sliced_out_grad = output_grad->Slice(i, i + 1).Resize(
+          {static_cast<int64_t>(h) * static_cast<int64_t>(w), 2});
       Tensor sliced_theta_grad = theta_grad->Slice(i, i + 1).Resize({2, 3});
       blas.MatMul(sliced_out_grad, true, sliced_grid, false, T(1),
                   &sliced_theta_grad, T(0));
diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc
index 912ec79910301b67bc520b1aa78d3fa1fd165d1f..aecd3d430231855fa29cf7716eb636cdb28182ce 100644
--- a/paddle/fluid/operators/attention_lstm_op.cc
+++ b/paddle/fluid/operators/attention_lstm_op.cc
@@ -64,12 +64,19 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
 
   auto c_dims = ctx->GetInputDim("C0");
   PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2.");
-  PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D);
+  if (ctx->IsRuntime()) {
+    PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D);
+  }
+
   if (ctx->HasInput("H0")) {
     auto h_dims = ctx->GetInputDim("H0");
-    PADDLE_ENFORCE(h_dims == c_dims,
-                   "The dimension of Input(H0) and Input(C0) "
-                   "should be the same.");
+    PADDLE_ENFORCE_EQ(h_dims.size(), 2UL, "Input(H0)'s rank must be 2.");
+    if (ctx->IsRuntime() ||
+        (framework::product(c_dims) > 0 && framework::product(h_dims) > 0)) {
+      PADDLE_ENFORCE(h_dims == c_dims,
+                     "The dimension of Input(H0) and Input(C0) "
+                     "should be the same.");
+    }
   }
 
   auto atten_w_dims = ctx->GetInputDim("AttentionWeight");
@@ -79,6 +86,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                     "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
   PADDLE_ENFORCE_EQ(atten_w_dims[1], 1,
                     "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
+
   if (ctx->HasInput("AttentionBias")) {
     auto atten_b_dims = ctx->GetInputDim("AttentionBias");
     PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2,
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 494d26f58f23ad1e445bbe8d7f8ce1037e5aa598..0cc3e1f2b8350acb693ad7ba4f4dab270068723b 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -65,11 +65,22 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
       (data_layout == DataLayout::kNCHW ? x_dims[1]
                                         : x_dims[x_dims.size() - 1]);
 
-  PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
-  PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
-  PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
-  PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C);
+  auto scale_dim = ctx->GetInputDim("Scale");
+  auto bias_dim = ctx->GetInputDim("Bias");
 
+  PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL);
+  PADDLE_ENFORCE_EQ(scale_dim.size(), 1UL);
+
+  bool check = true;
+  if ((!ctx->IsRuntime()) && (framework::product(scale_dim) <= 0 ||
+                              framework::product(bias_dim) <= 0)) {
+    check = false;
+  }
+
+  if (check) {
+    PADDLE_ENFORCE_EQ(scale_dim[0], C);
+    PADDLE_ENFORCE_EQ(scale_dim[0], C);
+  }
   ctx->SetOutputDim("Y", x_dims);
   ctx->SetOutputDim("MeanOut", {C});
   ctx->SetOutputDim("VarianceOut", {C});
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc
index bd69f422e5dbd5a5dc95150b10daa302f47ec5ff..f2c30cd7e8c6674866b8dfa482f1bc5195f689c2 100644
--- a/paddle/fluid/operators/bilinear_tensor_product_op.cc
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc
@@ -41,9 +41,11 @@ class BilinearTensorProductOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor.");
     PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL,
                       "The input(Weight) must be a 3D tensor.");
-    PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
-                      "The first dimension(batch_size) of input(X) must be "
-                      "equal to the first dimension of the input(Y).");
+    if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0)) {
+      PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
+                        "The first dimension(batch_size) of input(X) must be "
+                        "equal to the first dimension of the input(Y).");
+    }
     PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1],
                       "The second dimension of input(X) must be equal to "
                       "the second dimension of the input(Weight).");
diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc
index b2dbaecfcfd67cc679d02e22d4e89cfedeeba80c..51c4d878142dcd93a170c9ea4211b9c6ec8e4422 100644
--- a/paddle/fluid/operators/bpr_loss_op.cc
+++ b/paddle/fluid/operators/bpr_loss_op.cc
@@ -32,10 +32,14 @@ class BprLossOp : public framework::OperatorWithKernel {
     int rank = x_dims.size();
     PADDLE_ENFORCE_EQ(rank, label_dims.size(),
                       "Input(X) and Input(Label) shall have the same rank.");
-    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                      framework::slice_ddim(label_dims, 0, rank - 1),
-                      "Input(X) and Input(Label) shall have the same shape "
-                      "except the last dimension.");
+
+    if (ctx->IsRuntime() || (framework::product(x_dims) > 0 &&
+                             framework::product(label_dims) > 0)) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                        framework::slice_ddim(label_dims, 0, rank - 1),
+                        "Input(X) and Input(Label) shall have the same shape "
+                        "except the last dimension.");
+    }
 
     auto y_dims = x_dims;
     y_dims[rank - 1] = 1;
diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc
index 1f71555180361a1522b7a1c8383fe128bc4edcd0..b1a6d66b80efdae3e78d7c3321a6107d2dd607aa 100644
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -49,7 +49,15 @@ class ConcatOp : public framework::OperatorWithKernel {
     for (size_t i = 1; i < n; i++) {
       for (size_t j = 0; j < in_zero_dims_size; j++) {
         if (j == axis) {
-          out_dims[axis] += ins[i][j];
+          if (ctx->IsRuntime()) {
+            out_dims[axis] += ins[i][j];
+          } else {
+            if (ins[i][j] == -1) {
+              out_dims[axis] = -1;
+            } else {
+              out_dims[axis] += ins[i][j];
+            }
+          }
         } else {
           if (ctx->IsRuntime()) {
             // check all shape in run time
diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc
index 2e7f3edd55c3353bacddec3dd4ffaba9e0208136..37a82a8067f84722fc37e2469c739faf25f7540b 100644
--- a/paddle/fluid/operators/controlflow/logical_op.cc
+++ b/paddle/fluid/operators/controlflow/logical_op.cc
@@ -71,8 +71,16 @@ class BinaryLogicalOpInferShape : public framework::InferShapeBase {
                    "Input(Y) of %s operator must not be null", comment.type);
     auto dim_x = context->GetInputDim("X");
     auto dim_y = context->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y),
-                      "The number of elements in X and Y should be same");
+
+    int product_x = framework::product(dim_x);
+    int product_y = framework::product(dim_y);
+    bool check = context->IsRuntime() || (product_x >= 0 && product_y >= 0);
+    if (check) {
+      PADDLE_ENFORCE_EQ(
+          product_x, product_y,
+          "The number of elements in X and Y should be same, %d != %d",
+          product_x, product_y);
+    }
 
     context->SetOutputDim("Out", context->GetInputDim("X"));
     context->ShareLoD("X", "Out");
diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
index 45f18ac9255bdd75d8cbb5e1dd30ebba52260850..2ca5242c5c935e2156bf95689c53b0c29809c235 100644
--- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
+++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc
@@ -81,8 +81,10 @@ class WriteToArrayInferShape : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *context) const override {
     PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index");
-    PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1,
-                      "The number of element of subscript index must be 1");
+    if (context->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1,
+                        "The number of element of subscript index must be 1");
+    }
     if (!context->HasInput("X")) {
       return;
     }
diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc
index 619e12e6ba7c73e46beafadd50770aedfb52c964..e1281602bf0d1bf25a2c4dfa32f495ed724d24eb 100644
--- a/paddle/fluid/operators/conv_op.cc
+++ b/paddle/fluid/operators/conv_op.cc
@@ -68,9 +68,14 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
 
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
   for (size_t i = 0; i < strides.size(); ++i) {
-    output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
-                                          dilations[i], paddings[i],
-                                          strides[i]));
+    if ((!ctx->IsRuntime()) &&
+        (in_dims[i + 2] <= 0 || filter_dims[i + 2] <= 0)) {
+      output_shape.push_back(-1);
+    } else {
+      output_shape.push_back(ConvOutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                            dilations[i], paddings[i],
+                                            strides[i]));
+    }
   }
   ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
   ctx->ShareLoD("Input", "Output");
diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc
index 08506ddd18ed35831702814e70962cb36ec958b1..fa4edb70b48e529102f11a1b0b9cac2110a33966 100644
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
@@ -36,14 +36,17 @@ class ConvShiftOp : public framework::OperatorWithKernel {
     auto y_dims = ctx->GetInputDim("Y");
     PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Input(Y)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
-                      "The 1st dimension of Input(X) and Input(Y) should "
-                      "be equal.");
-    PADDLE_ENFORCE_EQ(y_dims[1] % 2, 1,
-                      "The 2nd dimension of Input(Y) should be odd.");
-    PADDLE_ENFORCE_LE(y_dims[1], x_dims[1],
-                      "The 2nd dimension of Input(Y) should be less than or "
-                      "equal to the 2nd dimension of Input(X).");
+    if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0))
+      PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
+                        "The 1st dimension of Input(X) and Input(Y) should "
+                        "be equal.");
+    if (ctx->IsRuntime() || y_dims[1] > 0)
+      PADDLE_ENFORCE_EQ(y_dims[1] % 2, 1,
+                        "The 2nd dimension of Input(Y) should be odd.");
+    if (ctx->IsRuntime() || (x_dims[1] > 0 && y_dims[1] > 0))
+      PADDLE_ENFORCE_LE(y_dims[1], x_dims[1],
+                        "The 2nd dimension of Input(Y) should be less than or "
+                        "equal to the 2nd dimension of Input(X).");
     ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
   }
diff --git a/paddle/fluid/operators/cos_sim_op.cc b/paddle/fluid/operators/cos_sim_op.cc
index 30ec74d8442d2f42510220b825988b340f79d0a2..93304ec6700b795c923f24a5d0663884b818b9b3 100644
--- a/paddle/fluid/operators/cos_sim_op.cc
+++ b/paddle/fluid/operators/cos_sim_op.cc
@@ -40,17 +40,27 @@ class CosSimOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
 
-    PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
-                      "Ranks of Input(X) and Input(Y) must be equal.");
-    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "Rank of Input(X) must not be less than 2.");
-    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()),
-                      framework::slice_ddim(y_dims, 1, y_dims.size()),
-                      "All dimensions except the 1st of Input(X) and Input(Y) "
-                      "must be equal.");
-    PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1,
-                   "The 1st dimension of Input(Y) must be equal to Input(X) or"
-                   " just 1 (which will be broadcasted to match Input(X)).");
+    bool check = true;
+    if ((!ctx->IsRuntime()) &&
+        (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) {
+      check = false;
+    }
+
+    if (check) {
+      PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
+                        "Ranks of Input(X) and Input(Y) must be equal.");
+      PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                        "Rank of Input(X) must not be less than 2.");
+      PADDLE_ENFORCE_EQ(
+          framework::slice_ddim(x_dims, 1, x_dims.size()),
+          framework::slice_ddim(y_dims, 1, y_dims.size()),
+          "All dimensions except the 1st of Input(X) and Input(Y) "
+          "must be equal.");
+      PADDLE_ENFORCE(
+          x_dims[0] == y_dims[0] || y_dims[0] == 1,
+          "The 1st dimension of Input(Y) must be equal to Input(X) or"
+          " just 1 (which will be broadcasted to match Input(X)).");
+    }
 
     // resize tensor
     ctx->SetOutputDim("Out", {x_dims[0], 1});
diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc
index e053ae57739d3d96209e9ca180cc041f8b55396e..c701e895af00baffe49838d130d451319ae42c46 100644
--- a/paddle/fluid/operators/crf_decoding_op.cc
+++ b/paddle/fluid/operators/crf_decoding_op.cc
@@ -95,20 +95,23 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
         transition_dims[0] - 2, transition_dims[1],
         "An invalid dimension for the Input(Transition), which should "
         "be a 2-D tensor with shape [(D + 2) x D].");
-    PADDLE_ENFORCE_EQ(
-        emission_dims[1], transition_dims[1],
-        "The 2nd dimension of the Input(Emission) and the Input(Transition) "
-        "should be equal to the tag number.");
-
+    if (ctx->IsRuntime() || (emission_dims[1] > 0 && transition_dims[1] > 0)) {
+      PADDLE_ENFORCE_EQ(
+          emission_dims[1], transition_dims[1],
+          "The 2nd dimension of the Input(Emission) and the Input(Transition) "
+          "should be equal to the tag number.");
+    }
     if (ctx->HasInput("Label")) {
       auto label_dims = ctx->GetInputDim("Label");
       PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
                      "The Input(Label) should be a 2-D tensor with the 2nd "
                      "dimensions fixed to 1.");
-      PADDLE_ENFORCE_EQ(
-          emission_dims[0], label_dims[0],
-          "The height of Input(Emission) and the height of Input(Label) "
-          "should be the same.");
+      if (ctx->IsRuntime() || (emission_dims[0] > 0 && label_dims[0] > 0)) {
+        PADDLE_ENFORCE_EQ(
+            emission_dims[0], label_dims[0],
+            "The height of Input(Emission) and the height of Input(Label) "
+            "should be the same.");
+      }
     }
 
     ctx->ShareLoD("Emission", /*->*/ "ViterbiPath");
diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..53ed86ade48ce52d49285495388f93f1bc4f5d9e
--- /dev/null
+++ b/paddle/fluid/operators/cvm_op.cc
@@ -0,0 +1,154 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/cvm_op.h"
+#include <memory>
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class CVMOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("CVM"), "Input(CVM) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto cvm_dims = ctx->GetInputDim("CVM");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(cvm_dims.size(), 2UL, "Input(CVM)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(cvm_dims[1], 2UL,
+                      "The 2nd dimension of "
+                      "Input(CVM) should be 2.");
+
+    if (ctx->Attrs().Get<bool>("use_cvm")) {
+      ctx->SetOutputDim("Y", {x_dims[0], x_dims[1]});
+    } else {
+      ctx->SetOutputDim("Y", {x_dims[0], x_dims[1] - 2});
+    }
+    ctx->ShareLoD("X", /*->*/ "Y");
+  }
+
+ protected:
+  // Explicitly set that the data type of computation kernel of
+  // cvm
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   platform::CPUPlace());
+  }
+};
+
+class CVMGradientOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("CVM"), "Input(CVM) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto cvm_dims = ctx->GetInputDim("CVM");
+    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(cvm_dims.size(), 2, "Input(CVM)'s rank should be 2.");
+
+    PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
+                      "The 1st dimension of Input(X) and Input(Y@Grad) should "
+                      "be equal.");
+
+    PADDLE_ENFORCE_EQ(cvm_dims[1], 2,
+                      "When Attr(soft_label) == false, the 2nd dimension of "
+                      "Input(CVM) should be 2.");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
+  }
+
+ protected:
+  // Explicitly set that the data type of computation kernel of
+  // cvm
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("X")->type(),
+                                   platform::CPUPlace());
+  }
+};
+
+class CVMOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "(LodTensor, default LodTensor<float>), a 2-D tensor with shape "
+             "[N x D],"
+             " where N is the batch size and D is the emebdding dim. ");
+    AddInput("CVM",
+             "(Tensor),  a 2-D Tensor with shape [N x 2], where N is the batch "
+             "size, 2 is show and click.");
+    AddOutput("Y",
+              "(LodTensor, default LodTensor<float>), a 2-D tensor with shape "
+              "[N x K].");
+    AddAttr<bool>("use_cvm", "bool, use cvm or not").SetDefault(true);
+    AddComment(R"DOC(
+CVM Operator.
+
+      We assume that input X is a embedding vector with cvm_feature(show and click), which shape is [N * D] (D is 2(cvm_feature) + embedding dim, N is batch_size)
+      if use_cvm is True, we will log(cvm_feature), and output shape is [N * D].
+      if use_cvm is False, we will remove cvm_feature from input, and output shape is [N * (D - 2)].
+
+)DOC");
+  }
+};
+
+class CVMGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+    op->SetType("cvm_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("CVM", Input("CVM"));
+    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(cvm, ops::CVMOp, ops::CVMOpMaker, ops::CVMGradOpDescMaker);
+
+REGISTER_OPERATOR(cvm_grad, ops::CVMGradientOp);
+
+REGISTER_OP_CPU_KERNEL(cvm, ops::CVMOpKernel<float>, ops::CVMOpKernel<double>);
+
+REGISTER_OP_CPU_KERNEL(cvm_grad, ops::CVMGradOpKernel<float>,
+                       ops::CVMGradOpKernel<double>);
diff --git a/paddle/fluid/operators/cvm_op.h b/paddle/fluid/operators/cvm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..38e5a2afa11feace17b8d870cdc3ef0ed38745d7
--- /dev/null
+++ b/paddle/fluid/operators/cvm_op.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+class CVMOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const LoDTensor* x = context.Input<LoDTensor>("X");
+    const T* x_data = x->data<T>();
+    auto lod = x->lod()[0];
+    int64_t item_size = x->numel() / x->dims()[0];
+    int offset = 2;
+    if (!context.Attr<bool>("use_cvm")) {
+      item_size -= offset;
+    }
+    LoDTensor* y = context.Output<LoDTensor>("Y");
+    T* y_data = y->mutable_data<T>(context.GetPlace());
+
+    int seq_num = static_cast<int>(lod.size()) - 1;
+    for (int i = 0; i < seq_num; ++i) {
+      int64_t seq_len = static_cast<int64_t>(lod[i + 1] - lod[i]);
+
+      for (int j = 0; j < seq_len; ++j) {
+        if (context.Attr<bool>("use_cvm")) {
+          std::memcpy(y_data, x_data, item_size * sizeof(T));
+          y_data[0] = log(y_data[0] + 1);
+          y_data[1] = log(y_data[1] + 1) - y_data[0];
+          x_data += item_size;
+          y_data += item_size;
+        } else {
+          std::memcpy(y_data, x_data + offset, item_size * sizeof(T));
+          x_data += item_size + offset;
+          y_data += item_size;
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class CVMGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    LoDTensor* dx = context.Output<LoDTensor>(framework::GradVarName("X"));
+    T* dx_data = dx->mutable_data<T>(context.GetPlace());
+
+    const Tensor* cvm = context.Input<Tensor>("CVM");
+    const T* cvm_data = cvm->data<T>();
+    int offset = 2;
+    const framework::LoDTensor* dOut =
+        context.Input<framework::LoDTensor>(framework::GradVarName("Y"));
+    const T* dout_data = dOut->data<T>();
+
+    auto lod = dx->lod()[0];
+    int64_t item_size = dx->numel() / dx->dims()[0];
+    if (!context.Attr<bool>("use_cvm")) {
+      item_size -= offset;
+    }
+
+    int seq_num = static_cast<int>(lod.size()) - 1;
+    for (int i = 0; i < seq_num; ++i) {
+      int64_t seq_len = static_cast<int64_t>(lod[i + 1] - lod[i]);
+
+      for (int j = 0; j < seq_len; ++j) {
+        if (context.Attr<bool>("use_cvm")) {
+          std::memcpy(dx_data, dout_data, item_size * sizeof(T));
+          dx_data[0] = cvm_data[0];
+          dx_data[1] = cvm_data[1];
+          dx_data += item_size;
+          dout_data += item_size;
+        } else {
+          std::memcpy(dx_data + offset, dout_data, item_size * sizeof(T));
+          dx_data[0] = cvm_data[0];
+          dx_data[1] = cvm_data[1];
+          dx_data += item_size + offset;
+          dout_data += item_size;
+        }
+      }
+      cvm_data += offset;
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/data_norm_op.cc b/paddle/fluid/operators/data_norm_op.cc
index 45bce6e5203f8c1dbb744e0f954f7f0a71c53372..a5c76db6fa44217a558cfaecd2d7628168c11d78 100644
--- a/paddle/fluid/operators/data_norm_op.cc
+++ b/paddle/fluid/operators/data_norm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/data_norm_op.h"
+#include <memory>
 #include <string>
 #include "paddle/fluid/framework/data_layout.h"
 #ifdef PADDLE_WITH_MKLDNN
@@ -65,9 +66,11 @@ class DataNormOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum").size(), 1UL);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C);
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C);
+    }
 
     ctx->SetOutputDim("Y", x_dims);
     ctx->SetOutputDim("Means", {C});
diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc
index 7c0823c0487d39eece5be08322e7d182b931ba3c..f46aaf7d0a7b2d48f18ba6cccb555bbb691ad353 100644
--- a/paddle/fluid/operators/detection/gpc.cc
+++ b/paddle/fluid/operators/detection/gpc.cc
@@ -24,6 +24,7 @@
  **/
 
 #include "paddle/fluid/operators/detection/gpc.h"
+#include "paddle/fluid/platform/enforce.h"
 
 namespace gpc {
 
@@ -689,6 +690,7 @@ static bbox *create_contour_bboxes(gpc_polygon *p) {
 
   gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox),
                    const_cast<char *>("Bounding box creation"));
+  PADDLE_ENFORCE_NOT_NULL(box);
 
   /* Construct contour bounding boxes */
   for (c = 0; c < p->num_contours; c++) {
@@ -852,6 +854,7 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
   /* Create an extended hole array */
   gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int),
                   const_cast<char *>("contour hole addition"));
+  PADDLE_ENFORCE_NOT_NULL(extended_hole);
 
   /* Create an extended contour array */
   gpc_malloc<gpc_vertex_list>(extended_contour,
@@ -969,6 +972,7 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
   /* Build scanbeam table from scanbeam tree */
   gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
                      const_cast<char *>("sbt creation"));
+  PADDLE_ENFORCE_NOT_NULL(sbt);
   build_sbt(&scanbeam, sbt, sbtree);
   scanbeam = 0;
   free_sbtree(&sbtree);
@@ -1604,6 +1608,7 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
   /* Build scanbeam table from scanbeam tree */
   gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
                      const_cast<char *>("sbt creation"));
+  PADDLE_ENFORCE_NOT_NULL(sbt);
   build_sbt(&scanbeam, sbt, sbtree);
   scanbeam = 0;
   free_sbtree(&sbtree);
diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc
index 6c37da17f4011d38efcdc5406331f1be173dd0dd..e8a186b611e7bcc1ec7069b82148bb8f5f601532 100644
--- a/paddle/fluid/operators/detection/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc
@@ -171,8 +171,8 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
          
          The output of previous network is in shape [N, C, H, W], while H and W
          should be the same, H and W specify the grid size, each grid point predict 
-         given number boxes, this given number, which following will be represented as S,
-         is specified by the number of anchors, In the second dimension(the channel
+         given number bounding boxes, this given number, which following will be represented as S,
+         is specified by the number of anchor clusters in each scale. In the second dimension(the channel
          dimension), C should be equal to S * (class_num + 5), class_num is the object 
          category number of source dataset(such as 80 in coco dataset), so in the 
          second(channel) dimension, apart from 4 box location coordinates x, y, w, h, 
@@ -203,7 +203,7 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
          thresh, the confidence score loss of this anchor box will be ignored.
 
          Therefore, the yolov3 loss consist of three major parts, box location loss,
-         confidence score loss, and classification loss. The L2 loss is used for 
+         confidence score loss, and classification loss. The L1 loss is used for 
          box coordinates (w, h), and sigmoid cross entropy loss is used for box 
          coordinates (x, y), confidence score loss and classification loss.
 
diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc
index e1d113f8542da8827b9e36e44fc1bac6c07c9257..554e50725ffa5fc30849dc62fe525d72c6561a8b 100644
--- a/paddle/fluid/operators/detection_map_op.cc
+++ b/paddle/fluid/operators/detection_map_op.cc
@@ -51,8 +51,10 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(label_dims.size(), 2,
                       "The rank of Input(Label) must be 2, "
                       "the shape is [N, 6].");
-    PADDLE_ENFORCE(label_dims[1] == 6 || label_dims[1] == 5,
-                   "The shape of Input(Label) is [N, 6] or [N, 5].");
+    if (ctx->IsRuntime() || label_dims[1] > 0) {
+      PADDLE_ENFORCE(label_dims[1] == 6 || label_dims[1] == 5,
+                     "The shape of Input(Label) is [N, 6] or [N, 5].");
+    }
 
     if (ctx->HasInput("PosCount")) {
       PADDLE_ENFORCE(ctx->HasInput("TruePos"),
diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt
index 972b4f67a8388ce68952fa90aaa224cd45c6d226..f6531ec9edca7b425d28853f542d5e46783ba699 100644
--- a/paddle/fluid/operators/distributed/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed/CMakeLists.txt
@@ -9,6 +9,9 @@ else()
 endif()
 configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY)
 
+cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool)
+cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder)
+
 # FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files
 set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 if(WITH_GRPC)
@@ -20,7 +23,7 @@ if(WITH_GRPC)
         collective_client.cc collective_server.cc
         ${GRPC_SRCS}
       PROTO send_recv.proto 
-      DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS})
+      DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder)
 
   set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS})
diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3f3b6b959e30194c10b1a58d6fc3e7a61ad01313
--- /dev/null
+++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc
@@ -0,0 +1,27 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+std::once_flag AsyncSparseParamUpdateRecorder::init_flag_;
+std::unique_ptr<AsyncSparseParamUpdateRecorder>
+    AsyncSparseParamUpdateRecorder::recorder_(nullptr);
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h
new file mode 100644
index 0000000000000000000000000000000000000000..eadd842c7f6ead56006fd0c34814b1b7bd9b62f4
--- /dev/null
+++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h
@@ -0,0 +1,183 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <functional>
+#include <future>  // NOLINT
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#include <ThreadPool.h>
+
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+class ConcurrentSet {
+ public:
+  ConcurrentSet() : pool_(new ::ThreadPool(1)) {}
+  ~ConcurrentSet() {}
+
+  std::future<void> Update(const std::vector<int64_t>& rows) {
+    auto task = [this, rows] {
+      if (VLOG_IS_ON(3)) {
+        std::ostringstream sstream;
+        sstream << "[";
+        for (auto& id : rows) {
+          sstream << id << ", ";
+        }
+        sstream << "]";
+        VLOG(3) << "update ids -> " << sstream.str();
+      }
+      for (auto row : rows) {
+        set_.insert(row);
+      }
+    };
+    return pool_->enqueue(std::move(task));
+  }
+
+  std::future<void> GetAndClear(std::vector<int64_t>* result) {
+    auto task = [this, &result] {
+      result->clear();
+      for (auto& id : set_) {
+        result->push_back(id);
+      }
+      if (VLOG_IS_ON(3)) {
+        std::ostringstream sstream;
+        sstream << "[";
+        for (auto& id : *result) {
+          sstream << id << ", ";
+        }
+        sstream << "]";
+        VLOG(3) << "result ids size: " << result->size() << " "
+                << sstream.str();
+      }
+      set_.clear();
+    };
+    return pool_->enqueue(std::move(task));
+  }
+
+ private:
+  std::unordered_set<int64_t> set_;
+  std::unique_ptr<::ThreadPool> pool_{nullptr};
+};
+
+class AsyncSparseParamUpdateRecorder {
+  using TrainerToRows = std::vector<std::unique_ptr<ConcurrentSet>>;
+
+ public:
+  AsyncSparseParamUpdateRecorder(
+      int trainer_num,
+      const std::unordered_map<std::string, std::string>& grad_to_param)
+      : trainer_num_(trainer_num), grad_to_param_(grad_to_param) {
+    if (VLOG_IS_ON(3)) {
+      std::ostringstream sstream;
+      sstream << "[";
+      for (auto& item : grad_to_param) {
+        sstream << item.first << ":" << item.second << ", ";
+      }
+      sstream << "]";
+      VLOG(3) << "trainer_num: " << trainer_num
+              << " grad_to_param_: " << sstream.str();
+    }
+    for (auto& iter : grad_to_param) {
+      param_to_grad_[iter.second] = iter.first;
+      auto& param_name = iter.second;
+      param_to_updated_rows_[param_name] = TrainerToRows();
+      auto& trainer_to_rows = param_to_updated_rows_[param_name];
+      for (auto i = 0; i < trainer_num; ++i) {
+        trainer_to_rows.emplace_back(new ConcurrentSet());
+      }
+    }
+  }
+
+  ~AsyncSparseParamUpdateRecorder() = default;
+
+  void Update(const std::string& grad_name,
+              const std::vector<int64_t>& update_rows) {
+    VLOG(3) << "update grad: " << grad_name
+            << " row size: " << update_rows.size();
+    auto& param_name = grad_to_param_.at(grad_name);
+    auto& trainer_to_rows = param_to_updated_rows_.at(param_name);
+
+    std::vector<std::future<void>> fs;
+    for (auto& set : trainer_to_rows) {
+      fs.push_back(set->Update(update_rows));
+    }
+    for (auto& f : fs) {
+      f.wait();
+    }
+  }
+
+  void GetAndClear(const std::string& param_name, int trainer_id,
+                   std::vector<int64_t>* result) {
+    VLOG(3) << "GetAndClear param: " << param_name
+            << " for trainer: " << trainer_id;
+    PADDLE_ENFORCE_LT(trainer_id, trainer_num_);
+    param_to_updated_rows_.at(param_name)[trainer_id]
+        ->GetAndClear(result)
+        .wait();
+  }
+
+  bool HasParam(const std::string& param_name) {
+    return param_to_grad_.find(param_name) != param_to_grad_.end();
+  }
+
+  bool HasGrad(const std::string& grad_name) {
+    return grad_to_param_.find(grad_name) != grad_to_param_.end();
+  }
+
+ private:
+  const int trainer_num_;
+  std::unordered_map<std::string, std::string> grad_to_param_;
+  std::unordered_map<std::string, std::string> param_to_grad_;
+  std::unordered_map<std::string, TrainerToRows> param_to_updated_rows_;
+
+  // init recorder
+ public:
+  static void Init(
+      int trainer_num,
+      const std::unordered_map<std::string, std::string>& grad_to_param) {
+    InitImpl(trainer_num, grad_to_param);
+  }
+
+  static AsyncSparseParamUpdateRecorder* GetInstance() {
+    return recorder_.get();
+  }
+
+ private:
+  // Init is called by GetInstance.
+  static void InitImpl(
+      int trainer_num,
+      const std::unordered_map<std::string, std::string>& grad_to_param) {
+    if (recorder_ == nullptr) {
+      recorder_.reset(
+          new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param));
+    }
+  }
+
+  static std::once_flag init_flag_;
+  static std::unique_ptr<AsyncSparseParamUpdateRecorder> recorder_;
+};
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..67e8fd8a0edc4510d0abe885c821e75b528254f8
--- /dev/null
+++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc
@@ -0,0 +1,99 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
+
+#include <algorithm>
+
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace operators {
+namespace distributed {
+
+TEST(ConcurrentSet, All) {
+  ConcurrentSet concurrent_set;
+  std::vector<int64_t> in1 = {1, 2, 3, 4};
+  std::vector<int64_t> in2 = {2, 3, 5, 6};
+
+  std::vector<std::future<void>> futures;
+  futures.push_back(concurrent_set.Update(in1));
+  futures.push_back(concurrent_set.Update(in2));
+
+  for (auto &f : futures) {
+    f.wait();
+  }
+
+  std::unordered_set<int64_t> in;
+  std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin()));
+  std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin()));
+
+  std::vector<int64_t> ret;
+  concurrent_set.GetAndClear(&ret).wait();
+
+  std::unordered_set<int64_t> out;
+  std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin()));
+
+  EXPECT_EQ(in, out);
+
+  concurrent_set.GetAndClear(&ret).wait();
+  EXPECT_EQ(ret.size(), 0);
+}
+
+TEST(AsyncSparseParamUpdateRecorder, All) {
+  std::unordered_map<std::string, std::string> grad_to_param;
+  grad_to_param["grad1"] = "param1";
+  grad_to_param["grad2"] = "param2";
+
+  int trainer_num = 10;
+
+  AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param);
+  std::vector<int64_t> in1 = {1, 2, 3, 4};
+  std::vector<int64_t> in2 = {2, 3, 5, 6};
+
+  std::unordered_set<int64_t> in;
+  std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin()));
+  std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin()));
+
+  recorder.Update("grad1", in1);
+  recorder.Update("grad1", in2);
+
+  EXPECT_TRUE(recorder.HasParam("param1"));
+  EXPECT_TRUE(recorder.HasParam("param2"));
+  EXPECT_FALSE(recorder.HasParam("param3"));
+
+  EXPECT_TRUE(recorder.HasGrad("grad1"));
+  EXPECT_TRUE(recorder.HasGrad("grad2"));
+  EXPECT_FALSE(recorder.HasGrad("grad3"));
+
+  std::vector<int64_t> ret;
+  EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret));
+
+  for (int i = 0; i < trainer_num; ++i) {
+    std::vector<int64_t> ret;
+    std::unordered_set<int64_t> out;
+
+    recorder.GetAndClear("param1", i, &ret);
+    std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin()));
+
+    EXPECT_EQ(in, out);
+
+    recorder.GetAndClear("param1", i, &ret);
+    EXPECT_EQ(ret.size(), 0);
+  }
+}
+
+}  // namespace distributed
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc
index a1a3443348129b5cdf057592fced8fdff238ac09..4c22ad8eb4d4b2e23d8a6720e726eb9e2998314e 100644
--- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc
+++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc
@@ -234,6 +234,7 @@ VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep,
                                      const framework::Scope& scope,
                                      const std::string& var_name,
                                      const std::string& out_var_name,
+                                     const std::string& table_name,
                                      int64_t time_out) {
   return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC,
                       time_out);
diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h
index 501a593b11d35c160348e42ee47216a85647aac4..51864dfdca53eb4b1d9045188a6347781130e785 100644
--- a/paddle/fluid/operators/distributed/brpc/brpc_client.h
+++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h
@@ -21,8 +21,10 @@ limitations under the License. */
 #include <functional>
 #include <iostream>
 #include <map>
+#include <memory>
 #include <mutex>  // NOLINT
 #include <string>
+#include <unordered_map>
 #include <vector>
 
 #include "brpc/channel.h"
@@ -66,6 +68,7 @@ class BRPCClient : public RPCClient {
                            const framework::Scope& scope,
                            const std::string& var_name,
                            const std::string& out_var_name,
+                           const std::string& table_name = "",
                            int64_t time_out = FLAGS_rpc_deadline) override;
 
   VarHandlePtr AsyncGetMonomerBarrier(
@@ -107,13 +110,11 @@ class BRPCClient : public RPCClient {
   void SendComplete() override;
 
  private:
-  VarHandlePtr _AsyncGetVar(const std::string& ep,
-                            const platform::DeviceContext& ctx,
-                            const framework::Scope& scope,
-                            const std::string& var_name,
-                            const std::string& out_var_name,
-                            const std::string& method_name,
-                            int64_t time_out = FLAGS_rpc_deadline);
+  VarHandlePtr _AsyncGetVar(
+      const std::string& ep, const platform::DeviceContext& ctx,
+      const framework::Scope& scope, const std::string& var_name,
+      const std::string& out_var_name, const std::string& method_name,
+      const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline);
 
   void Proceed();
   ChannelQueuePtr GetChannel(const std::string& ep);
diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc
index eba18c67771fa26eed855b0f19591e06101f424d..b528bcdd32b11d686f44596d9a1bb663b21691f4 100644
--- a/paddle/fluid/operators/distributed/communicator.cc
+++ b/paddle/fluid/operators/distributed/communicator.cc
@@ -32,6 +32,9 @@ DEFINE_int32(communicator_send_queue_size, 20,
 DEFINE_int32(communicator_max_send_grad_num_before_recv, 20,
              "max grad num to send before recv parameters");
 DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv");
+DEFINE_int32(communicator_send_wait_times, 5,
+             "times that send thread will wait if merge num does not reach "
+             "max_merge_var_num");
 DEFINE_int32(communicator_max_merge_var_num, 20,
              "max var num to merge and send");
 DEFINE_bool(communicator_fake_rpc, false,
@@ -65,6 +68,8 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx,
           << FLAGS_communicator_max_send_grad_num_before_recv;
   VLOG(0) << "communicator_thread_pool_size: "
           << FLAGS_communicator_thread_pool_size;
+  VLOG(0) << "communicator_send_wait_times: "
+          << FLAGS_communicator_send_wait_times;
   VLOG(0) << "communicator_max_merge_var_num: "
           << FLAGS_communicator_max_merge_var_num;
   VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc;
@@ -101,20 +106,32 @@ void Communicator::SendThread() {
           VLOG(3) << var_name << " merge and send";
           std::vector<std::shared_ptr<Variable>> vars;
           size_t merged_var_num = 0;
-          while (var_queue->Size() > 0 &&
-                 merged_var_num < FLAGS_communicator_max_merge_var_num) {
-            vars.push_back(var_queue->Pop());
-            // only count the send number of the first var
-            if (var_name == send_varname_to_queue_.begin()->first) {
-              grad_num_.fetch_add(1, std::memory_order_relaxed);
+          size_t wait_times = 0;
+          while (merged_var_num < FLAGS_communicator_max_merge_var_num) {
+            if (var_queue->Size() == 0) {
+              VLOG(3) << "wait_times -> " << wait_times;
+              if (wait_times >= FLAGS_communicator_send_wait_times) {
+                break;
+              }
+              std::this_thread::sleep_for(std::chrono::milliseconds(10));
+              wait_times++;
+              continue;
+            } else {
+              wait_times = 0;
+
+              vars.push_back(var_queue->Pop());
+              // only count the send number of the first var
+              if (var_name == send_varname_to_queue_.begin()->first) {
+                grad_num_.fetch_add(1, std::memory_order_relaxed);
+              }
+              merged_var_num++;
             }
-            merged_var_num++;
           }
           auto before_merge = GetCurrentUS();
           MergeVars(var_name, vars, send_scope_.get());
           auto after_merge = GetCurrentUS();
-          VLOG(3) << "merge " << var_name << " use time "
-                  << after_merge - before_merge;
+          VLOG(3) << "merge " << merged_var_num << " " << var_name
+                  << " use time " << after_merge - before_merge;
           auto send_functor = distributed::ParameterSend<float>();
           auto &ctx = send_varname_to_ctx_.at(var_name);
           if (!FLAGS_communicator_fake_rpc) {
diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h
index 41155bfc31bb31520fdcf5bd50b203f2e1f2c516..37c39eb15112f745f6a25e95ce65d431d825182e 100644
--- a/paddle/fluid/operators/distributed/communicator.h
+++ b/paddle/fluid/operators/distributed/communicator.h
@@ -109,7 +109,7 @@ inline void MergeVars(const std::string& var_name,
   auto* out_var = scope->Var(var_name);
   if (var0->IsType<framework::LoDTensor>()) {
     auto dims = var0->Get<framework::LoDTensor>().dims();
-    VLOG(3) << "merge " << var_name << " LoDTensor " << dims;
+    VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims;
 
     // init output tensor
     auto* out_t = out_var->GetMutable<framework::LoDTensor>();
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
index 61e94dae3c7a107e10fa5e5518651014cec078bc..8504110c6e9dbfe22b78063999ed4a9e36850e2c 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc
@@ -128,9 +128,11 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep,
                                      const framework::Scope& scope,
                                      const std::string& var_name,
                                      const std::string& out_varname,
+                                     const std::string& table_name,
                                      int64_t time_out) {
   return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname,
-                      "/sendrecv.SendRecvService/GetVariable", time_out);
+                      "/sendrecv.SendRecvService/GetVariable", table_name,
+                      time_out);
 }
 
 VarHandlePtr GRPCClient::AsyncGetVarNoBarrier(
@@ -142,7 +144,7 @@ VarHandlePtr GRPCClient::AsyncGetVarNoBarrier(
 
   return _AsyncGetVar(
       ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname,
-      "/sendrecv.SendRecvService/GetVariableNoBarrier", time_out);
+      "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out);
 }
 
 VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
@@ -150,18 +152,21 @@ VarHandlePtr GRPCClient::AsyncGetMonomerVariable(
     const framework::Scope& scope, const std::string& var_name,
     int64_t time_out) {
   return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name,
-                      "/sendrecv.SendRecvService/GetMonomerVariable", time_out);
+                      "/sendrecv.SendRecvService/GetMonomerVariable", "",
+                      time_out);
 }
 
 VarHandlePtr GRPCClient::_AsyncGetVar(
     const std::string& ep, const platform::DeviceContext& ctx,
     const framework::Scope& scope, const std::string& method,
     const std::string& var_name, const std::string& out_varname,
-    const std::string& rpc_path, int64_t time_out) {
+    const std::string& rpc_path, const std::string& table_name,
+    int64_t time_out) {
   const platform::DeviceContext* p_ctx = &ctx;
   const std::string ep_val = ep;
   const std::string var_name_val = var_name;
   const std::string out_varname_val = out_varname;
+  const std::string table_name_val = table_name;
   const framework::Scope* p_scope = &scope;
   const auto ch = GetChannel(ep_val);
   GetProcessor* s = new GetProcessor(ch);
@@ -169,32 +174,33 @@ VarHandlePtr GRPCClient::_AsyncGetVar(
   VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope));
   s->Prepare(h, time_out);
 
-  framework::AsyncIO(
-      [var_name_val, out_varname_val, s, method, p_ctx, h, rpc_path, this] {
-        // prepare input
-        sendrecv::VariableMessage req;
-        req.set_varname(var_name_val);
-        req.set_out_varname(out_varname_val);
-        req.set_trainer_id(trainer_id_);
-        ::grpc::ByteBuffer buf;
-        RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
+  framework::AsyncIO([var_name_val, out_varname_val, table_name_val, s, method,
+                      p_ctx, h, rpc_path, this] {
+    // prepare input
+    sendrecv::VariableMessage req;
+    req.set_varname(var_name_val);
+    req.set_out_varname(out_varname_val);
+    req.set_trainer_id(trainer_id_);
+    req.set_table_name(table_name_val);
+    ::grpc::ByteBuffer buf;
+    RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
 
-        VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
+    VLOG(3) << s->GetVarHandlePtr()->String() << " begin";
 
-        // stub context
-        s->response_call_back_ = ProcGetResponse;
+    // stub context
+    s->response_call_back_ = ProcGetResponse;
 
-        platform::RecordRPCEvent record_event(method);
+    platform::RecordRPCEvent record_event(method);
 
-        auto call =
-            s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
-        call->StartCall();
-        call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+    auto call =
+        s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
+    call->StartCall();
+    call->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
 
-        if (UNLIKELY(platform::IsProfileEnabled())) {
-          h->Wait();
-        }
-      });
+    if (UNLIKELY(platform::IsProfileEnabled())) {
+      h->Wait();
+    }
+  });
 
   req_count_++;
 
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h
index ce0d2152aa27c62b6e12881aaf2ae458597e67e6..ad2f04a6d1dda34e35b67b21dce8ac612ff697a0 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h
@@ -23,9 +23,11 @@ limitations under the License. */
 #include <functional>
 #include <iostream>
 #include <map>
+#include <memory>
 #include <mutex>  // NOLINT
 #include <string>
 #include <thread>  // NOLINT
+#include <unordered_map>
 #include <vector>
 
 #include "grpc++/channel.h"
@@ -187,6 +189,7 @@ class GRPCClient : public RPCClient {
                            const framework::Scope& scope,
                            const std::string& var_name,
                            const std::string& out_varname,
+                           const std::string& table_name = "",
                            int64_t time_out = FLAGS_rpc_deadline) override;
 
   VarHandlePtr AsyncGetVarNoBarrier(
@@ -239,7 +242,8 @@ class GRPCClient : public RPCClient {
       const std::string& ep, const platform::DeviceContext& ctx,
       const framework::Scope& scope, const std::string& method,
       const std::string& var_name, const std::string& out_varname,
-      const std::string& rpc_path, int64_t time_out = FLAGS_rpc_deadline);
+      const std::string& rpc_path, const std::string& table_name = "",
+      int64_t time_out = FLAGS_rpc_deadline);
 
  private:
   grpc::CompletionQueue cq_;
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
index 6e65aa5fae83536d229be63fbaf7874bd45f967d..91c398d0c84db1fc67740cd2368d178610ef0841 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_serde.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <nccl.h>
 #endif
 #include <limits>
+#include <memory>
 #include <thread>  // NOLINT
 
 #include "google/protobuf/io/coded_stream.h"
@@ -104,8 +105,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
   e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber,
                             payload->memory_size());
   if (payload->memory_size() >= std::numeric_limits<int>::max()) {
-    LOG(FATAL) << "AppendZeroCopy varname:" << name
-               << ", vlen:" << payload->memory_size();
+    LOG(FATAL) << "FATAL error: varname:" << name
+               << ", vlen:" << payload->memory_size()
+               << " >= std::numeric_limits<int>::max():"
+               << std::numeric_limits<int>::max() << ", so exit!";
   }
   // steal reference of tensor data
   ::grpc::Slice slices[4];  // metadata, tensor, rows meta, rows
diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
index 0eb313f75dfa64f8722faa365128f3111f72bd0b..75526bed0f0eadada65279ec05757da7a469f984 100644
--- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc
+++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc
@@ -137,6 +137,7 @@ class RequestGet final : public RequestBase {
     // proc request.
     std::string varname = request_.varname();
     std::string out_varname = request_.out_varname();
+    std::string table_name = request_.table_name();
     int trainer_id = request_.trainer_id();
 
     VLOG(4) << "RequestGet " << out_varname << " from " << varname;
@@ -145,19 +146,23 @@ class RequestGet final : public RequestBase {
     framework::Variable* invar = nullptr;
     framework::Variable* outvar = nullptr;
 
-    request_handler_->Handle(varname, scope, invar, &outvar, trainer_id,
-                             out_varname);
+    tmp_scope_ = std::move(scope->NewTmpScope());
+    request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar,
+                             trainer_id, out_varname, table_name);
 
+    VLOG(1) << "before SerializeToByteBuffer";
     if (outvar) {
       SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(),
                             &reply_);
     }
+    VLOG(1) << "after SerializeToByteBuffer";
     Finish(reply_, &responder_);
   }
 
  protected:
   sendrecv::VariableMessage request_;
   ::grpc::ByteBuffer reply_;
+  std::unique_ptr<framework::Scope> tmp_scope_;
   ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_;
 };
 
diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc
index e7d4c262aa9fad10a23adc61b94ba0c38577c0e8..da73167ae603fb8c8ba9deabe118269891d1f52a 100644
--- a/paddle/fluid/operators/distributed/parameter_recv.cc
+++ b/paddle/fluid/operators/distributed/parameter_recv.cc
@@ -42,27 +42,23 @@ using DDim = framework::DDim;
 template <typename T>
 void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx,
                                   const framework::Scope &scope) {
-  VLOG(3) << "ParameterRecv in";
+  VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name;
   std::unique_ptr<framework::Scope> local_scope = scope.NewTmpScope();
 
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
   auto &cpu_ctx = *pool.Get(platform::CPUPlace());
 
   distributed::RPCClient *rpc_client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
 
   auto *recv_var = scope.FindVar(rpc_ctx.var_name);
 
-  std::vector<framework::Tensor *> recved_tensors;
-
   // recv all vars to local scope
   if (recv_var->IsType<framework::LoDTensor>()) {
     std::vector<distributed::VarHandlePtr> rets;
     for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) {
       auto &recv_var_name = rpc_ctx.splited_var_names[i];
-      framework::Tensor *t =
-          local_scope->Var(recv_var_name)->GetMutable<framework::LoDTensor>();
-      recved_tensors.push_back(t);
+      local_scope->Var(recv_var_name);
       VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i];
       rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx,
                                              *local_scope.get(), recv_var_name,
@@ -78,23 +74,61 @@ void ParameterRecv<T>::operator()(const RpcContext &rpc_ctx,
   // concat recved tensor into one var
   {
     size_t output_offset = 0;
+    size_t row_offset = 0;
     framework::Tensor *recv_tensor =
         recv_var->GetMutable<framework::LoDTensor>();
     auto dev_ctx = paddle::platform::CPUDeviceContext();
     int64_t recv_numel = 0;
-    for (auto *in : recved_tensors) {
-      recv_numel += in->numel();
-      auto in_stride = framework::stride_numel(in->dims());
-      auto out_stride = framework::stride_numel(recv_tensor->dims());
-      StridedNumelCopyWithAxis<T>(
-          dev_ctx, 0, recv_tensor->data<T>() + output_offset, out_stride,
-          in->data<T>(), in_stride, in_stride[0]);
-      output_offset += in_stride[0];
+    for (auto &recv_var_name : rpc_ctx.splited_var_names) {
+      auto *recv_var = local_scope->FindVar(recv_var_name);
+      if (recv_var->IsType<framework::LoDTensor>()) {
+        auto &in = recv_var->Get<framework::LoDTensor>();
+        recv_numel += in.numel();
+        auto in_stride = framework::stride_numel(in.dims());
+        auto out_stride = framework::stride_numel(recv_tensor->dims());
+        StridedNumelCopyWithAxis<T>(
+            dev_ctx, 0, recv_tensor->data<T>() + output_offset, out_stride,
+            in.data<T>(), in_stride, in_stride[0]);
+        output_offset += in_stride[0];
+      } else if (recv_var->IsType<framework::SelectedRows>()) {
+        auto &recv_slr = recv_var->Get<framework::SelectedRows>();
+        auto &recv_dims = recv_tensor->dims();
+        int64_t width = recv_dims[1];
+        recv_numel += recv_slr.height() * width;
+        PADDLE_ENFORCE_EQ(recv_slr.value().dims()[1], width);
+        PADDLE_ENFORCE_EQ(recv_slr.value().dims()[0], recv_slr.rows().size());
+        VLOG(3) << "recv slr " << recv_var_name << " dims "
+                << recv_slr.value().dims();
+        if (VLOG_IS_ON(3)) {
+          std::ostringstream sstream;
+          sstream << "[";
+          for (auto &row_id : recv_slr.rows()) {
+            sstream << row_id << ", ";
+          }
+          sstream << "]";
+          VLOG(3) << "recv_slr size: " << recv_slr.rows().size() << " "
+                  << sstream.str();
+        }
+
+        for (auto i = 0; i < recv_slr.rows().size(); ++i) {
+          auto row_id = recv_slr.rows()[i] + row_offset;
+          PADDLE_ENFORCE_LT(row_id, recv_dims[0]);
+          memcpy(recv_tensor->data<T>() + row_id * width,
+                 recv_slr.value().data<T>() + i * width, sizeof(T) * width);
+        }
+        row_offset += recv_slr.height();
+      } else {
+        PADDLE_THROW("unsupported recieved var type");
+      }
+    }
+    auto numel = recv_tensor->numel();
+    if (recv_numel != numel) {
+      LOG(FATAL) << "recv_numel: " << recv_numel << " acture numel: " << numel;
     }
-    PADDLE_ENFORCE_EQ(recv_numel, recv_tensor->numel());
+    PADDLE_ENFORCE_EQ(recv_numel, numel);
   }
 
-  VLOG(3) << "ParameterRecv out";
+  VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name;
 }
 
 template struct ParameterRecv<float>;
diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc
index 9ce424445229cde0a7e775c95f4af8839f4d4d68..dfabad567af590b65b9e777824d476fce2b17238 100644
--- a/paddle/fluid/operators/distributed/parameter_send.cc
+++ b/paddle/fluid/operators/distributed/parameter_send.cc
@@ -47,7 +47,7 @@ void ParameterSend<T>::operator()(const RpcContext &rpc_ctx,
   auto &cpu_ctx = *pool.Get(platform::CPUPlace());
 
   distributed::RPCClient *rpc_client =
-      distributed::RPCClient::GetInstance<RPCCLIENT_T>(0);
+      distributed::RPCClient::GetInstance<RPCCLIENT_T>(rpc_ctx.trainer_id);
 
   auto *send_var = scope.FindVar(rpc_ctx.var_name);
   size_t out_num = rpc_ctx.splited_var_names.size();
diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 991158ac72007efc1233f852caed4f90f35fe1cd..de8f30184611aeb961e2ab69b05779c56371b976 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -18,7 +18,9 @@
 #include <condition_variable>  // NOLINT
 
 #include <functional>
+#include <memory>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -180,6 +182,10 @@ class RequestHandler {
     grad_to_prepared_ctx_ = g;
   }
 
+  void SetSparseGradToParam(std::unordered_map<std::string, std::string>* g) {
+    sparse_grad_to_param_ = g;
+  }
+
   void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; }
 
   // Get attributes.
@@ -228,6 +234,7 @@ class RequestHandler {
   std::unordered_map<std::string,
                      std::shared_ptr<framework::ExecutorPrepareContext>>*
       grad_to_prepared_ctx_;
+  std::unordered_map<std::string, std::string>* sparse_grad_to_param_;
 
   RPCServer* rpc_server_;
 };
diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc
index e289ec929dbd6643a2518b92c1a25b7d63e790a9..a41536368abc925531d1a54615546a100482a7eb 100644
--- a/paddle/fluid/operators/distributed/request_handler_impl.cc
+++ b/paddle/fluid/operators/distributed/request_handler_impl.cc
@@ -22,6 +22,7 @@
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/framework/variable_helper.h"
+#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
 #include "paddle/fluid/operators/distributed/rpc_server.h"
 #include "paddle/fluid/string/piece.h"
 #include "paddle/fluid/string/printf.h"
@@ -59,6 +60,12 @@ bool RequestSendHandler::Handle(const std::string& varname,
             "async mode should not recv BATCH_BARRIER_MESSAGE or "
             "COMPLETE_MESSAGE");
       }
+      if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad(varname)) {
+        auto& grad_slr =
+            scope->FindVar(varname)->Get<framework::SelectedRows>();
+        AsyncSparseParamUpdateRecorder::GetInstance()->Update(varname,
+                                                              grad_slr.rows());
+      }
       executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(),
                                     scope);
       return true;
@@ -82,8 +89,9 @@ bool RequestGetHandler::Handle(const std::string& varname,
                                const int trainer_id,
                                const std::string& out_var_name,
                                const std::string& table_name) {
-  VLOG(4) << "RequestGetHandler:" << varname
-          << " out_var_name: " << out_var_name;
+  VLOG(3) << "RequestGetHandler:" << varname
+          << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id
+          << " table_name: " << table_name;
 
   if (sync_mode_) {
     if (varname == FETCH_BARRIER_MESSAGE) {
@@ -108,7 +116,42 @@ bool RequestGetHandler::Handle(const std::string& varname,
         VLOG(3) << "copying " << varname << " to " << param_bak_name;
         framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t);
       }
-      *outvar = scope_->FindVar(varname);
+      if (AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) &&
+          !table_name.empty()) {
+        std::vector<int64_t> updated_rows;
+        AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear(
+            varname, trainer_id, &updated_rows);
+        if (VLOG_IS_ON(3)) {
+          std::ostringstream sstream;
+          sstream << "[";
+          for (auto& row_id : updated_rows) {
+            sstream << row_id << ", ";
+          }
+          sstream << "]";
+          VLOG(3) << "updated_rows size: " << updated_rows.size() << " "
+                  << sstream.str();
+        }
+        auto& origin_tensor =
+            scope_->FindVar(varname)->Get<framework::LoDTensor>();
+        auto* origin_tensor_data = origin_tensor.data<float>();
+        auto& dims = origin_tensor.dims();
+        *outvar = scope->Var();
+        auto* out_slr = (*outvar)->GetMutable<framework::SelectedRows>();
+        out_slr->set_rows(updated_rows);
+        out_slr->set_height(dims[0]);
+        auto out_dims = framework::make_ddim(
+            {static_cast<int64_t>(updated_rows.size()), dims[1]});
+        auto* data = out_slr->mutable_value()->mutable_data<float>(
+            out_dims, origin_tensor.place());
+        auto width = dims[1];
+        for (auto i = 0; i < updated_rows.size(); ++i) {
+          PADDLE_ENFORCE_LT(updated_rows[i], dims[0]);
+          memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width,
+                 sizeof(float) * width);
+        }
+      } else {
+        *outvar = scope_->FindVar(varname);
+      }
     }
   }
   return true;
diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h
index ea54e0c2951253fc009672f4cd2e5233ed56944e..d4be2c28fdbaa4beef62402155de5b677ed67e9b 100644
--- a/paddle/fluid/operators/distributed/rpc_client.h
+++ b/paddle/fluid/operators/distributed/rpc_client.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <condition_variable>  // NOLINT
+#include <memory>
 #include <string>
 #include "gflags/gflags.h"
 
@@ -44,6 +45,7 @@ class RPCClient {
                                    const framework::Scope& scope,
                                    const std::string& var_name,
                                    const std::string& out_varname,
+                                   const std::string& table_name = "",
                                    int64_t time_out = FLAGS_rpc_deadline) = 0;
 
   virtual VarHandlePtr AsyncGetVarNoBarrier(
@@ -96,6 +98,7 @@ class RPCClient {
   // Init is called by GetInstance.
   template <typename T>
   static void Init(int trainer_id) {
+    VLOG(0) << "init rpc client with trainer_id " << trainer_id;
     trainer_id_ = trainer_id;
     if (rpc_client_.get() == nullptr) {
       rpc_client_.reset(new T());
diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h
index 3de89c2ae89d29edc317ca123882d1c55038b6ca..eb127bf4ad5a5c9a28210e2fbcdb69b07543f4b9 100644
--- a/paddle/fluid/operators/distributed/rpc_common.h
+++ b/paddle/fluid/operators/distributed/rpc_common.h
@@ -27,23 +27,26 @@ struct RpcContext {
 
   RpcContext(const std::string &name, const std::vector<std::string> &names,
              const std::vector<std::string> &emap,
-             const std::vector<int64_t> &sections)
+             const std::vector<int64_t> &sections, int id)
       : var_name(name),
         splited_var_names(names),
         epmap(emap),
-        height_sections(sections) {}
+        height_sections(sections),
+        trainer_id(id) {}
 
   RpcContext(const RpcContext &ctx) {
     var_name = ctx.var_name;
     splited_var_names = ctx.splited_var_names;
     epmap = ctx.epmap;
     height_sections = ctx.height_sections;
+    trainer_id = ctx.trainer_id;
   }
 
   std::string var_name;
   std::vector<std::string> splited_var_names;
   std::vector<std::string> epmap;
   std::vector<int64_t> height_sections;
+  int trainer_id;
 };
 
 inline std::ostream &operator<<(std::ostream &os, const RpcContext &rpc_ctx) {
diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
index a1ef1af39ff2ab1456706ebafbd3d7ce1acc0c07..1096f3773c6d44560d370502b1c550d67d40ca64 100644
--- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt
+++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt
@@ -2,9 +2,9 @@ include(operators)
 
 set(DISTRIBUTE_DEPS "")
 if(WITH_GRPC)
-    set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
+    set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
 else()
-    set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
+    set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb snappystream snappy protobuf ssl crypto zlib node)
     if(WITH_BRPC_RDMA)
         find_library(IBVERBS_LIBRARY NAMES ibverbs)
         ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL)
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
index 5b30ed472d51a37a0705d1717395da9e4ff7d743..a672fb2a9141a81383d947dcc961a112aee3f7ac 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc
@@ -24,8 +24,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/distributed/distributed.h"
 #include "paddle/fluid/operators/math/math_function.h"
 
+#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h"
 #include "paddle/fluid/operators/distributed/request_handler_impl.h"
 #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h"
+
 #include "paddle/fluid/platform/profiler.h"
 
 DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send");
@@ -292,6 +294,8 @@ static void FillRequestCtx(
     std::unordered_map<std::string,
                        std::shared_ptr<framework::ExecutorPrepareContext>>
         *prefetch_ctx,
+    std::unordered_map<std::string, std::string>
+        *sparse_grad_name_to_param_name,
     std::shared_ptr<framework::ExecutorPrepareContext> checkpoint_ctx,
     distributed::RPCServer *rpc_server) {
   h->SetScope(scope);
@@ -299,6 +303,7 @@ static void FillRequestCtx(
   h->SetExecutor(executor);
   h->SetProgram(program);
   h->SetPrefetchPreparedCtx(prefetch_ctx);
+  h->SetSparseGradToParam(sparse_grad_name_to_param_name);
   h->SetRPCServer(rpc_server);
   h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx);
 }
@@ -414,10 +419,24 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
     prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i];
   }
 
-  auto f =
-      std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx,
-                &executor, program, &prefetch_var_name_to_prepared_ctx,
-                ckpt_pre_context, rpc_service_.get());
+  // parse attr of kSparseGradToParam  sparse_grad_name -> param_name
+  std::unordered_map<std::string, std::string> sparse_grad_name_to_param_name;
+  auto sparse_grad_name_to_param_name_str =
+      Attr<std::vector<std::string>>(kSparseGradToParam);
+  for (const auto &sparse_grad_name_and_param_name :
+       sparse_grad_name_to_param_name_str) {
+    std::vector<std::string> pieces;
+    split(sparse_grad_name_and_param_name, ':', &pieces);
+    PADDLE_ENFORCE_EQ(pieces.size(), 2);
+    VLOG(3) << "after split, sparse_grad_name = " << pieces[0]
+            << ", param_name = " << pieces[1];
+    sparse_grad_name_to_param_name[pieces[0]] = pieces[1];
+  }
+
+  auto f = std::bind(
+      FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, &executor,
+      program, &prefetch_var_name_to_prepared_ctx,
+      &sparse_grad_name_to_param_name, ckpt_pre_context, rpc_service_.get());
 
   f(request_send_handler_.get());
   f(request_get_handler_.get());
@@ -445,6 +464,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
     RunSyncLoop(&executor, program, &recv_scope, &dev_ctx,
                 prefetch_block_id_list, checkpoint_block_id);
   } else {
+    distributed::AsyncSparseParamUpdateRecorder::Init(
+        fan_in, sparse_grad_name_to_param_name);
     RunAsyncLoop(&executor, program, &recv_scope);
   }
 }
@@ -475,6 +496,10 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::vector<std::string>>(kPrefetchVarNameToBlockId,
                                       "prefetch blocks to run on server side.")
         .SetDefault({});
+    AddAttr<std::vector<std::string>>(
+        kSparseGradToParam,
+        "sparse grad name to param name. like: 'emb@Grad:emb'")
+        .SetDefault({});
     AddAttr<int>("Fanin", "How many clients send to this server.")
         .SetDefault(1);
     AddAttr<int>(kCheckpointBlockId,
diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
index f20442bad7c5bd96173b9d6efc4dceb13feacf5b..1cf2130d7a593077d1145b4f3be379c32557dd53 100644
--- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h
@@ -16,8 +16,10 @@ limitations under the License. */
 
 #include <stdint.h>
 #include <atomic>
+#include <memory>
 #include <set>
 #include <string>
+#include <unordered_map>
 #include <utility>
 #include <vector>
 
@@ -35,6 +37,7 @@ namespace operators {
 constexpr char kOptimizeBlocks[] = "optimize_blocks";
 constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id";
 constexpr char kCheckpointBlockId[] = "checkpint_block_id";
+constexpr char kSparseGradToParam[] = "sparse_grad_to_param";
 
 void RunServer(std::shared_ptr<distributed::RPCServer> service);
 
diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc
index 3fd0700a077321d931e87b1d94c3637d167c9eff..8e9846b1fc89953526149be3838103526d5c441b 100644
--- a/paddle/fluid/operators/distributed_ops/recv_op.cc
+++ b/paddle/fluid/operators/distributed_ops/recv_op.cc
@@ -50,17 +50,18 @@ class RecvOp : public framework::OperatorBase {
 
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &ctx = *pool.Get(place);
+    auto trainer_id = Attr<int>("trainer_id");
 
     distributed::RPCClient *rpc_client =
-        distributed::RPCClient::GetInstance<RPCCLIENT_T>(
-            Attr<int>("trainer_id"));
+        distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
 
     std::vector<std::string> recv_varnames =
         Attr<std::vector<std::string>>("recv_varnames");
 
     if (recv_varnames.size() > 0) {
       auto recv_functor = distributed::ParameterRecv<float>();
-      auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {});
+      auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {},
+                                             trainer_id);
       recv_functor(rpc_ctx, scope);
     } else {
       if (with_barrier) {
diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc
index b08cd0942f8c89b60d722c931d0cec2063b96578..5731bcc15a07074b3d77873c5cdcbb70dc41aba8 100644
--- a/paddle/fluid/operators/distributed_ops/send_op.cc
+++ b/paddle/fluid/operators/distributed_ops/send_op.cc
@@ -42,6 +42,7 @@ class SendOp : public framework::OperatorBase {
 
     auto epmap = Attr<std::vector<std::string>>("epmap");
     int sync_send = Attr<int>("sync_mode");
+    auto trainer_id = Attr<int>("trainer_id");
 
     auto send_varnames = Attr<std::vector<std::string>>("send_varnames");
     auto height_sections = Attr<std::vector<int64_t>>("sections");
@@ -51,7 +52,7 @@ class SendOp : public framework::OperatorBase {
       if (distributed::Communicator::GetInstance() == nullptr) {
         auto send_functor = distributed::ParameterSend<float>();
         auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap,
-                                               height_sections);
+                                               height_sections, trainer_id);
         send_functor(rpc_ctx, scope, true);
       } else {
         distributed::Communicator::GetInstance()->Send(ins[0], scope);
@@ -62,8 +63,7 @@ class SendOp : public framework::OperatorBase {
       auto& ctx = *pool.Get(place);
 
       distributed::RPCClient* rpc_client =
-          distributed::RPCClient::GetInstance<RPCCLIENT_T>(
-              Attr<int>("trainer_id"));
+          distributed::RPCClient::GetInstance<RPCCLIENT_T>(trainer_id);
 
       std::vector<distributed::VarHandlePtr> rets;
       for (size_t i = 0; i < ins.size(); i++) {
diff --git a/paddle/fluid/operators/distributed_ops/split_byref_op.cc b/paddle/fluid/operators/distributed_ops/split_byref_op.cc
index d65e7ffe5a492fe5df038bb6bd469e09de6f95ca..43980107c14176f1751a3db2858c80cb65c764de 100644
--- a/paddle/fluid/operators/distributed_ops/split_byref_op.cc
+++ b/paddle/fluid/operators/distributed_ops/split_byref_op.cc
@@ -31,14 +31,16 @@ class SplitByrefOp : public framework::OperatorWithKernel {
     auto in_dims = ctx->GetInputDim("X");
     auto outs_names = ctx->Outputs("Out");
     size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
-    std::vector<int> sections = static_cast<std::vector<int>>(
-        ctx->Attrs().Get<std::vector<int>>("sections"));
+    auto sections = ctx->Attrs().Get<std::vector<int>>("sections");
     const size_t outs_number = outs_names.size();
     std::vector<framework::DDim> outs_dims;
     outs_dims.reserve(outs_number);
 
     if (num > 0) {
-      int64_t in_axis_dim = in_dims[0];
+      int64_t in_axis_dim = 0;
+      if (ctx->IsRuntime()) {
+        in_axis_dim = in_dims[0];
+      }
       PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
                         "tensor split does not result"
                         " in an equal division");
diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu
index 33bd275e5cc507ec700b3694cd8b1df9672ec512..7d551106756070a14f94f39f19b775d022d90777 100644
--- a/paddle/fluid/operators/fake_quantize_op.cu
+++ b/paddle/fluid/operators/fake_quantize_op.cu
@@ -235,11 +235,13 @@ struct FindRangeAbsMaxFunctor<platform::CUDADeviceContext, T> {
 
     int g_find_max;
     memory::Copy(platform::CPUPlace(), &g_find_max, gpu_place, find_max,
-                 sizeof(int), 0);
+                 sizeof(int), ctx.stream());
+    ctx.Wait();
     if (g_find_max) {
       int len;
       memory::Copy(platform::CPUPlace(), &len, gpu_place, out_size_data,
-                   sizeof(int), 0);
+                   sizeof(int), ctx.stream());
+      ctx.Wait();
       FindAbsMaxFunctor<platform::CUDADeviceContext, T>()(ctx, scale_arr, len,
                                                           out_scale_data);
     }
@@ -258,25 +260,26 @@ struct FindMovingAverageAbsMaxFunctor<platform::CUDADeviceContext, T> {
     const auto gpu_place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
 
     T accum;
-    memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data<T>(),
-                 sizeof(T), 0);
     T state;
-    memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data<T>(),
-                 sizeof(T), 0);
     T scale;
+    memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data<T>(),
+                 sizeof(T), ctx.stream());
+    memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data<T>(),
+                 sizeof(T), ctx.stream());
     memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T),
-                 0);
-
+                 ctx.stream());
+    ctx.Wait();
     state = rate * state + 1;
     accum = rate * accum + scale;
     scale = accum / state;
 
     memory::Copy(gpu_place, out_accum->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &accum, sizeof(T), 0);
+                 platform::CPUPlace(), &accum, sizeof(T), ctx.stream());
     memory::Copy(gpu_place, out_state->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &state, sizeof(T), 0);
+                 platform::CPUPlace(), &state, sizeof(T), ctx.stream());
     memory::Copy(gpu_place, out_scale->mutable_data<T>(gpu_place),
-                 platform::CPUPlace(), &scale, sizeof(T), 0);
+                 platform::CPUPlace(), &scale, sizeof(T), ctx.stream());
+    ctx.Wait();
   }
 };
 
diff --git a/paddle/fluid/operators/grid_sampler_op.cc b/paddle/fluid/operators/grid_sampler_op.cc
index 241184c6f4a19a1da0d6d75c5d4e2b372c14e9da..57a1fcd42da04a766ebd8713e3863f259b3784ac 100644
--- a/paddle/fluid/operators/grid_sampler_op.cc
+++ b/paddle/fluid/operators/grid_sampler_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/grid_sampler_op.h"
+#include <memory>
 #include "paddle/fluid/framework/op_registry.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
@@ -40,10 +41,12 @@ class GridSampleOp : public framework::OperatorWithKernel {
                    "Input(X) of GridSampleOp should be 4-D Tensor.");
     PADDLE_ENFORCE(grid_dims.size() == 4,
                    "Input(Grid) of GridSampleOp should be 4-D Tensor.");
-    PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2.");
-    PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0],
-                      "Input(X) and Input(Grid) dims[0] should be equal.");
+    if (ctx->IsRuntime() || grid_dims[3] > 0) {
+      PADDLE_ENFORCE(grid_dims[3] == 2, "Input(Grid) dims[3] should be 2.");
+    }
     if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(grid_dims[0], x_dims[0],
+                        "Input(X) and Input(Grid) dims[0] should be equal.");
       PADDLE_ENFORCE_EQ(
           grid_dims[1], x_dims[2],
           "Input(X) dims[2] and Input(Grid) dims[1] should be equal.");
diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc
index 752d706cbfab8eb3027fe9610c25b7400ecfed1d..7437d7bd2092044b6634aa720fbee1a02b630bcd 100644
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -47,8 +47,11 @@ class GRUOp : public framework::OperatorWithKernel {
     auto weight_dims = ctx->GetInputDim("Weight");
     int input_size = input_dims[1];
     int frame_size = weight_dims[0];
-    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
-                      "The input_size must be 3 times of frame_size in GRUOp.");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(
+          input_size, frame_size * 3,
+          "The input_size must be 3 times of frame_size in GRUOp.");
+    }
     PADDLE_ENFORCE_EQ(
         weight_dims[1], frame_size * 3,
         "The shape of Weight matrix must be [frame_size, frame_size * 3].");
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
index 82c8171ca52ffb128df103f27bafbdba1e72e52f..7cfe0aabcb7f3ce86ccc3a9a1c54b3b60d384aa1 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -238,6 +238,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
       zero(dev_ctx, w_grad, static_cast<T>(0.0));
       bit_code->MulGradWeight(pre_out_grad, w_grad, in);
     } else {
+      PADDLE_ENFORCE(path != nullptr,
+                     "Sparse mode should not be used without custom tree!");
       framework::Vector<int64_t> real_rows = PathToRows(*path);
       auto* w_grad =
           ctx.Output<framework::SelectedRows>(framework::GradVarName("W"));
diff --git a/paddle/fluid/operators/huber_loss_op.cc b/paddle/fluid/operators/huber_loss_op.cc
index a72db384c1f09f66ecf7ce85271d6263bbdcb523..157f13ffbc3d52180fac0efac07dd23112d692e7 100644
--- a/paddle/fluid/operators/huber_loss_op.cc
+++ b/paddle/fluid/operators/huber_loss_op.cc
@@ -31,13 +31,18 @@ class HuberLossOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
 
-    PADDLE_ENFORCE_EQ(x_dims, y_dims);
     PADDLE_ENFORCE_EQ(x_dims.size(), 2,
                       "The rank of Input(X) must be 2 and the shape is "
                       "[batch_size, 1].");
-    PADDLE_ENFORCE_EQ(x_dims[1], 1,
-                      "Each row of Input(X) contains a real value, "
-                      "so the 2nd dimension of Input(X) must be 1.");
+    if (ctx->IsRuntime() ||
+        (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) {
+      PADDLE_ENFORCE_EQ(x_dims, y_dims, "Shape of X and Y should be same");
+    }
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(x_dims[1], 1,
+                        "Each row of Input(X) contains a real value, "
+                        "so the 2nd dimension of Input(X) must be 1.");
+    }
 
     ctx->SetOutputDim("Residual", x_dims);
     ctx->SetOutputDim("Out", {x_dims[0], 1});
diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc
index edee8c08d070742d54f761083592466658a445c9..900b0c636ddafc8c033560adf58d596eb696621f 100644
--- a/paddle/fluid/operators/interpolate_op.cc
+++ b/paddle/fluid/operators/interpolate_op.cc
@@ -37,10 +37,24 @@ class InterpolateOp : public framework::OperatorWithKernel {
         "Interpolation method can only be \"bilinear\" or \"nearest\".");
 
     auto dim_x = ctx->GetInputDim("X");  // NCHW format
-    int out_h = ctx->Attrs().Get<int>("out_h");
-    int out_w = ctx->Attrs().Get<int>("out_w");
     PADDLE_ENFORCE_EQ(dim_x.size(), 4, "X's dimension must be 4");
 
+    int out_h, out_w;
+    float scale = ctx->Attrs().Get<float>("scale");
+    if (scale > 0) {
+      // round down
+      out_h = static_cast<int>(dim_x[2] * scale);
+      out_w = static_cast<int>(dim_x[3] * scale);
+      // protect when input shape is -1
+      out_h = out_h > 0 ? out_h : -1;
+      out_w = out_w > 0 ? out_w : -1;
+    } else {
+      out_h = ctx->Attrs().Get<int>("out_h");
+      out_w = ctx->Attrs().Get<int>("out_w");
+      PADDLE_ENFORCE_GT(out_h, 0, "out_h should be greater than 0.");
+      PADDLE_ENFORCE_GT(out_w, 0, "out_w should be greater than 0.");
+    }
+
     if (ctx->HasInput("OutSize") && ctx->IsRuntime()) {
       auto out_size_dim = ctx->GetInputDim("OutSize");
       PADDLE_ENFORCE_EQ(out_size_dim.size(), 1,
@@ -49,6 +63,7 @@ class InterpolateOp : public framework::OperatorWithKernel {
       ctx->ShareLoD("X", "Out");
       return;
     }
+
     std::vector<int64_t> dim_out({dim_x[0], dim_x[1], out_h, out_w});
     ctx->SetOutputDim("Out", framework::make_ddim(dim_out));
   }
@@ -77,6 +92,7 @@ class InterpolateOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddAttr<int>("out_h", "output height of interpolate op.");
     AddAttr<int>("out_w", "output width of interpolate op.");
+    AddAttr<float>("scale", "scale factor of interpolate op.").SetDefault(0.);
     AddAttr<std::string>("interp_method",
                          "(string, default \"bilinear\"), interpolation "
                          "method, can be \"bilinear\" for "
diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu
index b887878ea2291d6c56fec91738784e338606b84f..35177a4e9ade26831f50de84bbb943d856cb98d9 100644
--- a/paddle/fluid/operators/interpolate_op.cu
+++ b/paddle/fluid/operators/interpolate_op.cu
@@ -192,9 +192,21 @@ class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
     auto* output = ctx.Output<Tensor>("Out");
     auto* input_data = input->data<T>();
 
+    int n = input->dims()[0];
+    int c = input->dims()[1];
+    int in_h = input->dims()[2];
+    int in_w = input->dims()[3];
+
     auto interp_method = ctx.Attr<std::string>("interp_method");
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
+
+    float scale = ctx.Attr<float>("scale");
+    if (scale > 0) {
+      out_h = in_h * scale;
+      out_w = in_w * scale;
+    }
+
     auto out_size = ctx.Input<Tensor>("OutSize");
     if (out_size != nullptr) {
       Tensor sizes;
@@ -207,11 +219,6 @@ class InterpolateOpCUDAKernel : public framework::OpKernel<T> {
     bool align_corners = ctx.Attr<bool>("align_corners");
     int align_mode = ctx.Attr<int>("align_mode");
 
-    int n = input->dims()[0];
-    int c = input->dims()[1];
-    int in_h = input->dims()[2];
-    int in_w = input->dims()[3];
-
     auto* output_data =
         output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
 
@@ -268,14 +275,20 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
     math::SetConstant<platform::CUDADeviceContext, T> zero;
     zero(device_ctx, input_grad, static_cast<T>(0.0));
 
+    int n = input_grad->dims()[0];
+    int c = input_grad->dims()[1];
+    int in_h = input_grad->dims()[2];
+    int in_w = input_grad->dims()[3];
+
     auto interp_method = ctx.Attr<std::string>("interp_method");
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
+    float scale = ctx.Attr<float>("scale");
+    if (scale > 0) {
+      out_h = in_h * scale;
+      out_w - in_w* scale;
+    }
     auto out_size = ctx.Input<Tensor>("OutSize");
-
-    bool align_corners = ctx.Attr<bool>("align_corners");
-    int align_mode = ctx.Attr<int>("align_mode");
-
     if (out_size != nullptr) {
       Tensor sizes;
       framework::TensorCopy(*out_size, platform::CPUPlace(), &sizes);
@@ -284,10 +297,8 @@ class InterpolateGradOpCUDAKernel : public framework::OpKernel<T> {
       out_w = size_data[1];
     }
 
-    int n = input_grad->dims()[0];
-    int c = input_grad->dims()[1];
-    int in_h = input_grad->dims()[2];
-    int in_w = input_grad->dims()[3];
+    bool align_corners = ctx.Attr<bool>("align_corners");
+    int align_mode = ctx.Attr<int>("align_mode");
 
     int in_hw = in_h * in_w;
     int out_hw = out_h * out_w;
diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h
index c631ad1dd158ce114169602f073d69b2291b5b3b..5fd42809dfec6dd821c9b27bc97d61de94b5d326 100644
--- a/paddle/fluid/operators/interpolate_op.h
+++ b/paddle/fluid/operators/interpolate_op.h
@@ -163,9 +163,21 @@ class InterpolateKernel : public framework::OpKernel<T> {
     auto* input = ctx.Input<Tensor>("X");
     auto* output = ctx.Output<Tensor>("Out");
 
+    const int n = input->dims()[0];
+    const int c = input->dims()[1];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
+
     std::string interp_method = ctx.Attr<std::string>("interp_method");
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
+
+    float scale = ctx.Attr<float>("scale");
+    if (scale > 0) {
+      out_h = static_cast<int>(in_h * scale);
+      out_w = static_cast<int>(in_w * scale);
+    }
+
     auto out_size = ctx.Input<Tensor>("OutSize");
     if (out_size != nullptr) {
       auto out_size_data = out_size->data<int>();
@@ -175,11 +187,6 @@ class InterpolateKernel : public framework::OpKernel<T> {
     bool align_corners = ctx.Attr<bool>("align_corners");
     int align_mode = ctx.Attr<int>("align_mode");
 
-    const int n = input->dims()[0];
-    const int c = input->dims()[1];
-    const int in_h = input->dims()[2];
-    const int in_w = input->dims()[3];
-
     output->mutable_data<T>({n, c, out_h, out_w}, ctx.GetPlace());
     auto& device_ctx =
         ctx.template device_context<platform::CPUDeviceContext>();
@@ -221,23 +228,31 @@ class InterpolateGradKernel : public framework::OpKernel<T> {
     auto* input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* output_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
+    const int n = input->dims()[0];
+    const int c = input->dims()[1];
+    const int in_h = input->dims()[2];
+    const int in_w = input->dims()[3];
+
     std::string interp_method = ctx.Attr<std::string>("interp_method");
     int out_h = ctx.Attr<int>("out_h");
     int out_w = ctx.Attr<int>("out_w");
+
+    float scale = ctx.Attr<float>("scale");
+    if (scale > 0) {
+      out_h = static_cast<int>(in_h * scale);
+      out_w = static_cast<int>(in_w * scale);
+    }
+
     auto out_size = ctx.Input<Tensor>("OutSize");
     if (out_size != nullptr) {
       auto out_size_data = out_size->data<int>();
       out_h = out_size_data[0];
       out_w = out_size_data[1];
     }
+
     bool align_corners = ctx.Attr<bool>("align_corners");
     int align_mode = ctx.Attr<int>("align_mode");
 
-    const int n = input->dims()[0];
-    const int c = input->dims()[1];
-    const int in_h = input->dims()[2];
-    const int in_w = input->dims()[3];
-
     input_grad->mutable_data<T>({n, c, in_h, in_w}, ctx.GetPlace());
     auto& device_ctx =
         ctx.template device_context<platform::CPUDeviceContext>();
diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc
index a43f22c0496f89943d2fd5110446f1aae6a99315..a7c5d6305b09afb93be0b3b8524a91bd53e719fe 100644
--- a/paddle/fluid/operators/kldiv_loss_op.cc
+++ b/paddle/fluid/operators/kldiv_loss_op.cc
@@ -35,8 +35,10 @@ class KLDivLossOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(dim_x.size(), dim_target.size(),
                       "Input(X) rank and Input(Target) rank should be same.");
     for (int i = 0; i < dim_x.size(); i++) {
-      PADDLE_ENFORCE_EQ(dim_x[i], dim_target[i],
-                        "Input(X) and Input(Target) should in same shape.");
+      if (ctx->IsRuntime() || (dim_x[i] > 0 && dim_target[i] > 0)) {
+        PADDLE_ENFORCE_EQ(dim_x[i], dim_target[i],
+                          "Input(X) and Input(Target) should in same shape.");
+      }
     }
 
     auto reduction = ctx->Attrs().Get<std::string>("reduction");
diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc
index 9b1a854a312551732424e0d127a43328b8db6085..1aac60ef36c62703f8f9a3b896c17a1483642f53 100644
--- a/paddle/fluid/operators/layer_norm_op.cc
+++ b/paddle/fluid/operators/layer_norm_op.cc
@@ -46,11 +46,18 @@ class LayerNormOp : public framework::OperatorWithKernel {
     int right = static_cast<int>(matrix_dim[1]);
     if (ctx->HasInput("Scale")) {
       PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
+
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right,
+                          "scale should with right");
+      }
     }
     if (ctx->HasInput("Bias")) {
       PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right,
+                          "bias should with right");
+      }
     }
 
     ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc
index fa09cb61e64aacd2aebf1ecf9826a15f9dcef877..a94704a7282f4962c981e1a106cfe5e056fc0f90 100644
--- a/paddle/fluid/operators/linear_chain_crf_op.cc
+++ b/paddle/fluid/operators/linear_chain_crf_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/linear_chain_crf_op.h"
+
 #include <memory>
 
 namespace paddle {
@@ -152,12 +153,19 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
     auto transition_dims = ctx->GetInputDim("Transition");
     PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
                       "The Input(Transition) should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(
-        transition_dims[0] - 2, transition_dims[1],
-        "An invalid dimension for the Input(Transition), which should "
-        "be a 2-D tensor with shape [(D + 2) x D].");
-    PADDLE_ENFORCE_EQ(
-        emission_dims[1], transition_dims[1],
+    bool check = true;
+    if ((!ctx->IsRuntime()) &&
+        (transition_dims[0] <= 0 || transition_dims[1] <= 0)) {
+      check = false;
+    }
+    if (check) {
+      PADDLE_ENFORCE_EQ(
+          transition_dims[0] - 2, transition_dims[1],
+          "An invalid dimension for the Input(Transition), which should "
+          "be a 2-D tensor with shape [(D + 2) x D].");
+    }
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, emission_dims[1], transition_dims[1],
         "The 2nd dimension of the Input(Emission) and the Input(Transition) "
         "should be equal to the tag number.");
 
@@ -165,8 +173,8 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
                    "The Input(Label) should be a 2-D tensor with the 2nd "
                    "dimensions fixed to 1.");
-    PADDLE_ENFORCE_EQ(
-        emission_dims[0], label_dims[0],
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, emission_dims[0], label_dims[0],
         "The height of Input(Emission) and the height of Input(Label) "
         "should be the same.");
 
@@ -211,12 +219,19 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
     auto transition_exps_dims = ctx->GetInputDim("TransitionExps");
     PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2,
                       "The Input(TransitionExps) should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(
-        transition_exps_dims[0] - 2, transition_exps_dims[1],
-        "An invalid dimension for the Input(TransitionExps), which should "
-        "be a 2-D tensor with shape [(D + 2) x D].");
-    PADDLE_ENFORCE_EQ(
-        emission_exps_dims[1], transition_exps_dims[1],
+    bool check = true;
+    if ((!ctx->IsRuntime()) &&
+        (transition_exps_dims[0] <= 0 || transition_exps_dims[1] <= 0)) {
+      check = false;
+    }
+    if (check) {
+      PADDLE_ENFORCE_EQ(
+          transition_exps_dims[0] - 2, transition_exps_dims[1],
+          "An invalid dimension for the Input(TransitionExps), which should "
+          "be a 2-D tensor with shape [(D + 2) x D].");
+    }
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, emission_exps_dims[1], transition_exps_dims[1],
         "The 2nd dimension of the Input(EmissionExps) and the "
         "Input(TransitionExps) should be equal to the tag number.");
 
@@ -224,8 +239,8 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
                    "The Input(Label) should be a 2-D tensor with the 2nd "
                    "dimensions fixed to 1.");
-    PADDLE_ENFORCE_EQ(
-        emission_exps_dims[0], label_dims[0],
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, emission_exps_dims[0], label_dims[0],
         "The height of Input(EmissionExps) and the height of Input(Label) "
         "should be the same.");
 
diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f4aeb062d8dfae31a72b8ebccb3d377276662da6
--- /dev/null
+++ b/paddle/fluid/operators/linspace_op.cc
@@ -0,0 +1,84 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/linspace_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LinspaceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Start"),
+                   "Input(Start) of LinspaceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Stop"),
+                   "Input(Stop) of LinspaceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Num"),
+                   "Input(Num) of LinspaceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(OUt) of LinspaceOp should not be null.");
+
+    auto s_dims = ctx->GetInputDim("Start");
+    PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1),
+                   "The shape of Input(Start) should be [1].");
+
+    auto e_dims = ctx->GetInputDim("Stop");
+    PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1),
+                   "The shape of Input(Stop) should be [1].");
+
+    auto step_dims = ctx->GetInputDim("Num");
+    PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1),
+                   "The shape of Input(Num) should be [1].");
+
+    ctx->SetOutputDim("Out", {-1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(
+        ctx.Input<framework::Tensor>("Start")->type(), ctx.device_context(),
+        layout_, library_);
+  }
+};
+
+class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Start",
+             "First entry in the sequence. It is a tensor of shape [1], should "
+             "be of type float32 or float64.");
+    AddInput("Stop",
+             "Last entry in the sequence. It is a tensor of shape [1], should "
+             "be of type float32 or float64.");
+    AddInput("Num",
+             "Number of entry in the sequence. It is a tensor of shape [1], "
+             "should be of type int32.");
+    AddOutput("Out", "A sequence of numbers.");
+    AddComment(R"DOC(
+    Return fixed number of evenly spaced values within a given interval. First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker);
+REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel<float>,
+                       ops::CPULinspaceKernel<double>);
diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..90bd17cda0e0d1f78810233537bb502f9115fbd0
--- /dev/null
+++ b/paddle/fluid/operators/linspace_op.cu
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/linspace_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
+  CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+}
+
+template <typename T>
+__global__ void LinspaceSpecialKernel(T start, T* out) {
+  out[0] = start;
+}
+
+template <typename T>
+class CUDALinspaceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* start_t = context.Input<framework::Tensor>("Start");
+    auto* stop_t = context.Input<framework::Tensor>("Stop");
+    auto* num_t = context.Input<framework::Tensor>("Num");
+    auto* out = context.Output<framework::Tensor>("Out");
+
+    framework::Tensor n;
+    framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
+    T start = n.data<T>()[0];
+    framework::TensorCopy(*stop_t, platform::CPUPlace(), &n);
+    T stop = n.data<T>()[0];
+    framework::TensorCopy(*num_t, platform::CPUPlace(), &n);
+    int32_t num = n.data<int32_t>()[0];
+
+    PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");
+
+    out->Resize(framework::make_ddim({num}));
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+
+    T step = 0;
+    if (num != 1) {
+      step = (stop - start) / (num - 1);
+    }
+
+    auto stream = context.cuda_device_context().stream();
+    int block = 512;
+    int grid = (num + block - 1) / block;
+    LinspaceKernel<T><<<grid, block, 0, stream>>>(start, step, num, out_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel<float>,
+                        ops::CUDALinspaceKernel<double>);
diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b1fcac73b0ad249aa19859bde770a8554cdb7408
--- /dev/null
+++ b/paddle/fluid/operators/linspace_op.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <functional>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CPULinspaceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    T start = context.Input<framework::Tensor>("Start")->data<T>()[0];
+    T stop = context.Input<framework::Tensor>("Stop")->data<T>()[0];
+    int32_t num = context.Input<framework::Tensor>("Num")->data<int32_t>()[0];
+    auto* out = context.Output<framework::Tensor>("Out");
+    PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");
+
+    out->Resize(framework::make_ddim({num}));
+
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+
+    if (num > 1) {
+      T step = (stop - start) / (num - 1);
+      T value = start;
+      for (int i = 0; i < num; ++i) {
+        out_data[i] = value;
+        value += step;
+      }
+    } else {
+      out_data[0] = start;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc
index e0ab02cd90cdee848250a6aba882b0cb0c17abd7..458037c5aca6af4c8c97b2da630c35929770c156 100644
--- a/paddle/fluid/operators/lod_reset_op.cc
+++ b/paddle/fluid/operators/lod_reset_op.cc
@@ -30,10 +30,10 @@ class LoDResetOp : public framework::OperatorWithKernel {
 
     if (!ctx->HasInput("Y")) {
       auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
-      PADDLE_ENFORCE_GT(level0.size(), 1,
+      PADDLE_ENFORCE_GT(level0.size(), 0,
                         "If Input(Y) not provided, the target lod should be "
                         "specified by attribute `target_lod`.");
-    } else {
+    } else if (ctx->IsRuntime()) {
       ctx->ShareLoD("Y", "Out");
     }
 
@@ -48,6 +48,23 @@ class LoDResetOp : public framework::OperatorWithKernel {
   }
 };
 
+class LoDResetOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(framework::InferVarTypeContext *ctx) const override {
+    auto x_var_name = ctx->Input("X").front();
+    auto out_var_name = ctx->Output("Out").front();
+    if (ctx->HasInput("Y")) {
+      auto y_var_name = ctx->Input("Y").front();
+      auto y_lod_level = std::max(ctx->GetLoDLevel(y_var_name), 1);
+      ctx->SetLoDLevel(out_var_name, y_lod_level);
+    } else {
+      ctx->SetLoDLevel(out_var_name, 1);
+    }
+    ctx->SetDataType(out_var_name, ctx->GetDataType(x_var_name));
+    ctx->SetType(out_var_name, paddle::framework::proto::VarType::LOD_TENSOR);
+  }
+};
+
 class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -177,9 +194,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference,
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker,
-                  ops::LoDResetGradDescMaker);
+                  ops::LoDResetGradDescMaker, ops::LoDResetOpVarTypeInference);
 REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp,
                   ops::LoDResetGradNoNeedBufferVarInference);
+
 REGISTER_OP_CPU_KERNEL(
     lod_reset, ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
     ops::LoDResetKernel<paddle::platform::CPUPlace, double>,
diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h
index d36aa0ce025a1c0f717913131fcc75040d16afac..1c2f0b0ac8ab4be35e4716acc7be3f05b9d63805 100644
--- a/paddle/fluid/operators/lod_reset_op.h
+++ b/paddle/fluid/operators/lod_reset_op.h
@@ -63,7 +63,7 @@ class LoDResetKernel : public framework::OpKernel<T> {
                       "Target LoD should be a vector end with the "
                       "first dimension of Input(X).");
     for (size_t i = 0; i < level0.size() - 1; ++i) {
-      PADDLE_ENFORCE(level0[i + 1] > level0[i],
+      PADDLE_ENFORCE(level0[i + 1] >= level0[i],
                      "Target LoD should be an ascending vector.");
     }
 
diff --git a/paddle/fluid/operators/log_loss_op.cc b/paddle/fluid/operators/log_loss_op.cc
index e8850a1e582dc5c0a9ad64d26ba9b824349ee4e3..0048c75ccf04687b42f990dc5aa79541359645c1 100644
--- a/paddle/fluid/operators/log_loss_op.cc
+++ b/paddle/fluid/operators/log_loss_op.cc
@@ -31,14 +31,18 @@ class LogLossOp : public framework::OperatorWithKernel {
     auto pred_dims = ctx->GetInputDim("Predicted");
     auto label_dims = ctx->GetInputDim("Labels");
 
-    PADDLE_ENFORCE_EQ(pred_dims, label_dims);
+    if (ctx->IsRuntime() || (framework::product(pred_dims) > 0 &&
+                             framework::product(label_dims) > 0)) {
+      PADDLE_ENFORCE_EQ(pred_dims, label_dims);
+    }
     PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
                       "The rank of Input(Predicted) must be 2 and the shape is "
                       "[batch_size, 1].");
-    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
-                      "Each row of Input(Predicted) contains a real value, "
-                      "so the 2nd dimension of Input(X) must be 1.");
-
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(pred_dims[1], 1,
+                        "Each row of Input(Predicted) contains a real value, "
+                        "so the 2nd dimension of Input(X) must be 1.");
+    }
     ctx->SetOutputDim("Loss", {pred_dims[0], 1});
     ctx->ShareLoD("Predicted", "Loss");
   }
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index 04323eee02c8dbed6eeffef67ef75b18f351e46b..8b7d7a52704d5452487373d38d75626ea2b239c8 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lookup_table_op.h"
+
+#include <memory>
+
+#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
 #include "paddle/fluid/framework/var_type_inference.h"
 
 namespace paddle {
@@ -119,6 +123,29 @@ or not. And the output only shares the LoD information with input Ids.
   }
 };
 
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LookupTableGradOpNoBuffer, "W");
+
+class LookupTableGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+
+    op->SetType("lookup_table_grad");
+
+    op->SetInput("W", Input("W"));
+    op->SetInput("Ids", Input("Ids"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+
+    op->SetOutput(framework::GradVarName("W"), InputGrad("W"));
+
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 class LookupTableOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -131,7 +158,8 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("Out"));
+    auto data_type = framework::GetDataTypeOfVar(
+        ctx.InputVar(framework::GradVarName("Out")));
     return framework::OpKernelType(data_type, ctx.device_context());
   }
 };
@@ -159,10 +187,11 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(lookup_table, ops::LookupTableOp,
-                  paddle::framework::DefaultGradOpDescMaker<true>,
-                  ops::LookupTableOpMaker);
+REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, ops::LookupTableOpMaker,
+                  ops::LookupTableGradOpDescMaker);
+
 REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad,
+                  ops::LookupTableGradOpNoBuffer,
                   ops::LookupTableOpGradVarTypeInference);
 
 REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>,
diff --git a/paddle/fluid/operators/lstm_unit_op.cc b/paddle/fluid/operators/lstm_unit_op.cc
index 0895c58f5f58afd444000ebeac7a92e3eb7778d3..47d695475c2e240d273fe873352cf5c213e2026e 100644
--- a/paddle/fluid/operators/lstm_unit_op.cc
+++ b/paddle/fluid/operators/lstm_unit_op.cc
@@ -34,10 +34,12 @@ class LstmUnitOp : public framework::OperatorWithKernel {
     auto c_prev_dims = ctx->GetInputDim("C_prev");
 
     PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0],
-                      "Batch size of inputs and states must be equal");
-    PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4,
-                      "Dimension of FC should equal to prev state * 4");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0],
+                        "Batch size of inputs and states must be equal");
+      PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4,
+                        "Dimension of FC should equal to prev state * 4");
+    }
 
     int b_size = c_prev_dims[0];  // batch size
     int s_dim = c_prev_dims[1];   // state dim
diff --git a/paddle/fluid/operators/lstmp_op.cc b/paddle/fluid/operators/lstmp_op.cc
index 2728aa8a4ee21a9e1fe3deddcdba4c35a6aba7bc..f31c177c92d0a9e4cc731c478ea8339b450f318a 100644
--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/lstmp_op.h"
+#include <memory>
 #include <string>
 
 namespace paddle {
@@ -45,6 +46,7 @@ class LSTMPOp : public framework::OperatorWithKernel {
                    "Output(BatchHidden) of LSTMP operator should not be null.");
 
     auto in_dims = ctx->GetInputDim("Input");
+
     PADDLE_ENFORCE_EQ(in_dims.size(), 2,
                       "Input(X)'s rank of LSTMP operator must be 2.");
 
@@ -269,13 +271,47 @@ Users can choose to use fully-connected operator before LSTMP operator.
   }
 };
 
+class LSTMPGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
+    grad_op->SetType("lstmp_grad");
+    grad_op->SetInput("Weight", Input("Weight"));
+    grad_op->SetInput("ProjWeight", Input("ProjWeight"));
+    grad_op->SetInput("Bias", Input("Bias"));
+
+    grad_op->SetInput("Projection", Output("Projection"));
+    grad_op->SetInput("Cell", Output("Cell"));
+    grad_op->SetInput("BatchGate", Output("BatchGate"));
+    grad_op->SetInput("BatchCellPreAct", Output("BatchCellPreAct"));
+    grad_op->SetInput("BatchHidden", Output("BatchHidden"));
+    grad_op->SetInput("H0", Input("H0"));
+    grad_op->SetInput("C0", Input("C0"));
+
+    grad_op->SetInput(framework::GradVarName("Projection"),
+                      OutputGrad("Projection"));
+
+    grad_op->SetOutput(framework::GradVarName("Input"), InputGrad("Input"));
+    grad_op->SetOutput(framework::GradVarName("Weight"), InputGrad("Weight"));
+    grad_op->SetOutput(framework::GradVarName("ProjWeight"),
+                       InputGrad("ProjWeight"));
+    grad_op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias"));
+    grad_op->SetOutput(framework::GradVarName("H0"), InputGrad("H0"));
+    grad_op->SetOutput(framework::GradVarName("C0"), InputGrad("C0"));
+
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
 class LSTMPGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Projection"),
                    "Input(Projection) of LSTMP operator should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Cell"),
@@ -298,7 +334,8 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
         ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
     };
 
-    SetOutGradDim("Input");
+    ctx->SetOutputDim(framework::GradVarName("Input"),
+                      ctx->GetInputDim("BatchGate"));
     SetOutGradDim("Weight");
     SetOutGradDim("ProjWeight");
     SetOutGradDim("Bias");
@@ -310,7 +347,8 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
-        ctx.Input<framework::LoDTensor>("Input")->type(), ctx.device_context());
+        ctx.Input<framework::LoDTensor>("BatchGate")->type(),
+        ctx.device_context());
   }
 };
 
@@ -318,8 +356,7 @@ class LSTMPGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, ops::LSTMPGradMaker);
 REGISTER_OPERATOR(lstmp_grad, ops::LSTMPGradOp);
 REGISTER_OP_CPU_KERNEL(
     lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h
index c7d6e4205f8862526904e4fa767a2f4c4a2d8481..36da882639a235f27b4e5a9e77bf0813ea9c0ee3 100644
--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -267,7 +267,6 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
   }
 
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* input = ctx.Input<LoDTensor>("Input");
     auto* weight = ctx.Input<Tensor>("Weight");
     auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
     auto* bias = ctx.Input<Tensor>("Bias");
@@ -323,7 +322,8 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
       ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
     }
 
-    auto in_dims = input->dims();
+    // batch_gate dims equal to input dims
+    auto in_dims = batch_gate->dims();
     auto out_dims = cell_out->dims();
     framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
     int frame_size = static_cast<int>(in_dims[1] / 4);
diff --git a/paddle/fluid/operators/math/context_project.h b/paddle/fluid/operators/math/context_project.h
index bc0df3f3551c7a100d5d285cab585bb81c07fc5e..d6a4793a8ac3e10fb334d47a51b17c985360d2d8 100644
--- a/paddle/fluid/operators/math/context_project.h
+++ b/paddle/fluid/operators/math/context_project.h
@@ -87,7 +87,7 @@ template <typename DeviceContext, typename T>
 class ContextProjectFunctor {
  public:
   void operator()(const DeviceContext& context, const LoDTensor& in,
-                  const Tensor& padding_data, bool padding_trainable,
+                  const Tensor* padding_data, bool padding_trainable,
                   const int context_start, const int context_length,
                   const int context_stride, const int up_pad,
                   const int down_pad, Tensor* col) {
@@ -132,6 +132,7 @@ class ContextProjectFunctor {
       }
     }
     if (padding_trainable) {
+      PADDLE_ENFORCE_NOT_NULL(padding_data);
       for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
         Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
                                   static_cast<int>(lod_level_0[i + 1]));
@@ -150,7 +151,7 @@ class ContextProjectFunctor {
                 k + context_length < up_pad ? context_length : up_pad - k;
             Tensor out_t_sub = out_t.Slice(k * context_length,
                                            k * context_length + padding_size);
-            Tensor w_sub = padding_data.Slice(k, k + padding_size);
+            Tensor w_sub = padding_data->Slice(k, k + padding_size);
             framework::TensorCopy(w_sub, context.GetPlace(), context,
                                   &out_t_sub);
           }
@@ -180,7 +181,7 @@ class ContextProjectFunctor {
             Tensor out_t_sub = out_t.Slice(
                 (down_pad_begin_row + t) * context_length - padding_size,
                 (down_pad_begin_row + t) * context_length);
-            Tensor w_sub = padding_data.Slice(
+            Tensor w_sub = padding_data->Slice(
                 up_pad + padding_idx, up_pad + padding_idx + padding_size);
             framework::TensorCopy(w_sub, context.GetPlace(), context,
                                   &out_t_sub);
diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc
index b99115e44b31536f0fd0a9078b40d07949be86f0..647d4f14842ee38bbd8a5d07563ea29ff0432e1a 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor.cc
@@ -296,6 +296,7 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
     auto input_height = has_value_input->height();
     framework::SelectedRows& out = *output;
     std::set<int64_t> merged_row_set;
+    size_t row_num = 0;
     for (auto* input : inputs) {
       if (input->rows().size() == 0) {
         continue;
@@ -305,42 +306,71 @@ struct MergeAdd<platform::CPUDeviceContext, T> {
                         "dimension except for the first one");
       PADDLE_ENFORCE_EQ(input_height, input->height(),
                         "all input should have same height");
+      row_num += input->rows().size();
       merged_row_set.insert(input->rows().begin(), input->rows().end());
     }
-    std::vector<int64_t> merge_rows(merged_row_set.begin(),
-                                    merged_row_set.end());
-    if (sorted_result) {
-      std::sort(merge_rows.begin(), merge_rows.end());
-    }
-    std::unordered_map<int64_t, size_t> rows_to_id;
-    for (size_t i = 0; i < merge_rows.size(); ++i) {
-      rows_to_id[merge_rows[i]] = i;
-    }
-    out.set_rows(merge_rows);
+
     out.set_height(input_height);
     out.mutable_value()->mutable_data<T>(
         framework::make_ddim(
-            {static_cast<int64_t>(merge_rows.size()), input_width}),
+            {static_cast<int64_t>(merged_row_set.size()), input_width}),
         context.GetPlace());
+    auto* out_data = out.mutable_value()->data<T>();
 
-    math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
-    constant_functor(context, out.mutable_value(), 0.0);
+    if (merged_row_set.size() == row_num && !sorted_result) {
+      // no duplicated ids, just concat the result together
+      std::vector<int64_t> merge_rows;
+      merge_rows.reserve(row_num);
+      // concat rows
+      for (auto* in : inputs) {
+        merge_rows.insert(merge_rows.end(), in->rows().begin(),
+                          in->rows().end());
+      }
+      out.set_rows(merge_rows);
+      auto in_place = inputs[0]->place();
+      auto out_place = out.place();
+      int64_t copied_numel = 0;
+      for (auto* in : inputs) {
+        auto* in_data = in->value().data<T>();
+        auto in_numel = in->value().numel();
+        memory::Copy(boost::get<platform::CPUPlace>(out_place),
+                     out_data + copied_numel,
+                     boost::get<platform::CPUPlace>(in_place), in_data,
+                     in_numel * sizeof(T));
+        copied_numel += in_numel;
+      }
+    } else {
+      std::vector<int64_t> merge_rows(merged_row_set.begin(),
+                                      merged_row_set.end());
 
-    auto* out_data = out.mutable_value()->data<T>();
+      if (sorted_result) {
+        std::sort(merge_rows.begin(), merge_rows.end());
+      }
 
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-    for (auto* input : inputs) {
-      if (input->rows().size() == 0) {
-        continue;
+      out.set_rows(merge_rows);
+
+      math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
+      constant_functor(context, out.mutable_value(), 0.0);
+
+      std::unordered_map<int64_t, size_t> rows_to_id;
+      for (size_t i = 0; i < merge_rows.size(); ++i) {
+        rows_to_id[merge_rows[i]] = i;
       }
-      auto* input_data = input->value().data<T>();
-      auto& input_rows = input->rows();
-
-      for (size_t i = 0; i < input_rows.size(); i++) {
-        size_t out_i = rows_to_id[input_rows[i]];
-        elementwise_add_to<platform::CPUDeviceContext, T>(
-            context, &blas, static_cast<size_t>(input_width),
-            &input_data[i * input_width], &out_data[out_i * input_width]);
+
+      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+      for (auto* input : inputs) {
+        if (input->rows().size() == 0) {
+          continue;
+        }
+        auto* input_data = input->value().data<T>();
+        auto& input_rows = input->rows();
+
+        for (size_t i = 0; i < input_rows.size(); i++) {
+          size_t out_i = rows_to_id[input_rows[i]];
+          elementwise_add_to<platform::CPUDeviceContext, T>(
+              context, &blas, static_cast<size_t>(input_width),
+              &input_data[i * input_width], &out_data[out_i * input_width]);
+        }
       }
     }
   }
diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc
index aedb82da2f0fb2f15e1586d351af7c9d4364852b..5581b9e040272e224669d612409f88d61f794443 100644
--- a/paddle/fluid/operators/math/selected_rows_functor_test.cc
+++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc
@@ -13,8 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
+
+#include <memory>
 #include <vector>
 #include "gtest/gtest.h"
+
 #include "paddle/fluid/operators/math/math_function.h"
 
 TEST(selected_rows_functor, cpu_add) {
@@ -360,6 +363,69 @@ TEST(selected_rows_functor, cpu_merge_add_multi) {
   }
 }
 
+TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) {
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CPUDeviceContext ctx(cpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
+                                       float>
+      set_const;
+
+  int64_t height = 10;
+  int64_t row_numel = 8;
+
+  std::vector<int64_t> rows1{1, 3, 5, 7, 9};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows1{
+      new paddle::framework::SelectedRows(rows1, height)};
+  auto* in1_value = selected_rows1->mutable_value();
+  in1_value->mutable_data<float>(
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows1.size()), row_numel}),
+      cpu_place);
+  set_const(ctx, in1_value, 1.0);
+
+  std::vector<int64_t> rows2{0, 2, 4, 6, 8};
+  std::unique_ptr<paddle::framework::SelectedRows> selected_rows2{
+      new paddle::framework::SelectedRows(rows2, height)};
+  auto* in2_value = selected_rows2->mutable_value();
+  in2_value->mutable_data<float>(
+      paddle::framework::make_ddim(
+          {static_cast<int64_t>(rows2.size()), row_numel}),
+      cpu_place);
+  set_const(ctx, in2_value, 2.0);
+
+  std::unique_ptr<paddle::framework::SelectedRows> output{
+      new paddle::framework::SelectedRows()};
+  output->set_height(height);
+  paddle::operators::math::scatter::MergeAdd<paddle::platform::CPUDeviceContext,
+                                             float>
+      merge_add_functor;
+
+  std::vector<const paddle::framework::SelectedRows*> inputs;
+  inputs.push_back(selected_rows1.get());
+  inputs.push_back(selected_rows2.get());
+  merge_add_functor(ctx, inputs, output.get());
+
+  EXPECT_EQ(output->height(), height);
+  EXPECT_EQ(output->value().dims(),
+            paddle::framework::make_ddim({10, row_numel}));
+
+  std::vector<int64_t> ret_rows{1, 3, 5, 7, 9, 0, 2, 4, 6, 8};
+  EXPECT_EQ(output->rows(), ret_rows);
+
+  auto* out_data = output->value().data<float>();
+  for (size_t i = 0; i < ret_rows.size(); ++i) {
+    float data_value = 0;
+    if (i < 5) {
+      data_value = 1.0;
+    } else {
+      data_value = 2.0;
+    }
+    for (size_t j = 0; j < static_cast<size_t>(row_numel); ++j) {
+      EXPECT_EQ(out_data[i * row_numel + j], data_value);
+    }
+  }
+}
+
 TEST(selected_rows_functor, cpu_sum_to) {
   paddle::platform::CPUPlace cpu_place;
   paddle::platform::CPUDeviceContext ctx(cpu_place);
diff --git a/paddle/fluid/operators/merge_lod_tensor_op.cc b/paddle/fluid/operators/merge_lod_tensor_op.cc
index da7fa1b81d601f4dd03d6716de601a4b1abc7fa0..5edc233f6f73262c3d1b803aae0089f5b15d403d 100644
--- a/paddle/fluid/operators/merge_lod_tensor_op.cc
+++ b/paddle/fluid/operators/merge_lod_tensor_op.cc
@@ -164,7 +164,9 @@ class MergeLoDTensorInferShape : public framework::InferShapeBase {
 
     auto mask_dim = context->GetInputDim("Mask");
     PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
-    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+    if (context->IsRuntime() || mask_dim[1] > 0) {
+      PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+    }
 
     context->SetOutputDim("Out", context->GetInputDim("InTrue"));
   }
diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc
index 7db6dff2971ab7eab9d38d7b32e8a5cff1aacb3c..26e6ab1568d15362c7793fe1eb1e970e4a8946d7 100644
--- a/paddle/fluid/operators/metrics/accuracy_op.cc
+++ b/paddle/fluid/operators/metrics/accuracy_op.cc
@@ -41,10 +41,11 @@ class AccuracyOp : public framework::OperatorWithKernel {
     // it's the output of topk.
 
     PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2.");
-    PADDLE_ENFORCE_EQ(label_dim[1], 1, "label's second dimension must be 1");
-    PADDLE_ENFORCE_EQ(inference_dim[0], label_dim[0],
-                      "the inference tensor's num_rows must be"
-                      " the same as label.");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, label_dim[1], 1,
+                                 "label's second dimension must be 1");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, inference_dim[0], label_dim[0],
+                                 "the inference tensor's num_rows must be"
+                                 " the same as label.");
 
     ctx->SetOutputDim("Accuracy", {1});
     ctx->SetOutputDim("Correct", {1});
diff --git a/paddle/fluid/operators/metrics/auc_op.cc b/paddle/fluid/operators/metrics/auc_op.cc
index 5e33dd96064dffb2b7e8dd748163bac18d5e5eb3..001d26936886f12efc6eaa0333bb12e4e7118d67 100644
--- a/paddle/fluid/operators/metrics/auc_op.cc
+++ b/paddle/fluid/operators/metrics/auc_op.cc
@@ -28,12 +28,13 @@ class AucOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("Label"),
                    "Input of Label should not be null.");
     auto predict_width = ctx->GetInputDim("Predict")[1];
-    PADDLE_ENFORCE_EQ(predict_width, 2, "Only support binary classification");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, predict_width, 2,
+                                 "Only support binary classification");
     auto predict_height = ctx->GetInputDim("Predict")[0];
     auto label_height = ctx->GetInputDim("Label")[0];
 
-    PADDLE_ENFORCE_EQ(predict_height, label_height,
-                      "Out and Label should have same height.");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, predict_height, label_height,
+                                 "Out and Label should have same height.");
 
     int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
     int slide_steps = ctx->Attrs().Get<int>("slide_steps");
diff --git a/paddle/fluid/operators/metrics/precision_recall_op.cc b/paddle/fluid/operators/metrics/precision_recall_op.cc
index 1a67b134914053642377ec2623e68ab5a3e9ba50..f6d6ffc668c9aaa40e12e7289d4f97fc656e2c70 100644
--- a/paddle/fluid/operators/metrics/precision_recall_op.cc
+++ b/paddle/fluid/operators/metrics/precision_recall_op.cc
@@ -40,30 +40,40 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
     auto max_probs_dims = ctx->GetInputDim("MaxProbs");
     auto labels_dims = ctx->GetInputDim("Labels");
 
-    PADDLE_ENFORCE_EQ(max_probs_dims[1], 1,
-                      "Each instance contains one max probability, so the "
-                      "shape of Input(MaxProbs) should be [batch_size, 1].");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Indices"), max_probs_dims,
-                      "The shape of Input(Indices) should be [batch_size, 1].");
-    PADDLE_ENFORCE_EQ(max_probs_dims[0], labels_dims[0],
-                      "The 1st dimension of Input(MaxProbs) and "
-                      "Input(Labels) both are batch_size and the shape should "
-                      "be the same.");
-    PADDLE_ENFORCE_EQ(labels_dims[1], 1,
-                      "The 2nd dimension of Input(Labels) contains instance "
-                      "label and the shape should be equal to 1.");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(max_probs_dims[1], 1,
+                        "Each instance contains one max probability, so the "
+                        "shape of Input(MaxProbs) should be [batch_size, 1].");
+      PADDLE_ENFORCE_EQ(
+          ctx->GetInputDim("Indices"), max_probs_dims,
+          "The shape of Input(Indices) should bes same with max_probs_dims");
+      PADDLE_ENFORCE_EQ(
+          max_probs_dims[0], labels_dims[0],
+          "The 1st dimension of Input(MaxProbs) and "
+          "Input(Labels) both are batch_size and the shape should "
+          "be the same.");
+      PADDLE_ENFORCE_EQ(labels_dims[1], 1,
+                        "The 2nd dimension of Input(Labels) contains instance "
+                        "label and the shape should be equal to 1.");
+    }
     if (ctx->HasInput("Weights")) {
       auto weights_dims = ctx->GetInputDim("Weights");
-      PADDLE_ENFORCE_EQ(weights_dims,
-                        framework::make_ddim({max_probs_dims[0], 1}),
-                        "The shape of Input(Weights) should be "
-                        "[batch_size, 1].");
+
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(weights_dims,
+                          framework::make_ddim({max_probs_dims[0], 1}),
+                          "The shape of Input(Weights) should be "
+                          "[batch_size, 1].");
+      }
     }
     if (ctx->HasInput("StatesInfo")) {
       auto states_dims = ctx->GetInputDim("StatesInfo");
-      PADDLE_ENFORCE_EQ(states_dims, framework::make_ddim({cls_num, 4}),
-                        "The shape of Input(StatesInfo) should be "
-                        "[class_number, 4].");
+
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(states_dims, framework::make_ddim({cls_num, 4}),
+                          "The shape of Input(StatesInfo) should be "
+                          "[class_number, 4].");
+      }
     }
 
     // Layouts of BatchMetrics and AccumMetrics both are:
diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc
index 34571a38a14795a98ac8454cec606077727b5ffa..02a90d77b6e54475f4e722266d0a3b2046ea33ed 100644
--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/minus_op.h"
 
+#include <memory>
 #include <string>
 #include <vector>
 
@@ -38,9 +39,12 @@ class MinusOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
 
-    PADDLE_ENFORCE_EQ(
-        x_dims, y_dims,
-        "Minus operator must take two tensor with same num of elements");
+    if (ctx->IsRuntime() ||
+        (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) {
+      PADDLE_ENFORCE_EQ(
+          x_dims, y_dims,
+          "Minus operator must take two tensor with same num of elements");
+    }
     ctx->SetOutputDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
index bddca232e6c8a2a7fde998877006e37ee6d3d0dc..911c4d22ee5cd84c0b42646a1d3e62a0d765732e 100644
--- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc
@@ -39,13 +39,9 @@ struct bn_type_traits {
 
 class BatchNormMKLDNNHandler : public platform::MKLDNNHandler {
  public:
-  BatchNormMKLDNNHandler(
-      std::shared_ptr<batch_norm_fwd::primitive_desc> batch_norm_pd,
-      const platform::MKLDNNDeviceContext &dev_ctx, mkldnn::engine engine,
-      const std::string &base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {
-    batch_norm_pd_ = batch_norm_pd;
-  }
+  BatchNormMKLDNNHandler(const platform::MKLDNNDeviceContext &dev_ctx,
+                         mkldnn::engine engine, const std::string &base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {}
 
   std::shared_ptr<memory> AcquireScaleshiftMemoryFromPrimitive(void *ptr) {
     return this->AcquireMemoryFromPrimitive(
@@ -62,6 +58,26 @@ class BatchNormMKLDNNHandler : public platform::MKLDNNHandler {
         batch_norm_pd_->variance_primitive_desc(), ptr, "@variance_mem_p");
   }
 
+  std::shared_ptr<batch_norm_fwd::primitive_desc>
+  AcquireBatchNormPrimitiveDescriptor(const batch_norm_fwd::desc &bn_fwd_desc,
+                                      const mkldnn::engine &engine) {
+    const std::string key_batch_norm_fwd_pd = key_ + "@bn_fwd_pd";
+    auto batch_norm_pd =
+        std::static_pointer_cast<batch_norm_fwd::primitive_desc>(
+            dev_ctx_.GetBlob(key_batch_norm_fwd_pd));
+
+    if (batch_norm_pd == nullptr) {
+      batch_norm_pd_.reset(
+          new batch_norm_fwd::primitive_desc(bn_fwd_desc, engine));
+      dev_ctx_.SetBlob(key_batch_norm_fwd_pd, batch_norm_pd_);
+    } else {
+      batch_norm_pd_ = batch_norm_pd;
+      is_reusing_ = true;
+    }
+
+    return batch_norm_pd_;
+  }
+
   std::shared_ptr<batch_norm_fwd> AcquireTestTrainingBatchNormFwd(
       std::shared_ptr<memory> src_memory,
       std::shared_ptr<memory> scaleshift_memory,
@@ -213,7 +229,7 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const std::string key = BatchNormMKLDNNHandler::GetHash(
         src_tz, epsilon, flags, global_stats, input_format,
         ctx.op().Output("SavedMean"));
-    const std::string key_batch_norm_fwd_pd = key + "@bn_fwd_pd";
+    BatchNormMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
 
     auto user_src_md = platform::MKLDNNMemDesc(
         {src_tz}, platform::MKLDNNGetDataType<T>(), input_format);
@@ -222,13 +238,9 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
     auto batch_norm_fwd_desc =
         bn_fwd_types::op_desc{propagation, user_src_md, epsilon, flags};
-    auto batch_norm_fwd_pd = std::make_shared<batch_norm_fwd::primitive_desc>(
-        batch_norm_fwd_desc, mkldnn_engine);
-    // Save conv_pd/src_memory/weights_memory for backward pass
-    dev_ctx.SetBlob(key_batch_norm_fwd_pd, batch_norm_fwd_pd);
 
-    BatchNormMKLDNNHandler handler(batch_norm_fwd_pd, dev_ctx, mkldnn_engine,
-                                   key);
+    auto batch_norm_fwd_pd = handler.AcquireBatchNormPrimitiveDescriptor(
+        batch_norm_fwd_desc, mkldnn_engine);
 
     auto src_memory =
         handler.AcquireSrcMemory(user_src_md, to_void_cast(x_data));
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 5e4d79f1c35af42f662711ae9d8bfc650bab2b4f..faf518005c8cb0958dd5b0bbfc5c6fc4b3c2b582 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -144,7 +144,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const std::string key = platform::ConvMKLDNNHandler::GetHash(
         src_tz, weights_tz, strides, paddings, dilations, groups,
         ctx.op().Input("Input") + ctx.op().Input("Filter"));
-    const std::string key_conv_pd = key + "@conv_pd";
 
     std::vector<primitive> pipeline;
 
@@ -183,6 +182,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto dst_md = platform::MKLDNNMemDesc(
         dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
 
+    platform::ConvMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
+
     // create a conv primitive descriptor and save it for usage in backward
     std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd;
     auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference
@@ -191,18 +192,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       bias_tz = paddle::framework::vectorize2int(bias->dims());
       auto bias_md = platform::MKLDNNMemDesc(
           bias_tz, platform::MKLDNNGetDataType<T>(), memory::format::x);
-      conv_pd = ConvFwdPrimitiveDesc(
+      conv_pd = handler.AcquireConvolutionPrimitiveDescriptor(
           src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
           fuse_relu, fuse_residual_conn, fwd_prop_kind);
     } else {
-      conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides,
-                                     paddings, mkldnn_engine, fuse_relu,
-                                     fuse_residual_conn, fwd_prop_kind);
+      conv_pd = handler.AcquireConvolutionPrimitiveDescriptor(
+          src_md, weights_md, boost::none, dst_md, strides, paddings,
+          mkldnn_engine, fuse_relu, fuse_residual_conn, fwd_prop_kind);
     }
-    // Save conv_pd/src_memory/weights_memory for backward pass
-    if (!is_test) dev_ctx.SetBlob(key_conv_pd, conv_pd);
-
-    platform::ConvMKLDNNHandler handler(conv_pd, dev_ctx, mkldnn_engine, key);
 
     // create mkldnn memory from input tensors (data/weights)
     auto user_src_memory_p =
@@ -633,31 +630,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
   }
 
  private:
-  mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
-                                       bool fuse_residual_conn) const {
-    mkldnn::primitive_attr conv_attr;
-    mkldnn::post_ops post_operations;
-    // Fusion with Elementwise layer relies on adding a sum post-operation with
-    // the scale parameter. It is assumed that when fuse_residual_connection is
-    // true, the output tensor contains the data coming from residual
-    // connection. The result of this post_op is:
-    // Output = scale * Output + Conv_Out.
-    if (fuse_residual_conn) {
-      post_operations.append_sum(1.0f);
-    }
-    // Fusion with ReLU layer is executed through the PostOps feature. Create a
-    // PostOps object and configure it to execute an eltwise relu operation.
-    if (fuse_relu) {
-      constexpr float scale = 1.0f;
-      constexpr float negative_slope = 0.0f;
-      constexpr float placeholder = 0.0f;
-      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
-                                     negative_slope, placeholder);
-    }
-    conv_attr.set_post_ops(post_operations);
-    return conv_attr;
-  }
-
   mkldnn::primitive_attr CreatePostOps(
       bool fuse_relu, bool fuse_residual_conn,
       const std::vector<float> output_shift_scale, float sum_scale) const {
@@ -679,30 +651,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     return conv_attr;
   }
 
-  std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
-  ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
-                       const memory::desc& dst, const std::vector<int>& strides,
-                       const std::vector<int>& paddings,
-                       const mkldnn::engine& engine, const bool fuse_relu,
-                       const bool fuse_residual_conn,
-                       mkldnn::prop_kind fwd_prop_kind) const {
-    memory::dims stride_dims = strides;
-    memory::dims padding_dims = paddings;
-
-    auto conv_desc = mkldnn::convolution_forward::desc(
-        fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst,
-        stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
-
-    mkldnn::primitive_attr conv_attr =
-        CreatePostOps(fuse_relu, fuse_residual_conn);
-
-    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
-        conv_desc, conv_attr, engine);
-
-    return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
-        p_conv_pd);
-  }
-
   std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
   ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                        const memory::desc& dst, const std::vector<int>& strides,
@@ -731,31 +679,6 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
         p_conv_pd);
   }
 
-  std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
-  ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
-                       const memory::desc& bias, const memory::desc& dst,
-                       const std::vector<int>& strides,
-                       const std::vector<int>& paddings,
-                       const mkldnn::engine& engine, const bool fuse_relu,
-                       const bool fuse_residual_conn,
-                       mkldnn::prop_kind fwd_prop_kind) const {
-    memory::dims stride_dims = strides;
-    memory::dims padding_dims = paddings;
-
-    auto conv_desc = mkldnn::convolution_forward::desc(
-        fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst,
-        stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
-
-    mkldnn::primitive_attr conv_attr =
-        CreatePostOps(fuse_relu, fuse_residual_conn);
-
-    auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
-        conv_desc, conv_attr, engine);
-
-    return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
-        p_conv_pd);
-  }
-
   std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
   ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
                        const memory::desc& bias, const memory::desc& dst,
diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
index 317d4cebe26b81ff03c212e6328233d5152ed1b4..30d2469eeaf6938f1f93730b8b645ca2cfe97364 100644
--- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc
@@ -12,6 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
+#include "boost/optional.hpp"
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
@@ -124,7 +125,6 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     const std::string key = platform::ConvTransposeMKLDNNHandler::GetHash(
         src_tz, weights_tz, strides, paddings, dilations, groups,
         ctx.op().Output("Output"));
-    const std::string key_conv_transpose_pd = key + "@conv_transpose_pd";
 
     std::vector<mkldnn::primitive> pipeline;
 
@@ -153,6 +153,7 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     auto dst_md = platform::MKLDNNMemDesc(
         dst_tz, platform::MKLDNNGetDataType<T>(), chosen_memory_format);
 
+    platform::ConvTransposeMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
     // create a deconv(conv transpose) primitive descriptor and save it for
     // usage in backward
     std::shared_ptr<mkldnn::deconvolution_forward::primitive_desc>
@@ -163,19 +164,14 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
       bias_tz = paddle::framework::vectorize2int(bias->dims());
       auto bias_md = platform::MKLDNNMemDesc(
           bias_tz, platform::MKLDNNGetDataType<T>(), mkldnn::memory::format::x);
-      conv_transpose_pd = ConvTransposeFwdPrimitiveDesc(
+      conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor(
           src_md, weights_md, bias_md, dst_md, strides, paddings, mkldnn_engine,
-          fuse_relu, fwd_prop_kind);
+          fuse_relu, false, fwd_prop_kind);
     } else {
-      conv_transpose_pd = ConvTransposeFwdPrimitiveDesc(
-          src_md, weights_md, dst_md, strides, paddings, mkldnn_engine,
-          fuse_relu, fwd_prop_kind);
+      conv_transpose_pd = handler.AcquireConvolutionPrimitiveDescriptor(
+          src_md, weights_md, boost::none, dst_md, strides, paddings,
+          mkldnn_engine, fuse_relu, false, fwd_prop_kind);
     }
-    // Save conv_pd/src_memory/weights_memory for backward pass
-    if (!is_test) dev_ctx.SetBlob(key_conv_transpose_pd, conv_transpose_pd);
-
-    platform::ConvTransposeMKLDNNHandler handler(conv_transpose_pd, dev_ctx,
-                                                 mkldnn_engine, key);
 
     // create mkldnn memory from input tensors (data/weights)
     auto user_src_memory_p = handler.AcquireSrcMemory(
@@ -224,70 +220,6 @@ class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     output->set_layout(DataLayout::kMKLDNN);
     output->set_format(platform::GetMKLDNNFormat(*dst_memory_p));
   }
-
- private:
-  mkldnn::primitive_attr CreatePostOps(bool fuse_relu) const {
-    mkldnn::primitive_attr conv_attr;
-    mkldnn::post_ops post_operations;
-    // Fusion with ReLU layer is executed through the PostOps feature. Create a
-    // PostOps object and configure it to execute an eltwise relu operation.
-    if (fuse_relu) {
-      constexpr float scale = 1.0f;
-      constexpr float negative_slope = 0.0f;
-      constexpr float placeholder = 0.0f;
-      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
-                                     negative_slope, placeholder);
-    }
-    conv_attr.set_post_ops(post_operations);
-    return conv_attr;
-  }
-
-  std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>
-  ConvTransposeFwdPrimitiveDesc(
-      const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
-      const mkldnn::memory::desc& dst, const std::vector<int>& strides,
-      const std::vector<int>& paddings, const mkldnn::engine& engine,
-      const bool fuse_relu, mkldnn::prop_kind fwd_prop_kind) const {
-    mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
-    mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
-
-    auto deconv_desc = mkldnn::deconvolution_forward::desc(
-        fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, dst,
-        stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
-
-    mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu);
-
-    auto p_conv_transpose_pd =
-        new mkldnn::deconvolution_forward::primitive_desc(deconv_desc,
-                                                          deconv_attr, engine);
-
-    return std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>(
-        p_conv_transpose_pd);
-  }
-
-  std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>
-  ConvTransposeFwdPrimitiveDesc(
-      const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
-      const mkldnn::memory::desc& bias, const mkldnn::memory::desc& dst,
-      const std::vector<int>& strides, const std::vector<int>& paddings,
-      const mkldnn::engine& engine, const bool fuse_relu,
-      mkldnn::prop_kind fwd_prop_kind) const {
-    mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
-    mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
-
-    auto deconv_desc = mkldnn::deconvolution_forward::desc(
-        fwd_prop_kind, mkldnn::deconvolution_direct, src, weights, bias, dst,
-        stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
-
-    mkldnn::primitive_attr deconv_attr = CreatePostOps(fuse_relu);
-
-    auto p_conv_transpose_pd =
-        new mkldnn::deconvolution_forward::primitive_desc(deconv_desc,
-                                                          deconv_attr, engine);
-
-    return std::unique_ptr<mkldnn::deconvolution_forward::primitive_desc>(
-        p_conv_transpose_pd);
-  }
 };
 
 }  // namespace operators
diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
index dc1176f0848b93dd6872f676c3a71dab4f3455fd..1b3f33d345f4e0fafd7ad5da41eec052ac2dc504 100644
--- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc
@@ -34,12 +34,9 @@ using platform::to_void_cast;
 
 class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
  public:
-  SoftmaxMKLDNNHandler(
-      std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd,
-      const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
-      const std::string& base_key)
-      : platform::MKLDNNHandler(dev_ctx, engine, base_key),
-        softmax_pd_(softmax_pd) {}
+  SoftmaxMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx,
+                       mkldnn::engine engine, const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {}
 
   SoftmaxMKLDNNHandler(
       std::shared_ptr<mkldnn::softmax_forward::primitive_desc> softmax_pd,
@@ -54,6 +51,26 @@ class SoftmaxMKLDNNHandler : public platform::MKLDNNHandler {
     key_ += "-BWD";
   }
 
+  std::shared_ptr<softmax_forward::primitive_desc>
+  AcquireSoftmaxPrimitiveDescriptor(const softmax_forward::desc& softmax_desc,
+                                    const mkldnn::engine& engine) {
+    const std::string key_softmax_pd = key_ + "@softmax_pd";
+
+    auto softmax_pd = std::static_pointer_cast<softmax_forward::primitive_desc>(
+        dev_ctx_.GetBlob(key_softmax_pd));
+
+    if (softmax_pd == nullptr) {
+      softmax_pd_.reset(
+          new softmax_forward::primitive_desc(softmax_desc, engine));
+      dev_ctx_.SetBlob(key_softmax_pd, softmax_pd_);
+    } else {
+      softmax_pd_ = softmax_pd;
+      is_reusing_ = true;
+    }
+
+    return softmax_pd_;
+  }
+
   std::shared_ptr<mkldnn::softmax_forward> AcquireSoftmax(
       std::shared_ptr<mkldnn::memory> dst_memory_p,
       std::shared_ptr<mkldnn::memory> src_memory_p) {
@@ -138,19 +155,18 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel<T> {
     // Generate keys for storing/retriving primitives for this operator
     const std::string key =
         platform::MKLDNNHandler::GetHash(softmax_tz, ctx.op().Output("Out"));
-    const std::string key_softmax_pd = key + "@softmax_pd";
 
+    SoftmaxMKLDNNHandler handler(dev_ctx, mkldnn_engine, key);
     // Currently only NC data format is supported
     auto softmax_md = MKLDNNMemDesc(
         {softmax_tz}, platform::MKLDNNGetDataType<T>(), memory::format::nc);
     // Normalization is made after innermost dimension eg. C out of NC
     auto softmax_desc = softmax_forward::desc(prop_kind::forward_scoring,
                                               softmax_md, 1 /*dim: C*/);
-    auto softmax_pd = std::make_shared<mkldnn::softmax_forward::primitive_desc>(
-        softmax_desc, mkldnn_engine);
-    dev_ctx.SetBlob(key_softmax_pd, softmax_pd);
 
-    SoftmaxMKLDNNHandler handler(softmax_pd, dev_ctx, mkldnn_engine, key);
+    auto softmax_pd =
+        handler.AcquireSoftmaxPrimitiveDescriptor(softmax_desc, mkldnn_engine);
+
     auto softmax_src_memory_p =
         handler.AcquireSrcMemory(softmax_md, to_void_cast<T>(input_data));
     auto softmax_dst_memory_p =
diff --git a/paddle/fluid/operators/modified_huber_loss_op.cc b/paddle/fluid/operators/modified_huber_loss_op.cc
index 9954e51083b2c4dbc043fe82ee75be91c6d60128..14d75aee754bc3d5b951a4f53a34ea8661c08cca 100644
--- a/paddle/fluid/operators/modified_huber_loss_op.cc
+++ b/paddle/fluid/operators/modified_huber_loss_op.cc
@@ -28,9 +28,16 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
 
-    PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same.");
     PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The tensor rank of X must be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[1], 1, "The 2nd dimension of X must be 1.");
+    if (ctx->IsRuntime() ||
+        (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) {
+      PADDLE_ENFORCE_EQ(x_dims, y_dims,
+                        "The shape of X and Y must be the same.");
+    }
+
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(x_dims[1], 1, "The 2nd dimension of X must be 1.");
+    }
 
     ctx->SetOutputDim("IntermediateVal", x_dims);
     ctx->SetOutputDim("Out", {x_dims[0], 1});
@@ -90,11 +97,13 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
     auto intermediate_dims = ctx->GetInputDim("IntermediateVal");
     auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
-    PADDLE_ENFORCE_EQ(
-        intermediate_dims, x_dims,
-        "The shape of X and intermediate value must be the same.");
-    PADDLE_ENFORCE_EQ(out_grad_dims, x_dims,
-                      "The shape of Input(Out@Grad) and X must be the same.");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(
+          intermediate_dims, x_dims,
+          "The shape of X and intermediate value must be the same.");
+      PADDLE_ENFORCE_EQ(out_grad_dims, x_dims,
+                        "The shape of Input(Out@Grad) and X must be the same.");
+    }
 
     if (ctx->HasOutput(framework::GradVarName("X"))) {
       ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc
index 358e4f37b5b45c53b88f5477452ebf6448dcc461..0ccc5d30b3141b029b157fd8a046c4dbeab22c23 100644
--- a/paddle/fluid/operators/nce_op.cc
+++ b/paddle/fluid/operators/nce_op.cc
@@ -36,7 +36,9 @@ class NCEOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputDim("Input");
     auto label_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);
+    if (ctx->IsRuntime() || (x_dims[0] > 0 && label_dims[0] > 0)) {
+      PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);
+    }
     int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1;
     if (ctx->HasInput("Bias")) {
       PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0],
@@ -60,7 +62,8 @@ class NCEOp : public framework::OperatorWithKernel {
     // set dims of output(SampleOut)
     std::vector<int64_t> sample_out_dims;
     sample_out_dims.push_back(x_dims[0]);
-    sample_out_dims.push_back(num_neg_samples + num_true_classes);
+    sample_out_dims.push_back(
+        (num_true_classes == -1) ? -1 : (num_neg_samples + num_true_classes));
     ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims));
     ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims));
   }
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.cc b/paddle/fluid/operators/ngraph/ngraph_engine.cc
index 9f73bbc1fdc72766a0b57bc72c62d208277c2f20..5ef385d2fcbaf01dce5c9b85321b41c103e5655a 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine.cc
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.cc
@@ -75,6 +75,7 @@ std::vector<std::string> NgraphEngine::feed_vars = {};
 std::vector<std::string> NgraphEngine::fetch_vars = {};
 framework::Variable* NgraphEngine::pre_var_ptr = nullptr;
 const framework::BlockDesc* NgraphEngine::p_bdesc = nullptr;
+bool NgraphEngine::is_training = false;
 
 std::unordered_map<std::string, EngineCache> NgraphEngine::engine_cache = {};
 std::unordered_map<std::string,
@@ -93,11 +94,13 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
   int size = ops->size();
   int left = 0;
   while (left < size && ops->at(left)->Type() != framework::kFeedOpType &&
+         ops->at(left)->Type() != "read" &&
          ops->at(left)->Type() != framework::kFetchOpType) {
     ++left;
   }
 
-  while (left < size && ops->at(left)->Type() == framework::kFeedOpType) {
+  while (left < size && (ops->at(left)->Type() == framework::kFeedOpType ||
+                         ops->at(left)->Type() == "read")) {
     for (auto& var_name_item : ops->at(left)->Outputs()) {
       for (auto& var_name : var_name_item.second) {
         NgraphEngine::feed_vars.emplace_back(var_name);
@@ -270,6 +273,7 @@ void NgraphEngine::Prepare(const std::vector<int>& interval) {
 
   for (auto op_desc : ops_desc) {
     if (op_desc->Type().find("_grad") != std::string::npos) {
+      is_training = true;
       this->is_test_ = false;
       break;
     }
@@ -590,7 +594,7 @@ void NgraphEngine::Run(const framework::Scope& scope,
       }
       bool is_persistable =
           (p_persistables->find(vi) != p_persistables->end()) ? true : false;
-      if (is_test && is_persistable) {
+      if (!is_training && is_test && is_persistable) {
         ti->set_stale(false);
       }
       (*p_t_in).emplace_back(ti);
diff --git a/paddle/fluid/operators/ngraph/ngraph_engine.h b/paddle/fluid/operators/ngraph/ngraph_engine.h
index b6532519e947bc59f0605c4f2008270f5e51b0e0..19400ac5b0ecd9d3254583b8db9889fc6cf8bc0f 100644
--- a/paddle/fluid/operators/ngraph/ngraph_engine.h
+++ b/paddle/fluid/operators/ngraph/ngraph_engine.h
@@ -57,6 +57,7 @@ class NgraphEngine {
 
   void Run(const framework::Scope& scope, const platform::Place& place) const;
 
+  static bool is_training;
   static const framework::BlockDesc* p_bdesc;
   static std::vector<std::string> feed_vars, fetch_vars;
 
diff --git a/paddle/fluid/operators/one_hot_op.cc b/paddle/fluid/operators/one_hot_op.cc
index 4fcb1d69935175c3f643db7a4da04db34492f8fb..626895f49d8d4347f1e9a40526943cf00c73d034 100644
--- a/paddle/fluid/operators/one_hot_op.cc
+++ b/paddle/fluid/operators/one_hot_op.cc
@@ -30,9 +30,10 @@ class OneHotOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     PADDLE_ENFORCE_GE(x_dims.size(), 2,
                       "Rank of Input(X) should be at least 2.");
-    PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], 1U,
-                      "Last dimension of Input(X) should be 1.");
-
+    if (ctx->IsRuntime() || x_dims[x_dims.size() - 1] > 0) {
+      PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], 1U,
+                        "Last dimension of Input(X) should be 1.");
+    }
     int depth = ctx->Attrs().Get<int>("depth");
 
     PADDLE_ENFORCE_GT(depth, 0, "Should provide a positive depth (%d).", depth);
diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc
index 9731aefa95c5243e29ace87ad8c35d5b01904e60..1e8ba5922aa96ac40798d103868c839242ac1e55 100644
--- a/paddle/fluid/operators/pad2d_op.cc
+++ b/paddle/fluid/operators/pad2d_op.cc
@@ -483,8 +483,10 @@ class Pad2dOp : public framework::OperatorWithKernel {
       PADDLE_ENFORCE_EQ(
           paddings_dim.size(), 1,
           "Size of Input(Paddings)'s dimension should be equal to 1.");
-      PADDLE_ENFORCE_EQ(paddings_dim[0], 4,
-                        "Shape of Input(Paddings) should be equal to [4].");
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(paddings_dim[0], 4,
+                          "Shape of Input(Paddings) should be equal to [4].");
+      }
       out_dims[1] = x_dim[1];
       out_dims[2] = x_dim[2];
       out_dims[3] = x_dim[3];
@@ -504,11 +506,7 @@ class Pad2dOp : public framework::OperatorWithKernel {
     }
 
     ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
-    if (out_dims[0] == x_dim[0]) {
-      // Only pass LoD when the first dimension is equal between
-      // output and input.
-      ctx->ShareLoD("X", /*->*/ "Out");
-    }
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 
  protected:
diff --git a/paddle/fluid/operators/pad_constant_like_op.cc b/paddle/fluid/operators/pad_constant_like_op.cc
index 3f827c26fd415c8a3c2295129f413850ea59bef3..31ed0a686f712bd286b4accda68716b156037dbc 100644
--- a/paddle/fluid/operators/pad_constant_like_op.cc
+++ b/paddle/fluid/operators/pad_constant_like_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/pad_constant_like_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -38,8 +39,16 @@ class PadConstantLikeOp : public framework::OperatorWithKernel {
                       "The dimention of X and Y should be the same.");
 
     for (int i = 0; i < x_dim.size(); ++i) {
-      PADDLE_ENFORCE_GE(x_dim[i], y_dim[i]);
+      if ((!ctx->IsRuntime()) && ((x_dim[i] == -1) || (y_dim[i] == -1))) {
+        continue;
+      } else {
+        PADDLE_ENFORCE_GE(
+            x_dim[i], y_dim[i],
+            "expected X_dim[i] >= Y_dim[i], but received %d < %d for dim %d",
+            x_dim[i], y_dim[i], i);
+      }
     }
+
     ctx->SetOutputDim("Out", x_dim);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
@@ -162,7 +171,14 @@ class PadConstantLikeOpGrad : public framework::OperatorWithKernel {
       ctx->ShareLoD("Y", /*->*/ y_grad_name);
 
       for (int i = 0; i < y_dim.size(); ++i) {
-        PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i]);
+        if ((!ctx->IsRuntime()) && ((dout_dim[i] == -1) || (y_dim[i] == -1))) {
+          continue;
+        } else {
+          PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i],
+                            "expected Out_dim[i] >= Y_dim[i], but received %d "
+                            "< %d for dim %d",
+                            dout_dim[i], y_dim[i], i);
+        }
       }
     }
   }
diff --git a/paddle/fluid/operators/pad_op.cc b/paddle/fluid/operators/pad_op.cc
index c28106d31273cb54e3974d186296644272d2014c..36dc8b0dbb3d3b6537b6395f4c831ac25b03a4c6 100644
--- a/paddle/fluid/operators/pad_op.cc
+++ b/paddle/fluid/operators/pad_op.cc
@@ -34,9 +34,16 @@ class PadOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()),
                       "Size of paddings should be equal to 2 * dimension size "
                       "of input tensor.");
+    for (size_t i = 0; i < paddings.size(); ++i) {
+      PADDLE_ENFORCE_GE(paddings[i], 0, "paddings should >= 0.");
+    }
     std::vector<int64_t> out_dims(x_dim.size());
     for (int i = 0; i < x_dim.size(); ++i) {
-      out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
+      if ((!ctx->IsRuntime()) && (x_dim[i] == -1)) {
+        out_dims[i] = -1;
+      } else {
+        out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
+      }
     }
     ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
     if (out_dims[0] == x_dim[0]) {
@@ -100,18 +107,14 @@ class PadOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    for (int i = 0; i < dout_dims.size(); ++i) {
-      dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]);
-    }
-
     auto x_grad_name = framework::GradVarName("X");
     if (ctx->HasOutput(x_grad_name)) {
       auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
       auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
       for (int i = 0; i < dout_dims.size(); ++i) {
-        dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]);
+        if (ctx->IsRuntime() || (dout_dims[i] != -1)) {
+          dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]);
+        }
       }
       ctx->SetOutputDim(x_grad_name, dout_dims);
     }
diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc
index 99256e408d44802418728c0970cc2efeaa682587..e917e778e41ff8994f248e905635da702b428fc2 100644
--- a/paddle/fluid/operators/positive_negative_pair_op.cc
+++ b/paddle/fluid/operators/positive_negative_pair_op.cc
@@ -61,23 +61,31 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
     auto query_dim = ctx->GetInputDim("QueryID");
     PADDLE_ENFORCE_EQ(score_dim.size(), 2, "Score should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(label_dim.size(), 2, "Label should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(
-        label_dim[0], score_dim[0],
-        "Tensor Score and Label should have the same height (batch size).");
-    PADDLE_ENFORCE_EQ(label_dim[1], 1,
-                      "The width of Label should be 1, i.e. each item should "
-                      "have a scalar label.");
-    PADDLE_ENFORCE(query_dim == label_dim,
-                   "QueryID should have the same shape as Label.");
-    if (ctx->HasInput("Weight")) {
-      PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim,
-                     "Weight should have the same shape as Label.");
+
+    if (ctx->IsRuntime() ||
+        (score_dim[0] > 0 && label_dim[0] > 0 && query_dim[0] > 0)) {
+      PADDLE_ENFORCE_EQ(
+          label_dim[0], score_dim[0],
+          "Tensor Score and Label should have the same height (batch size).");
+
+      PADDLE_ENFORCE_EQ(label_dim[1], 1,
+                        "The width of Label should be 1, i.e. each item should "
+                        "have a scalar label.");
+
+      PADDLE_ENFORCE(query_dim == label_dim,
+                     "QueryID should have the same shape as Label.");
+
+      if (ctx->HasInput("Weight")) {
+        PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim,
+                       "Weight should have the same shape as Label.");
+      }
+
+      int column = ctx->Attrs().Get<int>("column");
+      auto depth = score_dim[1];
+      PADDLE_ENFORCE(column < depth && column >= -depth,
+                     "Attribute column should be in the range of [-%l, %l)",
+                     depth, depth);
     }
-    int column = ctx->Attrs().Get<int>("column");
-    auto depth = score_dim[1];
-    PADDLE_ENFORCE(column < depth && column >= -depth,
-                   "Attribute column should be in the range of [-%l, %l)",
-                   depth, depth);
 
     ctx->SetOutputDim("PositivePair", scalar_dim);
     ctx->SetOutputDim("NegativePair", scalar_dim);
diff --git a/paddle/fluid/operators/random_crop_op.cc b/paddle/fluid/operators/random_crop_op.cc
index cd3bd32adb4df0f8d8ab15de6a52ec2f1fbbddf2..dad46ec6683349b9d383368a85411a39750e3e2f 100644
--- a/paddle/fluid/operators/random_crop_op.cc
+++ b/paddle/fluid/operators/random_crop_op.cc
@@ -60,7 +60,9 @@ class RandomCropOpInferShape : public framework::InferShapeBase {
     for (size_t i = 1; i <= shape.size(); ++i) {
       size_t x_i = x_dim.size() - i;
       size_t shape_i = shape.size() - i;
-      PADDLE_ENFORCE_GE(x_dim[x_i], shape[shape_i]);
+      if (ctx->IsRuntime() || (x_dim[x_i] > 0 && shape[shape_i] > 0)) {
+        PADDLE_ENFORCE_GE(x_dim[x_i], shape[shape_i]);
+      }
       out_dim[x_i] = shape[shape_i];
     }
     ctx->SetOutputDim("Out", framework::make_ddim(out_dim));
diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc
index 2898a62ddbac524ceb212cac5f34aeda3b1e01cb..1a2feee11c951cd4a55958df58f3756472f64769 100644
--- a/paddle/fluid/operators/recurrent_op.cc
+++ b/paddle/fluid/operators/recurrent_op.cc
@@ -23,6 +23,7 @@ constexpr char kInitialStates[] = "initial_states";
 constexpr char kParameters[] = "parameters";
 constexpr char kOutputs[] = "outputs";
 constexpr char kStepScopes[] = "step_scopes";
+constexpr char kHasStates[] = "has_states";
 constexpr char kExStates[] = "ex_states";
 constexpr char kStates[] = "states";
 constexpr char kStepBlock[] = "sub_block";
@@ -241,11 +242,16 @@ class RecurrentOp : public RecurrentBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
+    bool has_state = Attr<bool>(kHasStates);
     auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
     VLOG(3) << "Static RNN input sequence length = " << seq_len;
     StepScopes scopes = CreateStepScopes(scope, seq_len);
     auto reverse = Attr<bool>(kReverse);
 
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
     framework::Executor executor(place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
 
@@ -269,15 +275,17 @@ class RecurrentOp : public RecurrentBase {
             inside->Resize(framework::make_ddim(dims));
           });
 
-      if (i == 0) {
-        // Link initial states  --> ex_states
-        LinkTensor(scope, Inputs(kInitialStates), &cur_scope,
-                   Attr<std::vector<std::string>>(kExStates));
-      } else {
-        auto &ex_scope = scopes.ExScope();
-        // Link ex_scope::state --> cur_scope::ex_state
-        LinkTensor(ex_scope, Attr<std::vector<std::string>>(kStates),
-                   &cur_scope, Attr<std::vector<std::string>>(kExStates));
+      if (has_state) {
+        if (i == 0) {
+          // Link initial states  --> ex_states
+          LinkTensor(scope, Inputs(kInitialStates), &cur_scope,
+                     Attr<std::vector<std::string>>(kExStates));
+        } else {
+          auto &ex_scope = scopes.ExScope();
+          // Link ex_scope::state --> cur_scope::ex_state
+          LinkTensor(ex_scope, Attr<std::vector<std::string>>(kStates),
+                     &cur_scope, Attr<std::vector<std::string>>(kExStates));
+        }
       }
 
       // Every inputs are linked now, execute!
@@ -286,11 +294,6 @@ class RecurrentOp : public RecurrentBase {
                    std::vector<std::string>() /*skip_ref_cnt_vars*/,
                    true /*force_disable_gc*/);
 
-      // get device context from pool
-      platform::DeviceContextPool &pool =
-          platform::DeviceContextPool::Instance();
-      auto &dev_ctx = *pool.Get(place);
-
       // Copy inside::output -> outside::output
       //    outside::output[seq_offset: seq_offset + 1] = inside::output
       this->LinkTensorWithCallback(
@@ -333,13 +336,13 @@ class RecurrentGradOp : public RecurrentBase {
  private:
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
-    auto seq_len = static_cast<size_t>(GetSequenceLength(scope));
+    bool has_state = Attr<bool>(kHasStates);
+    const size_t seq_len = static_cast<size_t>(GetSequenceLength(scope));
     StepScopes scopes = CreateStepScopes(scope, seq_len);
     auto reverse = Attr<bool>(kReverse);
 
     framework::Executor executor(place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
-
     auto *program = block->Program();
 
     // get device context from pool
@@ -350,6 +353,7 @@ class RecurrentGradOp : public RecurrentBase {
       size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
       VLOG(3) << "Recurrent backward operate at the time step " << seq_offset;
       auto &cur_scope = scopes.CurScope();
+
       // Link outside::output_grads --> inside::output_grads
       //   inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
       LinkTensorWithCallback(
@@ -370,30 +374,32 @@ class RecurrentGradOp : public RecurrentBase {
         VLOG(10) << " RNN output gradients = [" << sout.str() << "]";
       }
 
-      // Link states
-      //   if cur_scope::cur_state_grad in out_grads:
-      //     cur_scope::cur_state_grad += ex_scope::ex_state_grad
-      //   else:
-      //     ex_scope::ex_state_grad --> cur_scope::cur_state_grad
-      if (step_id != 0) {  // not at beginning
-        auto &ex_scope = scopes.ExScope();
-        auto ex_state_grads =
-            GradVarLists(Attr<std::vector<std::string>>(kExStates));
-        auto cur_state_grads =
-            GradVarLists(Attr<std::vector<std::string>>(kStates));
-
-        PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size());
-        for (size_t i = 0; i < ex_state_grads.size(); ++i) {
-          auto &cur_grad = cur_state_grads[i];
-          auto &ex_grad = ex_state_grads[i];
-          auto &ex_tensor =
-              ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
-
-          VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
-          auto *cur_grad_var = cur_scope.Var(cur_grad);
-          auto cur_grad_tensor =
-              cur_grad_var->GetMutable<framework::LoDTensor>();
-          framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor);
+      if (has_state) {
+        // Link states
+        //   if cur_scope::cur_state_grad in out_grads:
+        //     cur_scope::cur_state_grad += ex_scope::ex_state_grad
+        //   else:
+        //     ex_scope::ex_state_grad --> cur_scope::cur_state_grad
+        if (step_id != 0) {  // not at beginning
+          auto &ex_scope = scopes.ExScope();
+          auto ex_state_grads =
+              GradVarLists(Attr<std::vector<std::string>>(kExStates));
+          auto cur_state_grads =
+              GradVarLists(Attr<std::vector<std::string>>(kStates));
+
+          PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size());
+          for (size_t i = 0; i < ex_state_grads.size(); ++i) {
+            auto &cur_grad = cur_state_grads[i];
+            auto &ex_grad = ex_state_grads[i];
+            auto &ex_tensor =
+                ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
+
+            VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
+            auto *cur_grad_var = cur_scope.Var(cur_grad);
+            auto cur_grad_tensor =
+                cur_grad_var->GetMutable<framework::LoDTensor>();
+            framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor);
+          }
         }
       }
 
@@ -442,8 +448,8 @@ class RecurrentGradOp : public RecurrentBase {
           }
 
           auto new_inside_name = cur_scope.Rename(inside_grad_name);
-          // sum gradient
 
+          // sum gradient
           auto sum_op = framework::OpRegistry::CreateOp(
               "sum", {{"X", {pg_names[param_id], new_inside_name}}},
               {{"Out", {pg_names[param_id]}}},
@@ -475,22 +481,33 @@ class RecurrentGradOp : public RecurrentBase {
           true /*is_backward*/);
       VLOG(5) << "Link outside gradient finished ";
 
-      if (step_id + 1 == seq_len) {  // at_end
-        // copy initialize states gradient from inside to outside
-        LinkTensorWithCallback(
-            cur_scope, GradVarLists(Attr<std::vector<std::string>>(kExStates)),
-            scope, Outputs(kInitStateGrads),
-            [&](const framework::LoDTensor &inside,
-                framework::LoDTensor *outside) {
-              outside->Resize(inside.dims());
-              outside->mutable_data(place, inside.type());
-              framework::TensorCopy(inside, place, dev_ctx, outside);
-            },
-            true /*is_backward*/);
-        VLOG(5) << "Link initialize state gradient finished ";
+      if (has_state) {
+        if (step_id + 1 == seq_len) {  // at_end
+          // copy initialize states gradient from inside to outside
+          LinkTensorWithCallback(
+              cur_scope,
+              GradVarLists(Attr<std::vector<std::string>>(kExStates)), scope,
+              Outputs(kInitStateGrads),
+              [&](const framework::LoDTensor &inside,
+                  framework::LoDTensor *outside) {
+                outside->Resize(inside.dims());
+                outside->mutable_data(place, inside.type());
+                framework::TensorCopy(inside, place, dev_ctx, outside);
+              },
+              true /*is_backward*/);
+          VLOG(5) << "Link initialize state gradient finished ";
+        }
       }
       scopes.Next();
     }
+    // Delete the scope of StepScopes
+    dev_ctx.Wait();
+    auto *var = scope.FindVar(Input(kStepScopes));
+    PADDLE_ENFORCE(var != nullptr);
+    auto step_scopes = var->GetMutable<StepScopeVar>();
+    for (auto *sub_scope : *step_scopes) {
+      const_cast<framework::Scope &>(scope).DeleteScope(sub_scope);
+    }
   }
 
  private:
@@ -541,6 +558,7 @@ class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
         .AsDuplicable();
     AddOutput(kStepScopes,
               "StepScopes contain all local variables in each time step.");
+    AddAttr<bool>(kHasStates, "Whether has states.").SetDefault(false);
     AddAttr<std::vector<std::string>>(kExStates,
                                       string::Sprintf(
                                           R"DOC(The ex-state variable names.
@@ -624,20 +642,44 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
 class RecurrentGradOpShapeInference : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *ctx) const override {
-    std::vector<std::string> input{kInputs, kInitialStates};
     std::vector<std::string> output{kOutputs};
-    for (auto &s : input) {
-      // NOTE(zcd): In some case, some of kInputs doesn't have gradient.
-      PADDLE_ENFORCE(ctx->HasInputs(s));
-    }
-    for (auto &s : output) {
-      PADDLE_ENFORCE(ctx->HasInputs(s));
+
+    // In some case the kInitialStates is empty.
+    // If the kInitialStates is empty, all the states should be empty.
+    if (!ctx->HasInputs(kInitialStates)) {
+      PADDLE_ENFORCE_EQ(
+          ctx->Attrs().Get<std::vector<std::string>>(kExStates).size(), 0,
+          "The Attr(%s) should be empty.", kExStates);
+      PADDLE_ENFORCE_EQ(
+          ctx->Attrs().Get<std::vector<std::string>>(kStates).size(), 0,
+          "The Attr(%s) should be empty.", kStates);
     }
-    for (auto &s : input) {
-      ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s));
+
+    PADDLE_ENFORCE(ctx->HasInputs(kInputs),
+                   "The input(%s) should not be empty.", kInputs);
+    PADDLE_ENFORCE(ctx->HasInputs(kOutputs),
+                   "The input(%s) should not be empty.", kOutputs);
+
+    // In some case the kInitialStates is empty.
+    if (ctx->HasInputs(kInitialStates)) {
+      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInitialStates)),
+                     "The output of(%s) should not be empty.",
+                     framework::GradVarName(kInitialStates));
+      ctx->SetOutputsDim(framework::GradVarName(kInitialStates),
+                         ctx->GetInputsDim(kInitialStates));
     }
+
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInputs)),
+                   "The output of(%s) should not be empty.",
+                   framework::GradVarName(kInputs));
+    ctx->SetOutputsDim(framework::GradVarName(kInputs),
+                       ctx->GetInputsDim(kInputs));
+
+    // In some case the kParameters is empty.
     if (ctx->HasInputs(kParameters)) {
-      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
+      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)),
+                     "The output of(%s) should not be empty.",
+                     framework::GradVarName(kParameters));
       ctx->SetOutputsDim(framework::GradVarName(kParameters),
                          ctx->GetInputsDim(kParameters));
     }
diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a3ca9ae0675472cb4f0bcd6f404f39004e7cc62f
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
@@ -0,0 +1,20 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
+
+REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_all);
+REGISTER_OP_CPU_KERNEL(reduce_all,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         bool, ops::AllFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..bd94ba263d957d0d65506ecd802bf43add6e2fb4
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
@@ -0,0 +1,19 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_all,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          bool, ops::AllFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.h b/paddle/fluid/operators/reduce_ops/reduce_all_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ba159dd703c8904784546eda262bf7be77967d48
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct AllFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->all(dim);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..34f0fffc9adef240c6fa222540710537587010c5
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
@@ -0,0 +1,20 @@
+// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
+
+REGISTER_REDUCE_OP_WITHOUT_GRAD(reduce_any);
+REGISTER_OP_CPU_KERNEL(reduce_any,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         bool, ops::AnyFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..66f0c9997ea1e27cf172a6839a68d2eb23395c4d
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
@@ -0,0 +1,19 @@
+// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_any,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          bool, ops::AnyFunctor>);
diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.h b/paddle/fluid/operators/reduce_ops/reduce_any_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b36bad9cada259932d2bd77c2426fbb46790de76
--- /dev/null
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct AnyFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->any(dim);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h
index 540742c4cd8b0efc4c6cf095d7a8b3516f551d4c..c86591fdafa3d33bb3c7d75bf9f4f3b041a7a9cb 100644
--- a/paddle/fluid/operators/reduce_ops/reduce_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -270,3 +270,12 @@ namespace ops = paddle::operators;
   REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__,        \
                     paddle::framework::DefaultGradOpDescMaker<true>);    \
   REGISTER_OPERATOR(op_name##_grad, ops::ReduceGradOp)
+
+#define REGISTER_REDUCE_OP_WITHOUT_GRAD(op_name)                         \
+  class __##op_name##Maker__ : public ops::ReduceOpMaker {               \
+   protected:                                                            \
+    virtual std::string GetName() const { return #op_name; }             \
+    virtual std::string GetOpType() const { return "Reduce " #op_name; } \
+  };                                                                     \
+  REGISTER_OPERATOR(op_name, ops::ReduceOp, __##op_name##Maker__,        \
+                    paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc
index 834dd1eabd68db6c8b571071f8043589c66f8671..b00cc07dea920a6d7caa8b70c99d84b72a785a99 100644
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -40,9 +40,12 @@ class RNNMemoryHelperOp : public framework::OperatorBase {
                    "Cannot find out_var in scope, out_var_name is %s",
                    out_name);
 
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+
     auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
     auto &mem_tensor = mem_var->Get<framework::LoDTensor>();
-    framework::TensorCopySync(mem_tensor, dev_place, out_tensor);
+    framework::TensorCopy(mem_tensor, dev_place, dev_ctx, out_tensor);
     out_tensor->set_lod(mem_tensor.lod());
   }
 };
@@ -92,6 +95,9 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
                    "Cannot find in_grad_var in scope, name is %s",
                    in_grad_var_name);
 
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+
     if (out_grad_var == nullptr) {
       VLOG(5) << "Using fill constant 0 as starting gradient";
       auto in_var_name = Input("X");
@@ -109,7 +115,8 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase {
     } else {
       auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>();
       auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>();
-      framework::TensorCopySync(out_grad_tensor, dev_place, in_grad_tensor);
+      framework::TensorCopy(out_grad_tensor, dev_place, dev_ctx,
+                            in_grad_tensor);
       in_grad_tensor->set_lod(out_grad_tensor.lod());
     }
   }
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
index 7bb10ce063109dbd8520430d2b32ac9370ef8d25..d0dd861af7be80ede75b9d14867087ec687fc1da 100644
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -37,9 +37,11 @@ class ROIAlignOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(rois_dims.size() == 2,
                    "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
                    "given as [[x1, y1, x2, y2], ...].");
-    PADDLE_ENFORCE(rois_dims[1] == 4,
-                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], ...].");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE(rois_dims[1] == 4,
+                     "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
+                     "given as [[x1, y1, x2, y2], ...].");
+    }
     int pooled_height = ctx->Attrs().Get<int>("pooled_height");
     int pooled_width = ctx->Attrs().Get<int>("pooled_width");
     float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
diff --git a/paddle/fluid/operators/row_conv_op.cc b/paddle/fluid/operators/row_conv_op.cc
index 81aabdd0061b3940f23d4731d55fc5cbe5817004..7e9611679ba9a988f40973aaa37f04bcfa48f1ad 100644
--- a/paddle/fluid/operators/row_conv_op.cc
+++ b/paddle/fluid/operators/row_conv_op.cc
@@ -45,9 +45,12 @@ class RowConvOp : public framework::OperatorWithKernel {
     auto filter_dims = ctx->GetInputDim("Filter");
     PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(
-        x_dims[1], filter_dims[1],
-        "The 2nd dimension of Input(X) and Input(Filter) should be same.");
+    if (ctx->IsRuntime() || (x_dims[1] > 0 && filter_dims[1] > 0)) {
+      PADDLE_ENFORCE_EQ(
+          x_dims[1], filter_dims[1],
+          "The 2nd dimension of Input(X) and Input(Filter) should be same.");
+    }
+
     ctx->SetOutputDim("Out", x_dims);
     ctx->ShareLoD("X", "Out");
   }
diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc
index a7f7fb26b17c77e6fe87646d3cac20c02c49b52c..8ce2d52273d7cc3d523e5d77c2c79b9989b9227f 100644
--- a/paddle/fluid/operators/sample_logits_op.cc
+++ b/paddle/fluid/operators/sample_logits_op.cc
@@ -11,8 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/operators/sample_logits_op.h"
+#include <memory>
 #include "paddle/fluid/operators/math/sample_prob.h"
 
 namespace paddle {
@@ -60,6 +60,10 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker {
         "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N, NT + S]."
         "The probabilites of sampled positive and negtive labels.")
         .AsIntermediate();
+    AddOutput("LogitsDim", "Store dim information of Logits for gradient op")
+        .AsIntermediate();
+    AddOutput("LabelsDim", "Store dim information of Logits for gradient op")
+        .AsIntermediate();
     AddOutput("SampledLogits",
               "(Tensor, default: Tensor<float>), A 2-D tensor with shape"
               "[N, NT + S]. The outputs value of sampled logits, which will be"
@@ -121,6 +125,10 @@ class SampleLogitsOp : public framework::OperatorWithKernel {
                    "Output(SampledLogits) should be not null.");
     PADDLE_ENFORCE(ctx->HasOutput("SampledLabels"),
                    "Output(SampledLabels) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("LogitsDim"),
+                   "Output(LogitsDim) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("LabelsDim"),
+                   "Output(LabelsDim) should be not null.");
 
     auto logits_dims = ctx->GetInputDim("Logits");
     auto labels_dims = ctx->GetInputDim("Labels");
@@ -132,11 +140,23 @@ class SampleLogitsOp : public framework::OperatorWithKernel {
                       "The labels should be a 2-D tensor.");
 
     const int num_samples = ctx->Attrs().Get<int>("num_samples");
-    const int num_sampled_classes = labels_dims[1] + num_samples;
+    int num_sampled_classes = labels_dims[1] + num_samples;
+    if ((!ctx->IsRuntime()) && labels_dims[1] <= 0) {
+      num_sampled_classes = -1;
+    }
     ctx->SetOutputDim("Samples", {logits_dims[0], num_sampled_classes});
     ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes});
     ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes});
     ctx->SetOutputDim("SampledLabels", {logits_dims[0], labels_dims[1]});
+
+    // append 0 to shape variable to avoid optimized by memory optimize pass
+    auto logits_dim_vec = framework::vectorize(logits_dims);
+    logits_dim_vec.push_back(0);
+    ctx->SetOutputDim("LogitsDim", framework::make_ddim(logits_dim_vec));
+
+    auto labels_dim_vec = framework::vectorize(labels_dims);
+    labels_dim_vec.push_back(0);
+    ctx->SetOutputDim("LabelsDim", framework::make_ddim(labels_dim_vec));
   }
 
  protected:
@@ -155,28 +175,27 @@ class SampleLogitsOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Logits"),
-                   "Input(Logits) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
-                   "Input(Labels) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("LogitsDim"),
+                   "Input(LogitsDim) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LabelsDim"),
+                   "Input(LabelsDim) should be not null.");
     PADDLE_ENFORCE(ctx->HasInput("Samples"),
                    "Input(Samples) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("SampledLogits"),
-                   "Input(SampledLogits) should be not null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("SampledLogits")),
                    "Input(SampledLogits@Grad) should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
                    "Output(Logits@Grad) should be not null.");
 
-    auto logit_dims = ctx->GetInputDim("Logits");
-    auto label_dims = ctx->GetInputDim("Labels");
-    PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
+    auto logits_dims = ctx->GetInputDim("LogitsDim");
+    logits_dims = framework::DDim(logits_dims.Get(), logits_dims.size() - 1);
+    auto labels_dims = ctx->GetInputDim("LabelsDim");
+    labels_dims = framework::DDim(labels_dims.Get(), labels_dims.size() - 1);
+    PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
                       "The label should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(logit_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(logits_dims.size(), 2UL,
                       "The logits should be a 2-D tensor.");
 
-    ctx->SetOutputDim(framework::GradVarName("Logits"),
-                      ctx->GetInputDim("Logits"));
+    ctx->SetOutputDim(framework::GradVarName("Logits"), logits_dims);
   }
 
  protected:
@@ -199,10 +218,9 @@ class SampleLogitsGradMaker : public framework::SingleGradOpDescMaker {
   std::unique_ptr<framework::OpDesc> Apply() const override {
     auto* grad_op = new framework::OpDesc();
     grad_op->SetType("sample_logits_grad");
-    grad_op->SetInput("Logits", Input("Logits"));
-    grad_op->SetInput("Labels", Input("Labels"));
+    grad_op->SetInput("LogitsDim", Output("LogitsDim"));
+    grad_op->SetInput("LabelsDim", Output("LabelsDim"));
     grad_op->SetInput("Samples", Output("Samples"));
-    grad_op->SetInput("SampledLogits", Output("SampledLogits"));
     grad_op->SetInput(framework::GradVarName("SampledLogits"),
                       OutputGrad("SampledLogits"));
     grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
diff --git a/paddle/fluid/operators/sampling_id_op.cc b/paddle/fluid/operators/sampling_id_op.cc
index a4f41a170426a4650fd3bf8f7fec4758ff34e1b9..36712a8d06d3e9a6f582f8296e2c0c4b4b302eb1 100644
--- a/paddle/fluid/operators/sampling_id_op.cc
+++ b/paddle/fluid/operators/sampling_id_op.cc
@@ -28,15 +28,15 @@ class SamplingIdOp : public framework::OperatorWithKernel {
                    "Input(X) of SamplingIdOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SamplingIdOp should not be null.");
-    PADDLE_ENFORCE(
-        ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
-        "min must less then max");
+    PADDLE_ENFORCE_LT(ctx->Attrs().Get<float>("min"),
+                      ctx->Attrs().Get<float>("max"), "min must less then max");
 
     auto input_dims = ctx->GetInputDim("X");
     PADDLE_ENFORCE(input_dims.size() == 2,
                    "Input(X, Filter) should be 2-D tensor.");
 
-    framework::DDim dims = input_dims;
+    auto dim0 = input_dims[0];
+    framework::DDim dims = framework::make_ddim({dim0});
     ctx->SetOutputDim("Out", dims);
     ctx->ShareLoD("X", "Out");
   }
diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc
index 8e0e3bd6054018852b242d1dba5c250394ed81ce..68ad223b3c311bec5968eb18b50f15e9da84e6d3 100644
--- a/paddle/fluid/operators/scatter_op.cc
+++ b/paddle/fluid/operators/scatter_op.cc
@@ -42,10 +42,6 @@ class ScatterOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
                       ctx->GetInputDim("Ids")[0],
                       "Updates and Ids should have same batch-size.");
-    framework::DDim data_dim(updates_dims);
-    for (int i = 1; i < data_dim.size(); ++i) {
-      PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]);
-    }
     ctx->SetOutputDim("Out", ref_dims);
   }
 
diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h
index ee70281d51673b94a1451f636e607fad3404863b..3a2c9e3f734c8c302e3e4b1c6718b3a236fe897a 100644
--- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h
@@ -49,7 +49,7 @@ class SequenceConvKernel : public framework::OpKernel<T> {
 
     int up_pad = std::max(0, -context_start);
     int down_pad = std::max(0, context_start + context_length - 1);
-    int sequence_width = static_cast<int>(in->dims()[1]);
+    auto sequence_width = static_cast<int64_t>(in->dims()[1]);
 
     framework::DDim col_shape = {in->dims()[0],
                                  context_length * sequence_width};
@@ -62,7 +62,7 @@ class SequenceConvKernel : public framework::OpKernel<T> {
     set_zero(dev_ctx, &col, static_cast<T>(0));
     math::ContextProjectFunctor<DeviceContext, T> seq_project_functor;
 
-    seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable,
+    seq_project_functor(dev_ctx, *in, padding_data, padding_trainable,
                         context_start, context_length, context_stride, up_pad,
                         down_pad, &col);
 
@@ -93,7 +93,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
 
     int up_pad = std::max(0, -context_start);
     int down_pad = std::max(0, context_start + context_length - 1);
-    int sequence_width = static_cast<int>(in->dims()[1]);
+    auto sequence_width = static_cast<int64_t>(in->dims()[1]);
 
     math::SetConstant<DeviceContext, T> set_zero;
     auto& dev_ctx = context.template device_context<DeviceContext>();
@@ -144,7 +144,7 @@ class SequenceConvGradKernel : public framework::OpKernel<T> {
         padding_data = context.Input<Tensor>("PaddingData");
       }
 
-      seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable,
+      seq_project_functor(dev_ctx, *in, padding_data, padding_trainable,
                           context_start, context_length, context_stride, up_pad,
                           down_pad, &col);
 
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index 5c92588cc1d073612d2f6a7b315edf16cc14bedd..1c2726454f3d1fb8545e5d3260e59fcafbcb2aee 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -34,15 +34,22 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputDim("X");
     auto labels_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
-                      "Input(Label)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
-                      "The 1st dimension of Input(X) and Input(Label) should "
-                      "be equal.");
-    PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
-                      "The 2nd dimension of Input(X) and Input(Label) should "
-                      "be equal.");
+
+    int rank = x_dims.size();
+    PADDLE_ENFORCE_EQ(rank, labels_dims.size(),
+                      "Input(X) and Input(Label) shall have the same rank.");
+    bool check = true;
+    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
+                                framework::product(labels_dims) <= 0)) {
+      check = false;
+    }
+
+    if (check) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank),
+                        framework::slice_ddim(labels_dims, 0, rank),
+                        "Input(X) and Input(Label) shall have the same shape "
+                        "except the last dimension.");
+    }
 
     ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
@@ -65,23 +72,24 @@ class SigmoidCrossEntropyWithLogitsGradOp
     auto x_dims = ctx->GetInputDim("X");
     auto labels_dims = ctx->GetInputDim("Label");
     auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
-                      "Input(Label)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(dout_dims.size(), 2,
-                      "Input(Out@Grad)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
-                      "The 1st dimension of Input(X) and Input(Label) should "
-                      "be equal.");
-    PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
-                      "The 2nd dimension of Input(X) and Input(Label) should "
-                      "be equal.");
-    PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0],
-                      "The 1st dimension of Input(X) and Input(Out@Grad) "
-                      "should be equal.");
-    PADDLE_ENFORCE_EQ(x_dims[1], dout_dims[1],
-                      "The 2nd dimension of Input(X) and Input(Out@Grad) "
-                      "should be equal.");
+
+    int rank = x_dims.size();
+    bool check = true;
+    if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
+                                framework::product(labels_dims) <= 0)) {
+      check = false;
+    }
+
+    if (check) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank),
+                        framework::slice_ddim(labels_dims, 0, rank),
+                        "Input(X) and Input(Label) shall have the same shape.");
+
+      PADDLE_ENFORCE_EQ(
+          framework::slice_ddim(x_dims, 0, rank),
+          framework::slice_ddim(dout_dims, 0, rank),
+          "Input(X) and Input(Out@Grad) shall have the same shape.");
+    }
 
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
   }
diff --git a/paddle/fluid/operators/smooth_l1_loss_op.cc b/paddle/fluid/operators/smooth_l1_loss_op.cc
index 622420c1c33a62994c81ad9534c4fa37a4a1fa1a..22b621248d69811898e418b5a0ae609319583e43 100644
--- a/paddle/fluid/operators/smooth_l1_loss_op.cc
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/smooth_l1_loss_op.h"
+#include <memory>
 
 namespace paddle {
 namespace operators {
@@ -27,15 +28,39 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
 
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(x_dims, y_dims);
+    bool check = true;
+    if ((!ctx->IsRuntime()) &&
+        (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) {
+      check = false;
+    }
+    if (check) {
+      PADDLE_ENFORCE_EQ(x_dims, y_dims);
+    }
     PADDLE_ENFORCE_GE(x_dims.size(), 2,
                       "The tensor rank of Input(X) should not be less than 2.");
     if (ctx->HasInput("InsideWeight")) {
       PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"),
                      "If weights are provided, must specify both "
                      "inside and outside weights.");
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims);
+      auto dims = ctx->GetInputDim("InsideWeight");
+      bool check = true;
+      if ((!ctx->IsRuntime()) &&
+          (framework::product(dims) <= 0 || framework::product(x_dims) <= 0)) {
+        check = false;
+      }
+      if (check) {
+        PADDLE_ENFORCE_EQ(dims, x_dims);
+      }
+
+      dims = ctx->GetInputDim("OutsideWeight");
+      check = true;
+      if ((!ctx->IsRuntime()) &&
+          (framework::product(dims) <= 0 || framework::product(x_dims) <= 0)) {
+        check = false;
+      }
+      if (check) {
+        PADDLE_ENFORCE_EQ(dims, x_dims);
+      }
     }
 
     ctx->SetOutputDim("Diff", x_dims);
@@ -110,11 +135,11 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
 
     PADDLE_ENFORCE_GE(out_dims.size(), 2,
                       "The tensor rank of Input(Out@Grad) should be 2.");
-    PADDLE_ENFORCE_EQ(out_dims[0], in_dims[0],
-                      "The 1st dimension of Input(Out@Grad) must be "
-                      "same as input.");
-    PADDLE_ENFORCE_EQ(out_dims[1], 1,
-                      "The 2nd dimension of Input(Out@Grad) must be 1.");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, out_dims[0], in_dims[0],
+                                 "The 1st dimension of Input(Out@Grad) must be "
+                                 "same as input.");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, out_dims[1], 1, "The 2nd dimension of Input(Out@Grad) must be 1.");
 
     auto x_grad_name = framework::GradVarName("X");
     auto y_grad_name = framework::GradVarName("Y");
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
index fda971b20e27b68cab6110c323469f0d1c77cb59..7c024e50dd21abea4e29257ebc53f7baa7f25235 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc
@@ -106,24 +106,36 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
 
     auto logits_dims = ctx->GetInputDim("Logits");
     auto labels_dims = ctx->GetInputDim("Label");
+
+    int rank = logits_dims.size();
     PADDLE_ENFORCE_EQ(
-        logits_dims.size(), 2UL,
-        "The input of softmax_with_cross_entropy should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
-                      "The labels should be a 2-D tensor.");
+        rank, labels_dims.size(),
+        "Input(logits) and Input(Label) shall have the same rank.");
+    bool check = ctx->IsRuntime() || (framework::product(logits_dims) > 0 &&
+                                      framework::product(labels_dims) > 0);
+    if (check) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(logits_dims, 0, rank - 1),
+                        framework::slice_ddim(labels_dims, 0, rank - 1),
+                        "Input(X) and Input(Label) shall have the same shape "
+                        "except the last dimension.");
+    }
 
     if (ctx->Attrs().Get<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(logits_dims[1], labels_dims[1],
-                        "If Attr(soft_label) == true, the 2nd dimension of "
-                        "Input(X) and Input(Label) should be equal.");
+      if (check) {
+        PADDLE_ENFORCE_EQ(logits_dims[rank - 1], labels_dims[rank - 1],
+                          "If Attr(soft_label) == true, the last dimension of "
+                          "Input(X) and Input(Label) should be equal.");
+      }
     } else {
-      PADDLE_ENFORCE_EQ(labels_dims[1], 1UL,
-                        "If Attr(soft_label) == false, the 2nd dimension of "
+      PADDLE_ENFORCE_EQ(labels_dims[rank - 1], 1UL,
+                        "If Attr(softLabel) == false, the last dimension of "
                         "Input(Label) should be 1.");
     }
 
     ctx->SetOutputDim("Softmax", logits_dims);
-    ctx->SetOutputDim("Loss", {logits_dims[0], 1});
+    auto loss_dims = logits_dims;
+    loss_dims[rank - 1] = 1;
+    ctx->SetOutputDim("Loss", loss_dims);
 
     ctx->ShareLoD("Logits", /*->*/ "Softmax");
     ctx->ShareLoD("Logits", /*->*/ "Loss");
@@ -152,16 +164,33 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
 
     auto softmax_dims = ctx->GetInputDim("Softmax");
     auto labels_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
-                      "The labels should be a 2-D tensor.");
+
+    int rank = softmax_dims.size();
+    PADDLE_ENFORCE_EQ(
+        rank, labels_dims.size(),
+        "Input(logits) and Input(Label) shall have the same rank.");
+    bool check = true;
+    if ((!ctx->IsRuntime()) && (framework::product(softmax_dims) <= 0 ||
+                                framework::product(labels_dims) <= 0)) {
+      check = false;
+    }
+    if (check) {
+      PADDLE_ENFORCE_EQ(
+          framework::slice_ddim(softmax_dims, 0, rank - 1),
+          framework::slice_ddim(labels_dims, 0, rank - 1),
+          "Input(Softmax) and Input(Label) shall have the same shape "
+          "except the last dimension.");
+    }
 
     if (ctx->Attrs().Get<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(softmax_dims[1], labels_dims[1],
-                        "When Attr(soft_label) == true, the 2nd dimension of "
-                        "Input(X) and Input(Label) should be equal.");
+      if (check) {
+        PADDLE_ENFORCE_EQ(softmax_dims[rank - 1], labels_dims[rank - 1],
+                          "If Attr(soft_label) == true, the last dimension of "
+                          "Input( Softmax) and Input(Label) should be equal.");
+      }
     } else {
-      PADDLE_ENFORCE_EQ(labels_dims[1], 1UL,
-                        "When Attr(soft_label) == false, the 2nd dimension of "
+      PADDLE_ENFORCE_EQ(labels_dims[rank - 1], 1UL,
+                        "If Attr(softLabel) == false, the last dimension of "
                         "Input(Label) should be 1.");
     }
 
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
index 89aaac4cbe6399af08b3d340896df7a07e1be543..d3b8538124f60cd1c6f5849ee5078b83e4485c9f 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -400,9 +400,15 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
 
     auto soft_label = context.Attr<bool>("soft_label");
     auto ignore_index = context.Attr<int>("ignore_index");
+
+    int rank = logits->dims().size();
     if (soft_label) {
-      int batch_size = logits->dims()[0];
-      int feature_size = logits->dims()[1];
+      int batch_size = 1;
+      for (int i = 0; i < rank - 1; ++i) {
+        batch_size *= logits->dims()[i];
+      }
+
+      int feature_size = logits->dims()[rank - 1];
       auto* logits_data = logits->data<T>();
       auto* labels_data = labels->data<T>();
       SoftmaxWithCrossEntropyFusedKernel(
@@ -410,14 +416,23 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
           feature_size, context.cuda_device_context().stream());
     } else {
       if (!context.Attr<bool>("numeric_stable_mode")) {
-        math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
-                                       softmax);
+        // reshape to 2d
+        Tensor logits_2d = framework::ReshapeToMatrix(*logits, rank - 1);
+        Tensor softmax_2d = framework::ReshapeToMatrix(*softmax, rank - 1);
+        Tensor loss_2d = framework::ReshapeToMatrix(*loss, rank - 1);
+        Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
+
+        math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(),
+                                       &logits_2d, &softmax_2d);
         math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
-            context.cuda_device_context(), loss, softmax, labels, false,
-            ignore_index);
+            context.cuda_device_context(), &loss_2d, &softmax_2d, &labels_2d,
+            false, ignore_index);
       } else {
-        int batch_size = logits->dims()[0];
-        int feature_size = logits->dims()[1];
+        int batch_size = 1;
+        for (int i = 0; i < rank - 1; ++i) {
+          batch_size *= logits->dims()[i];
+        }
+        int feature_size = logits->dims()[rank - 1];
         auto* logits_data = logits->data<T>();
         auto* labels_data = labels->data<int64_t>();
         HardLabelSoftmaxWithCrossEntropy<T>(
@@ -443,8 +458,13 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
                           context.device_context(), logit_grad);
     T* logit_grad_data = logit_grad->data<T>();
 
-    const int batch_size = logit_grad->dims()[0];
-    const int class_num = logit_grad->dims()[1];
+    int rank = logit_grad->dims().size();
+    int batch_size = 1;
+    for (int i = 0; i < rank - 1; ++i) {
+      batch_size *= logit_grad->dims()[i];
+    }
+
+    const int class_num = logit_grad->dims()[rank - 1];
     int block = 512;
     auto stream = context.cuda_device_context().stream();
     auto ignore_index = context.Attr<int>("ignore_index");
diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
index 1042cbdcf5e96f0dd3780793cf1f233dc32c3eec..8cba960c7633af56cbf74d662f77af9bb32994ca 100644
--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h
@@ -40,15 +40,22 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
     softmax->mutable_data<T>(context.GetPlace());
     loss->mutable_data<T>(context.GetPlace());
 
-    int axis_dim = logits->dims()[logits->dims().size() - 1];
+    // reshape to 2D tensor
+    int rank = logits->dims().size();
+    Tensor logits_2d = framework::ReshapeToMatrix(*logits, rank - 1);
+    Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
+    Tensor loss_2d = framework::ReshapeToMatrix(*loss, rank - 1);
+    Tensor softmax_2d = framework::ReshapeToMatrix(*softmax, rank - 1);
+
+    int axis_dim = logits->dims()[rank - 1];
 
     auto& dev_ctx =
         context.template device_context<platform::CPUDeviceContext>();
     math::SoftmaxFunctor<platform::CPUDeviceContext, T, false>()(
-        dev_ctx, axis_dim, logits, softmax);
+        dev_ctx, axis_dim, &logits_2d, &softmax_2d);
     math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
-        dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"),
-        context.Attr<int>("ignore_index"));
+        dev_ctx, &loss_2d, &softmax_2d, &labels_2d,
+        context.Attr<bool>("soft_label"), context.Attr<int>("ignore_index"));
   }
 };
 
@@ -63,13 +70,19 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
         context.Output<Tensor>(framework::GradVarName("Logits"));
     logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
 
-    const int class_num = logit_grad->dims()[1];
-    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
-    auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
+    int rank = logit_grad->dims().size();
+    const int class_num = logit_grad->dims()[rank - 1];
+    // reshape to 2d
+    Tensor logit_grad_2d = framework::ReshapeToMatrix(*logit_grad, rank - 1);
+    Tensor out_grad_2d = framework::ReshapeToMatrix(*out_grad, rank - 1);
+
+    auto out_grad_mat = EigenMatrix<T>::From(out_grad_2d);
+    auto logit_grad_mat = EigenMatrix<T>::From(logit_grad_2d);
     auto& place = *context.template device_context<platform::CPUDeviceContext>()
                        .eigen_device();
     if (context.Attr<bool>("soft_label")) {
-      auto lbl_mat = EigenMatrix<T>::From(*labels);
+      Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
+      auto lbl_mat = EigenMatrix<T>::From(labels_2d);
       logit_grad_mat.device(place) =
           out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) *
           (logit_grad_mat - lbl_mat);
@@ -78,7 +91,8 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
           logit_grad_mat *
           out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num));
 
-      const int batch_size = logit_grad->dims()[0];
+      const int batch_size = logit_grad_2d.dims()[0];
+
       const int64_t* label_data = labels->data<int64_t>();
       T* logit_grad_data = logit_grad->data<T>();
       const T* out_grad_data = out_grad->data<T>();
diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc
index b579244673fa1618c282c4d4fedf2ba6d1726a82..3d66613248c27f683faf6e3f075c495ed6e71b06 100644
--- a/paddle/fluid/operators/space_to_depth_op.cc
+++ b/paddle/fluid/operators/space_to_depth_op.cc
@@ -13,12 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/space_to_depth_op.h"
+
+#include <memory>
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
+
 namespace paddle {
 namespace operators {
 
+using Tensor = framework::Tensor;
+
 class SpaceToDepthOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -34,19 +40,44 @@ class SpaceToDepthOp : public framework::OperatorWithKernel {
     auto blocksize = ctx->Attrs().Get<int64_t>("blocksize");
 
     PADDLE_ENFORCE_GT(blocksize, 1, "The blocksize should be Greater than 1");
-    PADDLE_ENFORCE_GT(x_dims[1], 0, "input channel should be Greater than 0");
-    PADDLE_ENFORCE_GT(x_dims[2], 0, "input Height should be Greater than 0");
-    PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0");
-
-    PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0,
-                      "input channel should be divisible of the square of "
-                      "SpaceToDepthOp blocksize");
-    PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0,
-                      "input Height should be divisible of the square of "
-                      "SpaceToDepthOp blocksize");
-    PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0,
-                      "input Width should be divisible of the square of "
-                      "SpaceToDepthOp blocksize");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_GT(x_dims[1], 0, "input channel should be Greater than 0");
+      PADDLE_ENFORCE_GT(x_dims[2], 0, "input Height should be Greater than 0");
+      PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0");
+
+      PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0,
+                        "input channel should be divisible of the square of "
+                        "SpaceToDepthOp blocksize");
+      PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0,
+                        "input Height should be divisible of the square of "
+                        "SpaceToDepthOp blocksize");
+      PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0,
+                        "input Width should be divisible of the square of "
+                        "SpaceToDepthOp blocksize");
+    } else {
+      if (x_dims[1] != -1) {
+        PADDLE_ENFORCE_GT(x_dims[1], 0,
+                          "input channel should be Greater than 0");
+        PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0,
+                          "input channel should be divisible of the square of "
+                          "SpaceToDepthOp blocksize");
+      }
+      if (x_dims[2] != -1) {
+        PADDLE_ENFORCE_GT(x_dims[2], 0,
+                          "input Height should be Greater than 0");
+        PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0,
+                          "input Height should be divisible of the square of "
+                          "SpaceToDepthOp blocksize");
+      }
+
+      if (x_dims[3] != -1) {
+        PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0");
+
+        PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0,
+                          "input Width should be divisible of the square of "
+                          "SpaceToDepthOp blocksize");
+      }
+    }
 
     VLOG(3) << "SpaceToDepthOp operator x.shape=" << x_dims
             << "Attribute blocksize" << blocksize << std::endl;
@@ -100,6 +131,28 @@ class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SpaceToDepthGradOpNoBuffer, "X");
+
+class SpaceToDepthGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+
+    op->SetType("space_to_depth_grad");
+
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("X", Input("X"));
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 class SpaceToDepthGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -110,6 +163,14 @@ class SpaceToDepthGradOp : public framework::OperatorWithKernel {
                    "Input(Out@GRAD) shouldn't be null.");
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        ctx.Input<Tensor>(framework::GradVarName("Out"))->type(),
+        ctx.GetPlace());
+  }
 };
 }  // namespace operators
 }  // namespace paddle
@@ -117,8 +178,9 @@ class SpaceToDepthGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 
 REGISTER_OPERATOR(space_to_depth, ops::SpaceToDepthOp, ops::SpaceToDepthOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(space_to_depth_grad, ops::SpaceToDepthGradOp);
+                  ops::SpaceToDepthGradOpDescMaker);
+REGISTER_OPERATOR(space_to_depth_grad, ops::SpaceToDepthGradOp,
+                  ops::SpaceToDepthGradOpNoBuffer);
 REGISTER_OP_CPU_KERNEL(
     space_to_depth,
     ops::SpaceToDepthKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/fluid/operators/spectral_norm_op.cc b/paddle/fluid/operators/spectral_norm_op.cc
index 04f659a465a345653d251cbe6703309c804fe614..ec5ee487729d0650983d553dbffe14b63c16b26a 100644
--- a/paddle/fluid/operators/spectral_norm_op.cc
+++ b/paddle/fluid/operators/spectral_norm_op.cc
@@ -56,13 +56,19 @@ class SpectralNormOp : public framework::OperatorWithKernel {
     }
     auto dim_u = ctx->GetInputDim("U");
     auto dim_v = ctx->GetInputDim("V");
-    PADDLE_ENFORCE_EQ(dim_u[0], h,
-                      "Input(U) dims[0] should be equal to "
-                      "Input(Weight) dims[Attr(dim)]");
-    PADDLE_ENFORCE_EQ(
-        dim_v[0], w,
-        "Input(V) dims[0] should be equal to "
-        "the product of Input(Weight) dims except dims[Attr(dim)]");
+
+    if (ctx->IsRuntime() || (dim_u[0] > 0 && h > 0)) {
+      PADDLE_ENFORCE_EQ(dim_u[0], h,
+                        "Input(U) dims[0] should be equal to "
+                        "Input(Weight) dims[Attr(dim)]");
+    }
+
+    if (ctx->IsRuntime() || (dim_v[0] > 0 && w > 0)) {
+      PADDLE_ENFORCE_EQ(
+          dim_v[0], w,
+          "Input(V) dims[0] should be equal to "
+          "the product of Input(Weight) dims except dims[Attr(dim)]");
+    }
 
     ctx->SetOutputDim("Out", dim_weight);
     ctx->ShareLoD("Weight", /*->*/ "Out");
diff --git a/paddle/fluid/operators/split_lod_tensor_op.cc b/paddle/fluid/operators/split_lod_tensor_op.cc
index 5ede972c71ff3ef8ff00756b97662aabb54d6349..c89e683d766580113d160eecf4f04d4ead4594f6 100644
--- a/paddle/fluid/operators/split_lod_tensor_op.cc
+++ b/paddle/fluid/operators/split_lod_tensor_op.cc
@@ -157,7 +157,9 @@ class SplitLoDTensorInferShape : public framework::InferShapeBase {
 
     auto mask_dim = context->GetInputDim("Mask");
     PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
-    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+    if (context->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+    }
 
     context->SetOutputDim("OutTrue", context->GetInputDim("X"));
     context->SetOutputDim("OutFalse", context->GetInputDim("X"));
diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc
index a05582ae09e16ee17194d299d713d321f28ccace..a43bad878179d02c41d8c8bcd6b43eaffaa6e9a2 100644
--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -39,14 +39,22 @@ class SplitOp : public framework::OperatorWithKernel {
 
     if (num > 0) {
       int64_t in_axis_dim = in_dims[axis];
-      PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
-                        "tensor split does not result"
-                        " in an equal division");
-      size_t out_axis_dim = in_axis_dim / num;
-      for (size_t i = 0; i < outs_number; ++i) {
-        auto dim = in_dims;
-        dim[axis] = out_axis_dim;
-        outs_dims.push_back(dim);
+      if (ctx->IsRuntime() || in_axis_dim > 0) {
+        PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
+                          "tensor split does not result"
+                          " in an equal division");
+        size_t out_axis_dim = in_axis_dim / num;
+        for (size_t i = 0; i < outs_number; ++i) {
+          auto dim = in_dims;
+          dim[axis] = out_axis_dim;
+          outs_dims.push_back(dim);
+        }
+      } else {
+        for (size_t i = 0; i < outs_number; ++i) {
+          auto dim = in_dims;
+          dim[axis] = -1;
+          outs_dims.push_back(dim);
+        }
       }
     } else if (sections.size() > 0) {
       PADDLE_ENFORCE_EQ(sections.size(), outs_number,
diff --git a/paddle/fluid/operators/squared_l2_distance_op.cc b/paddle/fluid/operators/squared_l2_distance_op.cc
index 42532a294b2ef9ffdb240fac8596278047daf7fe..6e82bf407496ab2d37d3fe81aacccfc128d57aec 100644
--- a/paddle/fluid/operators/squared_l2_distance_op.cc
+++ b/paddle/fluid/operators/squared_l2_distance_op.cc
@@ -14,6 +14,10 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/squared_l2_distance_op.h"
 
+#include <memory>
+
+#include "paddle/fluid/framework/no_need_buffer_vars_inference.h"
+
 namespace paddle {
 namespace operators {
 
@@ -41,19 +45,60 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
 
     int rank = framework::arity(x_dims);
     PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2.");
-    PADDLE_ENFORCE_EQ(product(x_dims) / x_dims[0], product(y_dims) / y_dims[0],
-                      "Product of dimensions expcet the first dimension of "
-                      "input and target must be equal.");
-    PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0],
-                   "First dimension of target must be equal to input "
-                   "or to 1.");
-
+    bool check = true;
+    if ((!ctx->IsRuntime()) &&
+        (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) {
+      check = false;
+    }
+    if (check) {
+      PADDLE_ENFORCE_EQ(product(x_dims) / x_dims[0],
+                        product(y_dims) / y_dims[0],
+                        "Product of dimensions expcet the first dimension of "
+                        "input and target must be equal.");
+    }
+    check = true;
+    if ((!ctx->IsRuntime()) && (y_dims[0] <= 0 || x_dims[0] <= 0)) {
+      check = false;
+    }
+    if (check) {
+      PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0],
+                     "First dimension of target must be equal to input "
+                     "or to 1.");
+    }
     ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]});
     ctx->SetOutputDim("Out", {x_dims[0], 1});
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
+DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(SquaredL2DistanceGradOpNoBuffer, "X",
+                                      "Y");
+
+class SquaredL2DistanceGradOpDescMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+
+    op->SetType("squared_l2_distance_grad");
+
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("sub_result", Output("sub_result"));
+    op->SetInput("X", Input("X"));
+    op->SetInput("Y", Input("Y"));
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("Y"), InputGrad("Y"));
+
+    op->SetAttrMap(Attrs());
+
+    return op;
+  }
+};
+
 class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -88,20 +133,28 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Gradient of Out should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("sub_result"), "SubResult should not be null");
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0],
-                      "First dimension of output gradient and "
-                      "input value must be equal.");
-    PADDLE_ENFORCE_EQ(out_dims[1], 1,
-                      "Second dimension of output gradient "
-                      "must be 1.");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, out_dims[0], x_dims[0],
+                                 "First dimension of output gradient and "
+                                 "input value must be equal.");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, out_dims[1], 1,
+                                 "Second dimension of output gradient "
+                                 "must be 1.");
     auto x_grad_name = framework::GradVarName("X");
     auto y_grad_name = framework::GradVarName("Y");
     if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims);
     if (ctx->HasOutput(y_grad_name)) ctx->SetOutputDim(y_grad_name, y_dims);
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("sub_result")->type(),
+                                   ctx.GetPlace());
+  }
 };
 
 }  // namespace operators
@@ -110,8 +163,9 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(squared_l2_distance, ops::SquaredL2DistanceOp,
                   ops::SquaredL2DistanceOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp);
+                  ops::SquaredL2DistanceGradOpDescMaker);
+REGISTER_OPERATOR(squared_l2_distance_grad, ops::SquaredL2DistanceGradOp,
+                  ops::SquaredL2DistanceGradOpNoBuffer);
 REGISTER_OP_CPU_KERNEL(
     squared_l2_distance,
     ops::SquaredL2DistanceKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h
index e0133d33e6a840d2d06832393a064df978cb9cbc..12a8f05b5a603417ead8ebd250ff7951f928f4a1 100644
--- a/paddle/fluid/operators/squared_l2_distance_op.h
+++ b/paddle/fluid/operators/squared_l2_distance_op.h
@@ -77,6 +77,9 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
     auto* x_g = context.Output<Tensor>(framework::GradVarName("X"));
     auto* y_g = context.Output<Tensor>(framework::GradVarName("Y"));
 
+    PADDLE_ENFORCE_NOT_NULL(x_g);
+    PADDLE_ENFORCE_NOT_NULL(y_g);
+
     auto sub_result = EigenMatrix<T>::From(*in0);
     auto out_grad = EigenMatrix<T>::From(*in1);
 
@@ -92,31 +95,28 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
     // propagate back to input
     auto& eigen_place =
         *context.template device_context<DeviceContext>().eigen_device();
-    if (x_g) {
-      x_g->mutable_data<T>(context.GetPlace());
-      // eigen matrix
-      auto x_grad =
-          EigenMatrix<T>::From(*x_g, framework::make_ddim({x_dims[0], cols}));
-      // dimensions are same with subResult
-      x_grad.device(eigen_place) = grad_mat;
-    }
 
-    if (y_g) {
-      y_g->mutable_data<T>(context.GetPlace());
-
-      PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0],
-                        "First dimension of gradient must be greater or "
-                        "equal than first dimension of target.");
-
-      if (sub_result.dimensions()[0] == y_dims[0]) {
-        auto y_grad =
-            EigenMatrix<T>::From(*y_g, framework::make_ddim({y_dims[0], cols}));
-        y_grad.device(eigen_place) = -1 * grad_mat;
-      } else {
-        auto col_sum_res = -1 * (grad_mat.sum(Eigen::array<int, 1>({{0}})));
-        auto y_grad = EigenVector<T>::Flatten(*y_g);
-        y_grad.device(eigen_place) = col_sum_res;
-      }
+    x_g->mutable_data<T>(context.GetPlace());
+    // eigen matrix
+    auto x_grad =
+        EigenMatrix<T>::From(*x_g, framework::make_ddim({x_dims[0], cols}));
+    // dimensions are same with subResult
+    x_grad.device(eigen_place) = grad_mat;
+
+    y_g->mutable_data<T>(context.GetPlace());
+
+    PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0],
+                      "First dimension of gradient must be greater or "
+                      "equal than first dimension of target.");
+
+    if (sub_result.dimensions()[0] == y_dims[0]) {
+      auto y_grad =
+          EigenMatrix<T>::From(*y_g, framework::make_ddim({y_dims[0], cols}));
+      y_grad.device(eigen_place) = -1 * grad_mat;
+    } else {
+      auto col_sum_res = -1 * (grad_mat.sum(Eigen::array<int, 1>({{0}})));
+      auto y_grad = EigenVector<T>::Flatten(*y_g);
+      y_grad.device(eigen_place) = col_sum_res;
     }
   }
 };
diff --git a/paddle/fluid/operators/squared_l2_norm_op.cc b/paddle/fluid/operators/squared_l2_norm_op.cc
index 7bd82e0ce4add6d4434e1defaee43da178a6f309..9d2deb678ecf714421f507af88e7eabade7ecb68 100644
--- a/paddle/fluid/operators/squared_l2_norm_op.cc
+++ b/paddle/fluid/operators/squared_l2_norm_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/squared_l2_norm_op.h"
 
+#include <memory>
+
 namespace paddle {
 namespace operators {
 
@@ -31,6 +33,26 @@ class SquaredL2NormOp : public framework::OperatorWithKernel {
   }
 };
 
+class SquaredL2NormGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+
+    op->SetType("squared_l2_norm_grad");
+
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("X", Input("X"));
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 class SquaredL2NormGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -67,8 +89,7 @@ $$Out = \sum_{i} X_{i}^2$$
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(squared_l2_norm, ops::SquaredL2NormOp,
-                  ops::SquaredL2NormOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::SquaredL2NormOpMaker, ops::SquaredL2NormGradOpDescMaker);
 REGISTER_OPERATOR(squared_l2_norm_grad, ops::SquaredL2NormGradOp);
 REGISTER_OP_CPU_KERNEL(
     squared_l2_norm,
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index 1391148ccf5d13082cb31ef2e143249e8ef95bfc..67f7510e874d4b3dcb857510e42cbfa7081becfe 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -65,7 +65,21 @@ class SumOp : public framework::OperatorWithKernel {
       if (framework::product(in_dim) == 0) {
         in_dim = x_dim;
       } else {
-        PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must have same shape");
+        if (ctx->IsRuntime()) {
+          PADDLE_ENFORCE_EQ(in_dim, x_dim,
+                            "Input tensors must have same shape");
+        } else {
+          PADDLE_ENFORCE_EQ(in_dim.size(), x_dim.size(),
+                            "Input tensors must have same shape size");
+          // if in_dim or x_dim has -1, not check equal
+          for (int i = 0; i < x_dim.size(); ++i) {
+            if (x_dim[i] == -1 || in_dim[i] == -1) {
+              continue;
+            }
+            PADDLE_ENFORCE_EQ(in_dim[i], x_dim[i],
+                              "Input tensors must have same shape if not -1");
+          }
+        }
       }
     }
     ctx->SetOutputDim("Out", in_dim);
diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
index 640644a94690d9682a5e6b1aa788a9ebdc5d2a54..7f95d16f09b5182e4da33763751ac87b53f41cf3 100644
--- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
+++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/operators/teacher_student_sigmoid_loss_op.h"
+
+#include <memory>
+
 #include "paddle/fluid/operators/math/math_function.h"
 
 namespace paddle {
@@ -34,12 +37,14 @@ class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
                       "Input(Label)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
-                      "The 1st dimension of Input(X) and Input(Label) should "
-                      "be equal.");
-    PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
-                      "The 2nd dimension of "
-                      "Input(Label) should be 1.");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+                        "The 1st dimension of Input(X) and Input(Label) should "
+                        "be equal.");
+      PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
+                        "The 2nd dimension of "
+                        "Input(Label) should be 1.");
+    }
     ctx->SetOutputDim("Y", {x_dims[0], 1});
     ctx->ShareLoD("X", /*->*/ "Y");
   }
@@ -55,6 +60,28 @@ class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel {
   }
 };
 
+class TeacherStudentSigmoidLossGradOpDescMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+
+    op->SetType("teacher_student_sigmoid_loss_grad");
+
+    op->SetInput("X", Input("X"));
+    op->SetInput("Label", Input("Label"));
+    op->SetInput(framework::GradVarName("Y"), OutputGrad("Y"));
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 class TeacherStudentSigmoidLossGradientOp
     : public framework::OperatorWithKernel {
  public:
@@ -74,17 +101,20 @@ class TeacherStudentSigmoidLossGradientOp
     PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
     PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
-                      "The 1st dimension of Input(X) and Input(Label) should "
-                      "be equal.");
-    PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
-                      "The 1st dimension of Input(X) and Input(Y@Grad) should "
-                      "be equal.");
-    PADDLE_ENFORCE_EQ(dy_dims[1], 1,
-                      "The 2nd dimension of Input(Y@Grad) should be 1.");
-    PADDLE_ENFORCE_EQ(label_dims[1], 1,
-                      "When Attr(soft_label) == false, the 2nd dimension of "
-                      "Input(Label) should be 1.");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+                        "The 1st dimension of Input(X) and Input(Label) should "
+                        "be equal.");
+      PADDLE_ENFORCE_EQ(
+          x_dims[0], dy_dims[0],
+          "The 1st dimension of Input(X) and Input(Y@Grad) should "
+          "be equal.");
+      PADDLE_ENFORCE_EQ(dy_dims[1], 1,
+                        "The 2nd dimension of Input(Y@Grad) should be 1.");
+      PADDLE_ENFORCE_EQ(label_dims[1], 1,
+                        "When Attr(soft_label) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
+    }
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
     ctx->ShareLoD("X", framework::GradVarName("X"));
   }
@@ -148,7 +178,7 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(teacher_student_sigmoid_loss,
                   ops::TeacherStudentSigmoidLossOp,
                   ops::TeacherStudentSigmoidLossOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::TeacherStudentSigmoidLossGradOpDescMaker);
 
 REGISTER_OPERATOR(teacher_student_sigmoid_loss_grad,
                   ops::TeacherStudentSigmoidLossGradientOp);
diff --git a/paddle/fluid/operators/tree_conv_op.cc b/paddle/fluid/operators/tree_conv_op.cc
index 615ea285e54b97a8fb81acfef9bf0d18ac4e914d..566939afaa4b435c58717a49cfdec69d6c616587 100644
--- a/paddle/fluid/operators/tree_conv_op.cc
+++ b/paddle/fluid/operators/tree_conv_op.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/tree_conv_op.h"
+
+#include <memory>
 #include <string>
 
 namespace paddle {
@@ -62,17 +64,38 @@ class TreeConvOp : public framework::OperatorWithKernel {
     auto edge_dims = ctx->GetInputDim("EdgeSet");
     auto vector_dims = ctx->GetInputDim("NodesVector");
     auto filter_dims = ctx->GetInputDim("Filter");
-    PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
+
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
+    } else {
+      if (edge_dims[2] != -1) {
+        PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
+      }
+    }
     PADDLE_ENFORCE_EQ(edge_dims.size(), 3,
                       "The dimension of EdgeSet Tensor should be 3");
     PADDLE_ENFORCE_EQ(vector_dims.size(), 3,
                       "The dimension of NodesVector Tensor should be 3");
     PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
                       "The dimension of Filter Tensor should be 4");
-    PADDLE_ENFORCE_EQ(filter_dims[1], 3, "Input(Filter) dim[1] should be 3");
-    PADDLE_ENFORCE_EQ(
-        filter_dims[0], vector_dims[2],
-        "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]");
+
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(filter_dims[1], 3, "Input(Filter) dim[1] should be 3");
+      PADDLE_ENFORCE_EQ(
+          filter_dims[0], vector_dims[2],
+          "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]");
+    } else {
+      if (filter_dims[1] != -1) {
+        PADDLE_ENFORCE_EQ(filter_dims[1], 3,
+                          "Input(Filter) dim[1] should be 3");
+      }
+
+      if (filter_dims[0] != -1 && vector_dims[2] != -1) {
+        PADDLE_ENFORCE_EQ(
+            filter_dims[0], vector_dims[2],
+            "Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]");
+      }
+    }
     auto output_dims = framework::make_ddim(
         {vector_dims[0], vector_dims[1], filter_dims[2], filter_dims[3]});
     ctx->SetOutputDim("Out", output_dims);
@@ -86,6 +109,30 @@ class TreeConvOp : public framework::OperatorWithKernel {
   }
 };
 
+class TreeConvGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+
+    op->SetType("tree_conv_grad");
+
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetInput("Filter", Input("Filter"));
+    op->SetInput("EdgeSet", Input("EdgeSet"));
+    op->SetInput("NodesVector", Input("NodesVector"));
+
+    op->SetOutput(framework::GradVarName("NodesVector"),
+                  InputGrad("NodesVector"));
+    op->SetOutput(framework::GradVarName("Filter"), InputGrad("Filter"));
+
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 class TreeConvGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -115,7 +162,7 @@ class TreeConvGradOp : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(tree_conv, ops::TreeConvOp, ops::TreeConvOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::TreeConvGradOpDescMaker);
 
 REGISTER_OPERATOR(tree_conv_grad, ops::TreeConvGradOp);
 
diff --git a/paddle/fluid/operators/unpool_op.cc b/paddle/fluid/operators/unpool_op.cc
index 11e505d6df3beda7053c59b66a29ec2badde3b75..86b4c06a27cc63fca8ec077cb3044ffe9415e01d 100644
--- a/paddle/fluid/operators/unpool_op.cc
+++ b/paddle/fluid/operators/unpool_op.cc
@@ -99,10 +99,15 @@ class UnpoolOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(in_x_dims.size() == 4,
                    "Unpooling intput must be of 4-dimensional.");
     PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims);
+
     std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
     for (size_t i = 0; i < ksize.size(); ++i) {
-      output_shape.push_back(UnpoolOutputSize(in_x_dims[i + 2], ksize[i],
-                                              paddings[i], strides[i]));
+      if (!ctx->IsRuntime() && in_x_dims[i + 2] <= 0) {
+        output_shape.push_back(-1);
+      } else {
+        output_shape.push_back(UnpoolOutputSize(in_x_dims[i + 2], ksize[i],
+                                                paddings[i], strides[i]));
+      }
     }
     ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
   }
diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc
index e2ae7caae1ebe46b30c811ae4537f718ca587939..217d400bb3c20b4b9e6117074cebbb35161017fd 100644
--- a/paddle/fluid/operators/warpctc_op.cc
+++ b/paddle/fluid/operators/warpctc_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/warpctc_op.h"
 
+#include <memory>
+
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/cudnn_helper.h"
 #endif
@@ -118,6 +120,27 @@ http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf).
   }
 };
 
+class WarpCTCGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    std::unique_ptr<framework::OpDesc> op(new framework::OpDesc());
+
+    op->SetType("warpctc_grad");
+
+    op->SetInput("WarpCTCGrad", Output("WarpCTCGrad"));
+    op->SetInput("Logits", Input("Logits"));
+    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
+
+    op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
+
+    op->SetAttrMap(Attrs());
+    return op;
+  }
+};
+
 class WarpCTCGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -145,7 +168,7 @@ class WarpCTCGradOp : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker,
-                  paddle::framework::DefaultGradOpDescMaker<true>);
+                  ops::WarpCTCGradOpDescMaker);
 REGISTER_OPERATOR(warpctc_grad, ops::WarpCTCGradOp);
 REGISTER_OP_CPU_KERNEL(
     warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index a2669ee2113630332102549fd7e5c1d85e9972b6..5de00db55add1ebc0e7d81b14934a105fd3fe474 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -45,13 +45,12 @@ cc_library(cpu_helper SRCS cpu_helper.cc DEPS cblas enforce)
 cc_test(cpu_helper_test SRCS cpu_helper_test.cc DEPS cpu_helper)
 
 set(dgc_deps "")
+IF(WITH_DGC)
+    set(dgc_deps dgc)
+ENDIF()
+
 IF(WITH_GPU)
     set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
-    if(NOT WIN32)
-        set(dgc_deps dgc)
-    endif()
-ELSE()
-    set(dgc_deps)
 ENDIF()
 
 IF(WITH_MKLDNN)
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index bdb1d1bd3bf47ea89984587ae84d2aa84be232a4..127be44525beca0e2273e591cf2ea5fb332782b4 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -356,5 +356,46 @@ using CommonType2 = typename std::add_lvalue_reference<
 #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
   __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
 
+#define __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL1, __VAL2, __CMP, \
+                                           __INV_CMP, ...)               \
+  do {                                                                   \
+    auto __val1 = (__VAL1);                                              \
+    auto __val2 = (__VAL2);                                              \
+    if (!__CTX->IsRuntime()) {                                           \
+      if (__val1 == -1 || __val2 == -1) {                                \
+        break;                                                           \
+      }                                                                  \
+    }                                                                    \
+    using __TYPE1__ = decltype(__val1);                                  \
+    using __TYPE2__ = decltype(__val2);                                  \
+    using __COMMON_TYPE1__ =                                             \
+        ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>;  \
+    using __COMMON_TYPE2__ =                                             \
+        ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>;  \
+    bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP(  \
+        static_cast<__COMMON_TYPE2__>(__val2));                          \
+    if (UNLIKELY(!__is_not_error)) {                                     \
+      PADDLE_THROW("Enforce failed. Expected %s " #__CMP                 \
+                   " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s",  \
+                   #__VAL1, #__VAL2, #__VAL1,                            \
+                   ::paddle::string::to_string(__val1), #__VAL2,         \
+                   ::paddle::string::to_string(__val2),                  \
+                   ::paddle::string::Sprintf(__VA_ARGS__));              \
+    }                                                                    \
+  } while (0)
+
+#define PADDLE_INFERSHAPE_ENFORCE_EQ(__CTX, __VAL0, __VAL1, ...) \
+  __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, ==, !=, __VA_ARGS__)
+#define PADDLE_INFERSHAPE_ENFORCE_NE(__CTX, __VAL0, __VAL1, ...) \
+  __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, !=, ==, __VA_ARGS__)
+#define PADDLE_INFERSHAPE_ENFORCE_GT(__CTX, __VAL0, __VAL1, ...) \
+  __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, >, <=, __VA_ARGS__)
+#define PADDLE_INFERSHAPE_ENFORCE_GE(__CTX, __VAL0, __VAL1, ...) \
+  __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, >=, <, __VA_ARGS__)
+#define PADDLE_INFERSHAPE_ENFORCE_LT(__CTX, __VAL0, __VAL1, ...) \
+  __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, <, >=, __VA_ARGS__)
+#define PADDLE_INFERSHAPE_ENFORCE_LE(__CTX, __VAL0, __VAL1, ...) \
+  __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, <=, >, __VA_ARGS__)
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 407d1b1299855712d9877e59ed192c000b001036..bb22628cdfbbb696bd503423f4c3fea0c3845f40 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -31,7 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/piece.h"
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_DGC)
 #include "dgc/dgc.h"
 #endif
 
@@ -211,7 +211,7 @@ void InitGLOG(const std::string &prog_name) {
 #endif
 }
 
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+#if defined(PADDLE_WITH_DGC)
 void InitDGC() {
   std::call_once(dgc_init_flag, []() {
     PADDLE_ENFORCE(paddle::communication::dgc::dynloadNcclLib());
diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc
index a5aa1a4148686b032c52f99497252fde4867438f..07eaf42d2d3bc20e7f7dc56bb0f4e0cc2fbac5e3 100644
--- a/paddle/fluid/platform/lodtensor_printer.cc
+++ b/paddle/fluid/platform/lodtensor_printer.cc
@@ -52,16 +52,26 @@ void PrintVar(framework::Scope* scope, const std::string& var_name,
     return;
   }
 
-#define PrintLoDTensorCallback(cpp_type, proto_type)             \
-  do {                                                           \
-    if (tensor->type() == proto_type) {                          \
-      print_lod_tensor<cpp_type>(var_name, *tensor, print_info); \
-      return;                                                    \
-    }                                                            \
+  framework::LoDTensor printed_tensor;
+  printed_tensor.set_lod(tensor->lod());
+  printed_tensor.Resize(tensor->dims());
+  if (platform::is_cpu_place(tensor->place())) {
+    printed_tensor.ShareDataWith(*tensor);
+  } else {
+    platform::CPUPlace place;
+    framework::TensorCopy(*tensor, place, &printed_tensor);
+  }
+
+#define PrintLoDTensorCallback(cpp_type, proto_type)                    \
+  do {                                                                  \
+    if (tensor->type() == proto_type) {                                 \
+      print_lod_tensor<cpp_type>(var_name, printed_tensor, print_info); \
+      return;                                                           \
+    }                                                                   \
   } while (0)
 
   _ForEachDataType_(PrintLoDTensorCallback);
-  VLOG(1) << "PrintVar: unrecognized data type:" << tensor->type();
+  VLOG(1) << "PrintVar: unrecognized data type:" << printed_tensor.type();
 }
 
 }  // end namespace platform
diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h
index ecaad4ec070fe60a522839e0718c424a441dec0b..ba3a82b4b07f4dcb3f0037e398c146ab167d7b57 100644
--- a/paddle/fluid/platform/mkldnn_reuse.h
+++ b/paddle/fluid/platform/mkldnn_reuse.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <memory>
 #include <string>
 #include <vector>
+#include "boost/optional.hpp"
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/mkldnn_helper.h"
@@ -395,9 +396,28 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
   std::vector<int> logical_axis_;
 };
 
+template <typename T>
+struct convolutional_algorithm;
+
+template <>
+struct convolutional_algorithm<mkldnn::convolution_forward> {
+  static constexpr mkldnn::algorithm T = mkldnn::algorithm::convolution_direct;
+};
+
+template <>
+struct convolutional_algorithm<mkldnn::deconvolution_forward> {
+  static constexpr mkldnn::algorithm T =
+      mkldnn::algorithm::deconvolution_direct;
+};
+
 template <class forward_t, class backward_data_t, class backward_weights_t>
 class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
  public:
+  ConvMKLDNNTemplateHandler(const platform::MKLDNNDeviceContext& dev_ctx,
+                            mkldnn::engine engine, const std::string& base_key)
+      : platform::MKLDNNHandler(dev_ctx, engine, base_key) {}
+
+  // TODO(jczaja): remove after conv int8 is adapted
   ConvMKLDNNTemplateHandler(
       std::shared_ptr<typename forward_t::primitive_desc> conv_pd,
       const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine,
@@ -542,6 +562,73 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
                                scale_data, mask);
   }
 
+  mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
+                                       bool fuse_residual_conn = false) const {
+    mkldnn::primitive_attr conv_attr;
+    mkldnn::post_ops post_operations;
+    // Fusion with Elementwise layer relies on adding a sum post-operation with
+    // the scale parameter. It is assumed that when fuse_residual_connection is
+    // true, the output tensor contains the data coming from residual
+    // connection. The result of this post_op is:
+    // Output = scale * Output + Conv_Out.
+    if (fuse_residual_conn) {
+      post_operations.append_sum(1.0f);
+    }
+    // Fusion with ReLU layer is executed through the PostOps feature. Create a
+    // PostOps object and configure it to execute an eltwise relu operation.
+    if (fuse_relu) {
+      constexpr float scale = 1.0f;
+      constexpr float negative_slope = 0.0f;
+      constexpr float placeholder = 0.0f;
+      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
+                                     negative_slope, placeholder);
+    }
+    conv_attr.set_post_ops(post_operations);
+    return conv_attr;
+  }
+
+  std::shared_ptr<typename forward_t::primitive_desc>
+  AcquireConvolutionPrimitiveDescriptor(
+      const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights,
+      boost::optional<const mkldnn::memory::desc&> bias,
+      const mkldnn::memory::desc& dst, const std::vector<int>& strides,
+      const std::vector<int>& paddings, const mkldnn::engine& engine,
+      const bool fuse_relu, const bool fuse_residual_conn,
+      mkldnn::prop_kind fwd_prop_kind) {
+    const std::string key_conv_pd = key_ + "@conv_pd";
+
+    auto conv_pd = std::static_pointer_cast<typename forward_t::primitive_desc>(
+        dev_ctx_.GetBlob(key_conv_pd));
+
+    if (conv_pd == nullptr) {
+      mkldnn::memory::dims stride_dims = strides;
+      mkldnn::memory::dims padding_dims = paddings;
+
+      auto conv_desc =
+          bias ? typename forward_t::desc(
+                     fwd_prop_kind, convolutional_algorithm<forward_t>::T, src,
+                     weights, *bias, dst, stride_dims, padding_dims,
+                     padding_dims, mkldnn::padding_kind::zero)
+               : typename forward_t::desc(
+                     fwd_prop_kind, convolutional_algorithm<forward_t>::T, src,
+                     weights, dst, stride_dims, padding_dims, padding_dims,
+                     mkldnn::padding_kind::zero);
+
+      mkldnn::primitive_attr conv_attr =
+          CreatePostOps(fuse_relu, fuse_residual_conn);
+
+      conv_pd_.reset(
+          new typename forward_t::primitive_desc(conv_desc, conv_attr, engine));
+      // Save conv_pd/src_memory/weights_memory for backward pass
+      dev_ctx_.SetBlob(key_conv_pd, conv_pd_);
+    } else {
+      conv_pd_ = conv_pd;
+      is_reusing_ = true;
+    }
+
+    return conv_pd_;
+  }
+
   std::shared_ptr<forward_t> AcquireConvolution(
       std::shared_ptr<mkldnn::memory> src_memory_p,
       std::shared_ptr<mkldnn::memory> weights_memory_p,
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 16365c1fd0b0adb914cdfd08e3f6542fca952e06..900c1a0ca690378ab89568e7de503c660cd85b79 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,11 +1,11 @@
-set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper prune
+set(PYBIND_DEPS pybind python proto_desc memory executor async_executor fleet_wrapper nccl_wrapper prune
   feed_fetch_method pass_builder parallel_executor profiler layer scope_pool
   tracer analysis_predictor imperative_profiler nccl_context)
 
 if(WITH_PYTHON)
   list(APPEND PYBIND_DEPS py_func_op)
 endif()
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc fleet_wrapper_py.cc data_set_py.cc imperative.cc ir.cc inference_api.cc)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc reader_py.cc async_executor_py.cc fleet_wrapper_py.cc nccl_wrapper_py.cc data_set_py.cc imperative.cc ir.cc inference_api.cc)
 
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc
index f8ded9f94ecaf3df1e14aead60ae12abcf8c34a9..71eeaf3b53acf98c9f5e43f9acd6d67d42086005 100644
--- a/paddle/fluid/pybind/const_value.cc
+++ b/paddle/fluid/pybind/const_value.cc
@@ -17,6 +17,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_proto_maker.h"
 #include "paddle/fluid/framework/operator.h"
 
+#if defined(PADDLE_WITH_DGC)
+#include "paddle/fluid/framework/details/dgc_const_values.h"
+#include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
+#endif
+
 namespace paddle {
 namespace pybind {
 
@@ -52,6 +57,17 @@ void BindConstValue(pybind11::module* m) {
   op_proto_and_checker_maker.def(
       "kOpCreationCallstackAttrName",
       framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName);
+#if defined(PADDLE_WITH_DGC)
+  auto dgc = m->def_submodule("dgc");
+  dgc.def("kDGCUName", [] { return framework::details::g_dgc_u; });
+  dgc.def("kDGCVName", [] { return framework::details::g_dgc_v; });
+  dgc.def("kDGCKName", [] { return framework::details::g_dgc_k; });
+  dgc.def("kDGCEncodedName", [] { return framework::details::g_dgc_encoded; });
+  dgc.def("kDGCCounterName",
+          [] { return framework::details::g_dgc_counter_name; });
+  dgc.def("kDGCRampUpBeginStepName",
+          [] { return framework::details::g_dgc_rampup_begin_step; });
+#endif
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc
index b773fd03c003e4c5b51f4876e6ac999f9e830ce4..3f171b65ab83de5a0d84d3c29b1e82510bf69716 100644
--- a/paddle/fluid/pybind/data_set_py.cc
+++ b/paddle/fluid/pybind/data_set_py.cc
@@ -50,11 +50,15 @@ void BindDataset(py::module* m) {
       .def("set_filelist", &framework::Dataset::SetFileList)
       .def("set_thread_num", &framework::Dataset::SetThreadNum)
       .def("set_trainer_num", &framework::Dataset::SetTrainerNum)
+      .def("set_fleet_send_batch_size",
+           &framework::Dataset::SetFleetSendBatchSize)
       .def("set_hdfs_config", &framework::Dataset::SetHdfsConfig)
       .def("set_data_feed_desc", &framework::Dataset::SetDataFeedDesc)
       .def("get_filelist", &framework::Dataset::GetFileList)
       .def("get_thread_num", &framework::Dataset::GetThreadNum)
       .def("get_trainer_num", &framework::Dataset::GetTrainerNum)
+      .def("get_fleet_send_batch_size",
+           &framework::Dataset::GetFleetSendBatchSize)
       .def("get_hdfs_config", &framework::Dataset::GetHdfsConfig)
       .def("get_data_feed_desc", &framework::Dataset::GetDataFeedDesc)
       .def("register_client2client_msg_handler",
diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc
index 77f15db8d68da131c892b1a65946c1994b90fd04..2f6a7d2480aedd5bd37d0dbd5ccf64447e4a21ff 100644
--- a/paddle/fluid/pybind/fleet_wrapper_py.cc
+++ b/paddle/fluid/pybind/fleet_wrapper_py.cc
@@ -36,7 +36,6 @@ limitations under the License. */
 #include "paddle/fluid/pybind/fleet_wrapper_py.h"
 
 namespace py = pybind11;
-namespace pd = paddle::framework;
 
 namespace paddle {
 namespace pybind {
diff --git a/paddle/fluid/pybind/nccl_wrapper_py.cc b/paddle/fluid/pybind/nccl_wrapper_py.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bbba03f6660fe9ddb14764709ea81a9a82b1b386
--- /dev/null
+++ b/paddle/fluid/pybind/nccl_wrapper_py.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fcntl.h>
+
+#ifdef _POSIX_C_SOURCE
+#undef _POSIX_C_SOURCE
+#endif
+
+#ifdef _XOPEN_SOURCE
+#undef _XOPEN_SOURCE
+#endif
+
+#include <string>
+#include <vector>
+
+#include "google/protobuf/io/zero_copy_stream_impl.h"
+#include "google/protobuf/text_format.h"
+#include "paddle/fluid/framework/async_executor.h"
+#include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/data_feed.pb.h"
+#include "paddle/fluid/framework/fleet/nccl_wrapper.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/io.h"
+#include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/variant.h"
+#include "paddle/fluid/pybind/nccl_wrapper_py.h"
+
+namespace py = pybind11;
+namespace pd = paddle::framework;
+
+namespace paddle {
+namespace pybind {
+void BindNCCLWrapper(py::module* m) {
+  py::class_<framework::NCCLWrapper>(*m, "Nccl")
+      .def(py::init())
+      .def("init_nccl", &framework::NCCLWrapper::InitNCCL)
+      .def("set_nccl_id", &framework::NCCLWrapper::SetNCCLId)
+      .def("set_rank_info", &framework::NCCLWrapper::SetRankInfo)
+      .def("sync_var", &framework::NCCLWrapper::SyncVar);
+}  // end NCCLWrapper
+}  // end namespace pybind
+}  // end namespace paddle
diff --git a/paddle/fluid/pybind/nccl_wrapper_py.h b/paddle/fluid/pybind/nccl_wrapper_py.h
new file mode 100644
index 0000000000000000000000000000000000000000..683eb4d61e00abf4e7192efb1d102ff73cb9e02e
--- /dev/null
+++ b/paddle/fluid/pybind/nccl_wrapper_py.h
@@ -0,0 +1,28 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+
+void BindNCCLWrapper(py::module* m);
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 8c34e3efe2a07cadde5aa06669fda88be7661db1..6a5f5f60bca1974635730ce746869b95cf4e80ed 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
+#include "paddle/fluid/framework/details/alloc_continuous_space_for_grad_pass.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -57,6 +58,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/inference_api.h"
 #include "paddle/fluid/pybind/ir.h"
+#include "paddle/fluid/pybind/nccl_wrapper_py.h"
 #include "paddle/fluid/pybind/protobuf.h"
 #include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/reader_py.h"
@@ -165,6 +167,11 @@ PYBIND11_MODULE(core, m) {
   // to enable eager deletion mode in unittest.
   m.def("_set_eager_deletion_mode", &paddle::framework::SetEagerDeletionMode);
 
+  m.def("_set_fuse_parameter_group_size",
+        &paddle::framework::details::SetFuseParameterGroupsSize);
+  m.def("_set_fuse_parameter_memory_size",
+        &paddle::framework::details::SetFuseParameterMemorySize);
+
   m.add_object("_cleanup",
                py::capsule([]() { ScopePool::Instance().Clear(); }));
 
@@ -1356,6 +1363,14 @@ All parameter, weight, gradient are variables in Paddle.
           "fuse_all_reduce_ops",
           [](const BuildStrategy &self) { return self.fuse_all_reduce_ops_; },
           [](BuildStrategy &self, bool b) { self.fuse_all_reduce_ops_ = b; })
+      .def_property(
+          "cache_runtime_context",
+          [](const BuildStrategy &self) { return self.cache_runtime_context_; },
+          [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; })
+      .def_property(
+          "cache_expected_kernel",
+          [](const BuildStrategy &self) { return self.cache_expected_kernel_; },
+          [](BuildStrategy &self, bool b) { self.cache_expected_kernel_ = b; })
       .def("_finalize_strategy_and_create_passes",
            [](BuildStrategy &self) -> std::shared_ptr<ir::PassBuilder> {
              return self.CreatePassesFromStrategy(true);
@@ -1391,6 +1406,7 @@ All parameter, weight, gradient are variables in Paddle.
   BindRecordIOWriter(&m);
   BindAsyncExecutor(&m);
   BindFleetWrapper(&m);
+  BindNCCLWrapper(&m);
   BindGraph(&m);
   BindNode(&m);
   BindInferenceApi(&m);
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index fc52c281c4f0de2b05ab2b58aa81cdbf1216e6a7..6f1b01b2d2fdfc0b0570abd1b0694866afd1c645 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -202,6 +202,7 @@ function cmake_gen() {
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
+        -DWITH_HIGH_LEVEL_API_TEST=${WITH_HIGH_LEVEL_API_TEST:-OFF}
         -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR}
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF}
         -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}
@@ -234,6 +235,7 @@ EOF
         -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
         -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
         -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
+        -DWITH_HIGH_LEVEL_API_TEST=${WITH_HIGH_LEVEL_API_TEST:-OFF} \
         -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \
         -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \
         -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\
@@ -291,8 +293,12 @@ function build() {
     Building in /paddle/build ...
     ============================================
 EOF
+    parallel_number=`nproc`
+    if [[ "$1" != "" ]]; then
+      parallel_number=$1
+    fi
     make clean
-    make -j `nproc`
+    make -j ${parallel_number}
     make install -j `nproc`
 }
 
@@ -440,7 +446,8 @@ function assert_api_spec_approvals() {
         BRANCH="develop"
     fi
 
-    API_FILES=("paddle/fluid/API.spec"
+    API_FILES=("CMakeLists.txt"
+               "paddle/fluid/API.spec"
                "paddle/fluid/op_use_default_grad_op_maker.spec"
                "python/paddle/fluid/parallel_executor.py"
                "paddle/fluid/framework/operator.h"
@@ -459,28 +466,33 @@ function assert_api_spec_approvals() {
                "python/paddle/fluid/compiler.py"
                "paddle/fluid/operators/distributed/send_recv.proto.in")
     for API_FILE in ${API_FILES[*]}; do
-      API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true`
+      API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" | grep -v "/CMakeList.txt" || true`
       echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}"
-      if [ ${API_CHANGE} ] && [ "${GIT_PR_ID}" != "" ]; then
+      if [ "${API_CHANGE}" ] && [ "${GIT_PR_ID}" != "" ]; then
           # NOTE: per_page=10000 should be ok for all cases, a PR review > 10000 is not human readable.
-          # approval_user_list: velconia 1979255,panyx0718 2887803,XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,jacquesqiao 3048612,typhoonzero 13348433,shanyi15 35982308. 
-          if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
+          # approval_user_list: velconia 1979255,XiaoguangHu01 46782768,chengduoZH 30176695,Xreki 12538138,luotao1 6836917,sneaxiy 32832641,tensor-tang 21351065,jacquesqiao 3048612,typhoonzero 13348433,shanyi15 35982308. 
+          if [ "${API_FILE}" == "paddle/fluid/API.spec" ];then
             APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-            python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 2887803 35982308 46782768 30176695`
+            python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 35982308 46782768 30176695`
             if [ "${APPROVALS}" == "TRUE" ];then
               APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
               python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 35982308`
             fi
+          elif [ "${API_FILE}" == "CMakeLists.txt" ];then
+            APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
+            python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 6836917 46782768 30176695`
           else
             APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-            python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641`
+            python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641`
           fi
           echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
           if [ "${APPROVALS}" == "FALSE" ]; then
-            if [ "$API_FILE" == "paddle/fluid/API.spec" ];then
-              echo "You must have one RD (panyx0718 or chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}"
+            if [ "${API_FILE}" == "paddle/fluid/API.spec" ];then
+              echo "You must have one RD (chengduoZH or XiaoguangHu01) and one PM (shanyi15) approval for the api change! ${API_FILE}"
+            elif [ "${API_FILE}" == "CMakeLists.txt" ];then
+              echo "You must have one RD (luotao1 or chengduoZH or XiaoguangHu01) approval for the cmakelist change! ${API_FILE}"
             else
-              echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}"
+              echo "You must have one RD (velconia,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}"
             fi
             exit 1
           fi
@@ -490,10 +502,10 @@ function assert_api_spec_approvals() {
     HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true`
     if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then
         APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \
-        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 2887803 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641`
+        python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 1979255 21351065 3048612 13348433 46782768 30176695 12538138 6836917 32832641`
         echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}"
         if [ "${APPROVALS}" == "FALSE" ]; then
-            echo "You must have one RD (velconia,panyx0718,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}"
+            echo "You must have one RD (velconia,XiaoguangHu01,chengduoZH,Xreki,luotao1,sneaxiy,tensor-tang,jacquesqiao,typhoonzero) approval for the api change! ${API_FILE}"
             exit 1
         fi
     fi
@@ -737,9 +749,13 @@ function gen_fluid_lib() {
     Generating fluid library for train and inference ...
     ========================================
 EOF
+    parallel_number=`nproc`
+    if [[ "$1" != "" ]]; then
+      parallel_number=$1
+    fi
     cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON
-    make -j `nproc` fluid_lib_dist
-    make -j `nproc` inference_lib_dist
+    make -j ${parallel_number} fluid_lib_dist
+    make -j ${parallel_number} inference_lib_dist
 }
 
 function tar_fluid_lib() {
@@ -770,11 +786,22 @@ EOF
 
 function main() {
     local CMD=$1
+    local parallel_number=$2
     init
     case $CMD in
+      build_only)
+        cmake_gen ${PYTHON_ABI:-""}
+        build ${parallel_number}
+        ;;
+      build_and_check)
+        cmake_gen ${PYTHON_ABI:-""}
+        build ${parallel_number}
+        assert_api_not_changed ${PYTHON_ABI:-""}
+        assert_api_spec_approvals
+        ;;
       build)
         cmake_gen ${PYTHON_ABI:-""}
-        build
+        build ${parallel_number}
         gen_dockerfile ${PYTHON_ABI:-""}
         ;;
       test)
@@ -797,7 +824,7 @@ function main() {
         ;;
       fluid_inference_lib)
         cmake_gen ${PYTHON_ABI:-""}
-        gen_fluid_lib
+        gen_fluid_lib ${parallel_number}
         tar_fluid_lib
         test_fluid_lib
         ;;
@@ -806,16 +833,16 @@ function main() {
         ;;
       cicheck)
         cmake_gen ${PYTHON_ABI:-""}
-        build
+        build ${parallel_number}
         assert_api_not_changed ${PYTHON_ABI:-""}
         run_test
-        gen_fluid_lib
+        gen_fluid_lib ${parallel_number}
         test_fluid_lib
         assert_api_spec_approvals
         ;;
       cicheck_brpc)
         cmake_gen ${PYTHON_ABI:-""}
-        build
+        build ${parallel_number}
         run_brpc_test
         ;;
       assert_api)
@@ -823,7 +850,7 @@ function main() {
         assert_api_spec_approvals
         ;;
       test_inference)
-        gen_fluid_lib
+        gen_fluid_lib ${parallel_number}
         test_fluid_lib
         ;;
       assert_api_approvals)
@@ -840,7 +867,7 @@ function main() {
         ;;
       cicheck_py35)
         cmake_gen ${PYTHON_ABI:-""}
-        build
+        build ${parallel_number}
         run_test
         assert_api_not_changed ${PYTHON_ABI:-""}
         ;;
@@ -848,7 +875,7 @@ function main() {
         cmake_gen ${PYTHON_ABI:-""}
         ;;
       gen_fluid_lib)
-        gen_fluid_lib
+        gen_fluid_lib ${parallel_number}
         ;;
       test_fluid_lib)
         test_fluid_lib
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 0af883764e157db24e17a1a4ef1bff27f9b39b0f..3dc2b0c895116155f41df3ca66125fff3ede5ead 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -66,6 +66,8 @@ from . import compiler
 from .compiler import *
 from paddle.fluid.layers.math_op_patch import monkey_patch_variable
 from . import install_check
+from .dygraph.nn import *
+from .dygraph.layers import *
 
 Tensor = LoDTensor
 
@@ -173,6 +175,7 @@ def __bootstrap__():
         read_env_flags.append('communicator_thread_pool_size')
         read_env_flags.append('communicator_max_merge_var_num')
         read_env_flags.append('communicator_fake_rpc')
+        read_env_flags.append('communicator_send_wait_times')
         if core.is_compiled_with_brpc():
             read_env_flags.append('max_body_size')
             #set brpc max body size
diff --git a/python/paddle/fluid/contrib/slim/core/compressor.py b/python/paddle/fluid/contrib/slim/core/compressor.py
index 1547b6abbe660b6be7a681a4e270e3080a5dac36..b97508018ac6da47bfdefadd06a6c3788cb7bd77 100644
--- a/python/paddle/fluid/contrib/slim/core/compressor.py
+++ b/python/paddle/fluid/contrib/slim/core/compressor.py
@@ -363,6 +363,9 @@ class Compressor(object):
                             strategies = pickle.load(
                                 strategy_file, encoding='bytes')
 
+                for strategy in strategies:
+                    strategy.restore_from_checkpoint(context)
+
                 if os.path.exists(model_path):
                     exe = SlimGraphExecutor(context.place)
                     with scope_guard(context.scope):
diff --git a/python/paddle/fluid/contrib/slim/core/strategy.py b/python/paddle/fluid/contrib/slim/core/strategy.py
index 28bf24f4e341dd528d2cd25f6fb24543886150d6..f2cd2a2835b1c19a71679d74736a2d9fe7fc724e 100644
--- a/python/paddle/fluid/contrib/slim/core/strategy.py
+++ b/python/paddle/fluid/contrib/slim/core/strategy.py
@@ -46,3 +46,6 @@ class Strategy(object):
 
     def on_compression_end(self, context):
         pass
+
+    def restore_from_checkpoint(self, context):
+        pass
diff --git a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py
index 2fc6b45183164f135ae3ced08c1900ad526add45..d8e08c3ebef50c9808ed818dcf35443dc25f850e 100644
--- a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py
+++ b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py
@@ -38,7 +38,7 @@ class DistillationStrategy(Strategy):
         super(DistillationStrategy, self).__init__(start_epoch, end_epoch)
         self.distillers = distillers
 
-    def on_compression_begin(self, context):
+    def restore_from_checkpoint(self, context):
         # load from checkpoint
         if context.epoch_id > 0:
             if context.epoch_id > self.start_epoch and context.epoch_id < self.end_epoch:
diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
index a22b6da020510838dc82fe7af87ab62db6e874ef..12c1ce98992c32caaa300045c4adc918dd88f427 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quantization_strategy.py
@@ -88,7 +88,7 @@ class QuantizationStrategy(Strategy):
         self.save_out_nodes = save_out_nodes
         self.save_in_nodes = save_in_nodes
 
-    def on_compression_begin(self, context):
+    def restore_from_checkpoint(self, context):
         """
         Restore graph when the compressoin task is inited from checkpoint.
         """
@@ -143,10 +143,9 @@ class QuantizationStrategy(Strategy):
             train_ir_graph.graph).with_data_parallel(
                 loss_name=context.optimize_graph.out_nodes['loss'],
                 build_strategy=build_strategy)
-        # for evaluation. And program compiled from ir graph must be with data parallel.
-        context.eval_graph.compiled_graph = CompiledProgram(
-            test_ir_graph.graph).with_data_parallel(
-                build_strategy=build_strategy)
+
+        context.eval_graph.program = test_ir_graph.to_program()
+
         # for saving inference model after training
         context.put('quantization_test_ir_graph_backup', test_ir_graph)
 
diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt
index a2c59416467e5dbe66f058666633807eb0e45047..73e9ff10f0f0caedb6e2f837a114c5accb316dbb 100644
--- a/python/paddle/fluid/contrib/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt
@@ -7,7 +7,7 @@ endif()
 
 foreach(src ${TEST_OPS})
     if(src MATCHES "test_calibration")
-        py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true)
+        py_test(${src} SRCS ${src}.py ENVS FLAGS_use_mkldnn=true FLAGS_OMP_NUM_THREADS=${CPU_NUM_THREADS_ON_CI})
     else()
         py_test(${src} SRCS ${src}.py)
     endif()
diff --git a/python/paddle/fluid/contrib/tests/test_calibration.py b/python/paddle/fluid/contrib/tests/test_calibration.py
index 00885eb5d6057b4a7738705007a9334da6aea9d0..16de60440245dbe6d73dd499e851dd18a280825f 100644
--- a/python/paddle/fluid/contrib/tests/test_calibration.py
+++ b/python/paddle/fluid/contrib/tests/test_calibration.py
@@ -147,10 +147,11 @@ class TestCalibrationForResnet50(unittest.TestCase):
                                                    self.data_cache_folder)
         os.system(cmd)
 
-        self.batch_size = 1
-        self.sample_iterations = 50
+        self.batch_size = 1 if os.environ.get('DATASET') == 'full' else 50
+        self.sample_iterations = 50 if os.environ.get(
+            'DATASET') == 'full' else 1
         self.infer_iterations = 50000 if os.environ.get(
-            'DATASET') == 'full' else 50
+            'DATASET') == 'full' else 1
 
     def cache_unzipping(self, target_folder, zip_path):
         if not os.path.exists(target_folder):
@@ -279,15 +280,15 @@ class TestCalibrationForResnet50(unittest.TestCase):
     def test_calibration(self):
         self.download_model()
         print("Start FP32 inference for {0} on {1} images ...").format(
-            self.model, self.infer_iterations)
+            self.model, self.infer_iterations * self.batch_size)
         (fp32_throughput, fp32_latency,
          fp32_acc1) = self.run_program(self.model_cache_folder + "/model")
         print("Start INT8 calibration for {0} on {1} images ...").format(
-            self.model, self.sample_iterations)
+            self.model, self.sample_iterations * self.batch_size)
         self.run_program(
             self.model_cache_folder + "/model", True, algo=self.algo)
         print("Start INT8 inference for {0} on {1} images ...").format(
-            self.model, self.infer_iterations)
+            self.model, self.infer_iterations * self.batch_size)
         (int8_throughput, int8_latency,
          int8_acc1) = self.run_program("calibration_out")
         delta_value = fp32_acc1 - int8_acc1
diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py
index d63773223ddc0c155f26a656f19c4ba80f482632..d539940133881d5aa632eae6e3975ae57a385ebf 100644
--- a/python/paddle/fluid/dataset.py
+++ b/python/paddle/fluid/dataset.py
@@ -136,6 +136,7 @@ class DatasetBase(object):
             slot_var.name = var.name
             if var.lod_level == 0:
                 slot_var.is_dense = True
+                slot_var.shape.extend(var.shape)
             if var.dtype == core.VarDesc.VarType.FP32:
                 slot_var.type = "float"
             elif var.dtype == core.VarDesc.VarType.INT64:
@@ -218,6 +219,7 @@ class InMemoryDataset(DatasetBase):
             >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset")
             >>> filelist = ["a.txt", "b.txt"]
             >>> dataset.set_filelist(filelist)
+            >>> dataset.load_into_memory()
             >>> dataset.local_shuffle()
         """
         self.dataset.local_shuffle()
@@ -235,23 +237,45 @@ class InMemoryDataset(DatasetBase):
             >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset")
             >>> filelist = ["a.txt", "b.txt"]
             >>> dataset.set_filelist(filelist)
+            >>> dataset.load_into_memory()
             >>> dataset.global_shuffle(fleet)
 
         Args:
             fleet: fleet singleton. Default None.
         """
         trainer_num = 1
+        fleet_send_batch_size = 80000
         if fleet is not None:
             fleet.fleet_instance.role_maker_._barrier_worker()
             trainer_num = fleet.worker_num()
         self.dataset.register_client2client_msg_handler()
         self.dataset.set_trainer_num(trainer_num)
+        self.dataset.set_fleet_send_batch_size(fleet_send_batch_size)
         if fleet is not None:
             fleet.fleet_instance.role_maker_._barrier_worker()
         self.dataset.global_shuffle()
         if fleet is not None:
             fleet.fleet_instance.role_maker_._barrier_worker()
 
+    def release_memory(self):
+        """
+        Release InMemoryDataset memory data, when data will not be used again.
+
+        Example:
+            >>> import paddle.fluid as fluid
+            >>> import paddle.fluid.incubate.fleet.parameter_server as fleet
+            >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset")
+            >>> filelist = ["a.txt", "b.txt"]
+            >>> dataset.set_filelist(filelist)
+            >>> dataset.load_into_memory()
+            >>> dataset.global_shuffle(fleet)
+            >>> exe = fluid.Executor(fluid.CPUPlace())
+            >>> exe.run(fluid.default_startup_program())
+            >>> exe.train_from_dataset(fluid.default_main_program(), dataset)
+            >>> dataset.release_memory()
+        """
+        self.dataset.release_memory()
+
 
 class QueueDataset(DatasetBase):
     """
diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py
index 7fc72191884020f4cc57c9269b636161635f06d0..0998f779acfea23f3a494a25b43a6fa824b985f1 100644
--- a/python/paddle/fluid/device_worker.py
+++ b/python/paddle/fluid/device_worker.py
@@ -26,8 +26,8 @@ class DeviceWorker(object):
         """
         Init.
         """
-        self.program_ = None
-        self.infer_ = None
+        self._program = None
+        self._infer = None
 
     def _set_infer(self, infer=False):
         """
@@ -36,7 +36,7 @@ class DeviceWorker(object):
         Args:
             infer(bool): whether to do inference
         """
-        self.infer_ = infer
+        self._infer = infer
 
     def _set_fleet_desc(self, fleet_desc):
         """
@@ -45,7 +45,7 @@ class DeviceWorker(object):
         Args:
             fleet_desc(PSParameter): pslib.PSParameter object
         """
-        self.fleet_desc_ = fleet_desc
+        self._fleet_desc = fleet_desc
 
     def _set_program(self, program):
         """
@@ -54,7 +54,7 @@ class DeviceWorker(object):
         Args:
             program(Program): a Program object
         """
-        self.program_ = program
+        self._program = program
 
     def _gen_worker_desc(self, trainer_desc):
         """
@@ -88,7 +88,7 @@ class Hogwild(DeviceWorker):
             trainer_desc(TrainerDesc): a TrainerDesc object
         """
         trainer_desc.device_worker_name = "HogwildWorker"
-        if self.infer_:
+        if self._infer:
             # just ignore feed op for inference model
             trainer_desc.hogwild_param.skip_ops.extend(["feed"])
 
@@ -113,11 +113,11 @@ class DownpourSGD(DeviceWorker):
             trainer_desc(TrainerDesc): a TrainerDesc object
         """
         dense_table_set = set()
-        program_id = str(id(self.program_))
-        if self.program_ == None:
+        program_id = str(id(self._program))
+        if self._program == None:
             print("program of current device worker is not configured")
             exit(-1)
-        opt_info = self.program_._fleet_opt
+        opt_info = self._program._fleet_opt
         program_configs = opt_info["program_configs"]
         downpour = trainer_desc.downpour_param
 
@@ -140,7 +140,7 @@ class DownpourSGD(DeviceWorker):
         trainer_desc.device_worker_name = "DownpourWorker"
         pull_thread = trainer_desc.pull_dense_param
         pull_thread.device_num = trainer_desc.thread_num
-        for i in self.fleet_desc_.trainer_param.dense_table:
+        for i in self._fleet_desc.trainer_param.dense_table:
             if i.table_id in dense_table_set:
                 dense_table = pull_thread.dense_table.add()
                 dense_table.dense_value_name.extend(i.dense_variable_name)
@@ -148,29 +148,29 @@ class DownpourSGD(DeviceWorker):
                     i.table_id
         sparse_table = downpour.sparse_table.add()
         sparse_table.table_id = \
-                    self.fleet_desc_.trainer_param.sparse_table[0].table_id
+                    self._fleet_desc.trainer_param.sparse_table[0].table_id
         sparse_table.sparse_key_name.extend(
-            self.fleet_desc_.trainer_param.sparse_table[0].slot_key)
+            self._fleet_desc.trainer_param.sparse_table[0].slot_key)
         sparse_table.sparse_value_name.extend(
-            self.fleet_desc_.trainer_param.sparse_table[0].slot_value)
+            self._fleet_desc.trainer_param.sparse_table[0].slot_value)
         sparse_table.sparse_grad_name.extend(
-            self.fleet_desc_.trainer_param.sparse_table[0].slot_gradient)
+            self._fleet_desc.trainer_param.sparse_table[0].slot_gradient)
         sparse_table.emb_dim = \
-                    self.fleet_desc_.server_param.downpour_server_param.downpour_table_param[
+                    self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
                         0].accessor.fea_dim - 2
         sparse_table.fea_dim = sparse_table.emb_dim + 2
         # TODO(guru4elephant): hard code here, need to improve
         sparse_table.label_var_name = "click"
 
-        for i in self.fleet_desc_.trainer_param.dense_table:
+        for i in self._fleet_desc.trainer_param.dense_table:
             if i.table_id in dense_table_set:
                 dense_table = downpour.dense_table.add()
                 dense_table.table_id = i.table_id
                 dense_table.dense_value_name.extend(i.dense_variable_name)
                 dense_table.dense_grad_name.extend(
                     i.dense_gradient_variable_name)
-                downpour.skip_ops.extend(self.fleet_desc_.trainer_param.skip_op)
-        if self.infer_:
+                downpour.skip_ops.extend(self._fleet_desc.trainer_param.skip_op)
+        if self._infer:
             downpour.push_dense = False
             downpour.push_sparse = False
 
diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py
index d55dbbb9c72cb887e169849c3a3e32a13c202a7b..bf484b35c7bf9a2b17126789ff247bd73095fe7b 100644
--- a/python/paddle/fluid/dygraph/base.py
+++ b/python/paddle/fluid/dygraph/base.py
@@ -22,7 +22,7 @@ __all__ = ['enabled', 'guard', 'to_variable']
 
 
 def enabled():
-    return framework._in_dygraph_mode()
+    return framework.in_dygraph_mode()
 
 
 @signature_safe_contextmanager
diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py
index f992ae0576c81ed98a3e9f7a446b0c2e808622ea..9d29b531756a25e9aaaf7ad4f3c4ca92de5deec2 100644
--- a/python/paddle/fluid/dygraph/checkpoint.py
+++ b/python/paddle/fluid/dygraph/checkpoint.py
@@ -97,20 +97,12 @@ def load_persistables(vardict, dirname, filename=None):
 
     Examples:
         .. code-block:: python
-            my_layer = layer(fluid.dygraph.Layer)
+            my_layer = layer(fluid.Layer)
             param_path = "./my_paddle_model"
 
             param_dict = fluid.dygraph.load_persistables(my_layer.parameters(), param_path)
             param_1 = param_dict['PtbModel_0.w_1']
 
-            or:
-            my_layer = layer(fluid.dygraph.Layer)
-            param_path = "./my_paddle_model"
-            filename = "model.file"
-            param_dict = fluid.dygraph.load_persistables(my_layer.state_dict(), param_path,
-                                                                       filename=filename)
-            param_1 = param_dict['PtbModel_0.w_1']
-
         """
     if isinstance(vardict, collections.OrderedDict):
         return _load_var_from_file(vardict, dirname, filename)
@@ -121,14 +113,17 @@ def load_persistables(vardict, dirname, filename=None):
 def _save_var_to_file(stat_dict, file_dir, file_name):
     save_block = default_main_program().global_block()
     save_var_map = {}
-    for each_var in stat_dict.items():
+    for var_key, each_var in stat_dict.items():
         save_var_map[each_var.name] = each_var
         if file_name is None:
             save_block.append_op(
                 type='save',
                 inputs={'X': [each_var]},
                 outputs={},
-                attrs={'file_path': os.path.join(file_dir, each_var.name)})
+                attrs={
+                    'file_path': os.path.join(file_dir,
+                                              os.path.normpath(each_var.name))
+                })
 
     if file_name is not None:
         save_var_list = []
@@ -139,14 +134,16 @@ def _save_var_to_file(stat_dict, file_dir, file_name):
             type='save_combine',
             inputs={'X': save_var_list},
             outputs={},
-            attrs={'file_path': os.path.join(file_dir, file_name)})
+            attrs={
+                'file_path': os.path.join(file_dir, os.path.normpath(file_name))
+            })
 
 
 def _load_var_from_file(stat_dict, file_dir, file_name):
     load_block = default_main_program().global_block()
     load_var_map = {}
 
-    for each_var in stat_dict.items():
+    for var_key, each_var in stat_dict.items():
         assert isinstance(each_var, Variable)
         if each_var.type == core.VarDesc.VarType.RAW:
             continue
@@ -156,7 +153,10 @@ def _load_var_from_file(stat_dict, file_dir, file_name):
                 type='load',
                 inputs={},
                 outputs={'Out': [new_var]},
-                attrs={'file_path': os.path.join(file_dir, each_var.name)})
+                attrs={
+                    'file_path': os.path.join(file_dir,
+                                              os.path.normpath(each_var.name))
+                })
 
         load_var_map[new_var.name] = new_var
 
@@ -169,7 +169,9 @@ def _load_var_from_file(stat_dict, file_dir, file_name):
             type='load_combine',
             inputs={},
             outputs={"Out": load_var_list},
-            attrs={'file_path': os.path.join(file_dir, file_name)})
+            attrs={
+                'file_path': os.path.join(file_dir, os.path.normpath(file_name))
+            })
         for res_var in load_var_list:
             load_var_map[res_var.name] = res_var
 
@@ -183,5 +185,5 @@ def _clone_var_in_block_(block, var):
         shape=var.shape,
         dtype=var.dtype,
         type=var.type,
-        lod_level=var.lod_level,
+        lod_level=0,
         persistable=True)
diff --git a/python/paddle/fluid/dygraph/layer_object_helper.py b/python/paddle/fluid/dygraph/layer_object_helper.py
index f0be5ff3bf2394f1f7da8fbcc341a0d2dfacdab3..9fd1e392791f2bf7a19942749eae87001ec3ede8 100644
--- a/python/paddle/fluid/dygraph/layer_object_helper.py
+++ b/python/paddle/fluid/dygraph/layer_object_helper.py
@@ -16,7 +16,7 @@ from __future__ import print_function
 
 import copy
 import six
-from ..framework import Parameter, _in_dygraph_mode
+from ..framework import Parameter, in_dygraph_mode
 from ..param_attr import ParamAttr
 from .. import core
 from six.moves import zip
diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py
index 014ee41f4c5aa280fb5b366d8f1704290cc067d4..fb0604ebc29f5c4878677441506ca40da7713fe7 100644
--- a/python/paddle/fluid/dygraph/layers.py
+++ b/python/paddle/fluid/dygraph/layers.py
@@ -48,6 +48,12 @@ class Layer(core.Layer):
 
         self._helper = LayerObjectHelper(self._full_name)
 
+    def train(self):
+        framework._dygraph_tracer().train_mode()
+
+    def eval(self):
+        framework._dygraph_tracer().eval_mode()
+
     def full_name(self):
         """Full name for this layers.
 
@@ -139,14 +145,14 @@ class Layer(core.Layer):
 
     def clear_gradients(self):
         for p in self.parameters():
-            p._clear_gradient()
+            p.clear_gradient()
 
-    def _build_once(self, *args):
+    def build_once(self, *args):
         pass
 
     def __call__(self, *inputs):
         if not self._built:
-            self._build_once(*inputs)
+            self.build_once(*inputs)
 
         outputs = self.forward(*inputs)
         self._built = True
@@ -240,7 +246,10 @@ class Layer(core.Layer):
     def load_dict(self, stat_dict, include_sublayers=True):
         for name, item in self.__dict__.get('_parameters', None).items():
             if item.name in stat_dict:
-                self.__setattr__(name, stat_dict[item.name])
+                var = item._ivar.value()
+                tensor = var.get_tensor()
+                tensor.set(stat_dict[item.name].numpy(),
+                           framework._current_expected_place())
 
         if include_sublayers:
             for layer_name, layer_item in self._sub_layers.items():
@@ -254,6 +263,12 @@ class PyLayer(core.PyLayer):
     def __init__(self):
         super(PyLayer, self).__init__()
 
+    def train(self):
+        framework._dygraph_tracer().train_mode()
+
+    def eval(self):
+        framework._dygraph_tracer().eval_mode()
+
     @classmethod
     def _do_forward(cls, inputs):
         return cls._to_tuple(cls.forward(inputs))
diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py
index 71abb9e3eca974138fe2d8bedd41e4d58983f80c..0ab981518beb4cc48e18c17e4f0f91c22b60dbb7 100644
--- a/python/paddle/fluid/dygraph/nn.py
+++ b/python/paddle/fluid/dygraph/nn.py
@@ -19,7 +19,7 @@ from six.moves import reduce
 from .. import core
 from ..layers import utils
 from . import layers
-from ..framework import Variable, _in_dygraph_mode, OpProtoHolder, Parameter
+from ..framework import Variable, in_dygraph_mode, OpProtoHolder, Parameter
 from ..param_attr import ParamAttr
 from ..initializer import Normal, Constant, NumpyArrayInitializer
 import numpy as np
@@ -33,6 +33,109 @@ __all__ = [
 
 
 class Conv2D(layers.Layer):
+    """
+    The convolution2D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input and
+    Output are in NCHW format, where N is batch size, C is the number of
+    channels, H is the height of the feature, and W is the width of the feature.
+    Filter is in MCHW format, where M is the number of output image channels,
+    C is the number of input image channels, H is the height of the filter,
+    and W is the width of the filter. If the groups is greater than 1,
+    C will equal the number of input image channels divided by the groups.
+    Please refer to UFLDL's `convolution
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_
+    for more detials.
+    If bias attribution and activation type are provided, bias is added to the
+    output of the convolution, and the corresponding activation function is
+    applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    Where:
+
+    * :math:`X`: Input value, a tensor with NCHW format.
+    * :math:`W`: Filter value, a tensor with MCHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+    Args:
+        input (Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+        stride (int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+        padding (int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding. Default: padding = 0.
+        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
+            contain two integers, (dilation_H, dilation_W). Otherwise, the
+            dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups (int): The groups number of the Conv2d Layer. According to grouped
+            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+            the first half of the filters is only connected to the first half
+            of the input channels, while the second half of the filters is only
+            connected to the second half of the input channels. Default: groups=1.
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
+            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None
+        name (str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically. Default: None
+
+    Returns:
+        Variable: The tensor variable storing the convolution and \
+                  non-linearity activation result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu")
+    """
+
     def __init__(self,
                  name_scope,
                  num_channels,
@@ -265,7 +368,7 @@ class Conv3D(layers.Layer):
         self._param_attr = param_attr
         self._bias_attr = bias_attr
 
-    def _build_once(self, input):
+    def build_once(self, input):
         num_channels = input.shape[1]
         self._dtype = self._helper.input_dtype(input)
 
@@ -332,6 +435,116 @@ class Conv3D(layers.Layer):
 
 
 class Conv3DTranspose(layers.Layer):
+    """
+    **Convlution3D transpose layer**
+
+    The convolution3D transpose layer calculates the output based on the input,
+    filter, and dilations, strides, paddings. Input(Input) and output(Output)
+    are in NCDHW format. Where N is batch size, C is the number of channels,
+    D is the depth of the feature, H is the height of the feature, and W
+    is the width of the feature. Parameters(dilations, strides, paddings) are
+    two elements. These two elements represent height and width, respectively.
+    The details of convolution transpose layer, please refer to the following
+    explanation and references `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
+    If bias attribution and activation type are provided, bias is added to
+    the output of the convolution, and the corresponding activation function
+    is applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    In the above equation:
+
+    * :math:`X`: Input value, a tensor with NCDHW format.
+    * :math:`W`: Filter value, a tensor with MCDHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
+
+    Example:
+
+        - Input:
+
+          Input shape: :math:`(N, C_{in}, D_{in}, H_{in}, W_{in})`
+
+          Filter shape: :math:`(C_{in}, C_{out}, D_f, H_f, W_f)`
+
+        - Output:
+
+          Output shape: :math:`(N, C_{out}, D_{out}, H_{out}, W_{out})`
+
+        Where
+
+        .. math::
+
+           D_{out} &= (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\\\
+           H_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\\\
+           W_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1
+
+    Args:
+        input(Variable): The input image with [N, C, D, H, W] format.
+        num_filters(int): The number of the filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain three integers, (image_D, image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain three integers, (filter_size_D, filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square. None if use output size to
+            calculate filter_size.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain three integers, (padding_D, padding_H, padding_W). Otherwise, the
+            padding_D = padding_H = padding_W = padding. Default: padding = 0.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain three integers, (stride_D, stride_H, stride_W). Otherwise, the
+            stride_D = stride_H = stride_W = stride. Default: stride = 1.
+        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+            contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the
+            dilation_D = dilation_H = dilation_W = dilation. Default: dilation = 1.
+        groups(int): The groups number of the Conv3d transpose layer. Inspired by
+            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
+            when group=2, the first half of the filters is only connected to the
+            first half of the input channels, while the second half of the
+            filters is only connected to the second half of the input channels.
+            Default: groups=1
+        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
+            of conv3d_transpose. If it is set to None or one attribute of ParamAttr, conv3d_transpose
+            will create ParamAttr as param_attr. If the Initializer of the param_attr
+            is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv3d_transpose.
+            If it is set to False, no bias will be added to the output units.
+            If it is set to None or one attribute of ParamAttr, conv3d_transpose
+            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+            is not set, the bias is initialized zero. Default: None.
+        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+            library is installed. Default: True
+        act (str): Activation type, if it is set to None, activation is not appended.
+            Default: None.
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The tensor variable storing the convolution transpose result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+
+    Examples:
+       .. code-block:: python
+
+          conv3d_transpose = nn.Conv3DTranspose(
+                'Conv3DTranspose',
+                num_filters=12,
+                filter_size=12,
+                use_cudnn=False)
+          transpose_res = conv3d_transpose(base.to_variable(input_array))
+    """
+
     def __init__(self,
                  name_scope,
                  num_filters,
@@ -362,7 +575,7 @@ class Conv3DTranspose(layers.Layer):
         self._bias_attr = bias_attr
         self._act = act
 
-    def _build_once(self, input):
+    def build_once(self, input):
         self._dtype = self._helper.input_dtype(input)
         self._input_channel = input.shape[1]
 
@@ -436,6 +649,54 @@ class Conv3DTranspose(layers.Layer):
 
 
 class Pool2D(layers.Layer):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): The input tensor of pooling operator. The format of
+                          input tensor is NCHW, where N is batch size, C is
+                          the number of channels, H is the height of the
+                          feature, and W is the width of the feature.
+        pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list,
+            it must contain two integers, (pool_size_Height, pool_size_Width).
+            Otherwise, the pool kernel size will be a square of an int.
+        pool_type: ${pooling_type_comment}
+        pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list,
+            it must contain two integers, (pool_stride_Height, pool_stride_Width).
+            Otherwise, the pool stride size will be a square of an int.
+        pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple,
+            it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width).
+            Otherwise, the pool padding size will be a square of an int.
+        global_pooling (bool): ${global_pooling_comment}
+        use_cudnn (bool): ${use_cudnn_comment}
+        ceil_mode (bool): ${ceil_mode_comment}
+        name (str|None): A name for this layer(optional). If set None, the
+                        layer will be named automatically.
+        exclusive (bool): Whether to exclude padding points in average pooling
+                          mode, default is true
+
+    Returns:
+        Variable: The pooling result.
+
+    Raises:
+        ValueError: If 'pool_type' is not "max" nor "avg"
+        ValueError: If 'global_pooling' is False and 'pool_size' is -1
+        ValueError: If 'use_cudnn' is not a bool value.
+
+    Examples:
+
+        .. code-block:: python
+
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+          pool2d = fluid.Pool2D("pool2d",pool_size=2,
+                            pool_type='max',
+                            pool_stride=1,
+                            global_pooling=False)
+
+          pool2d_res = pool2d(data)
+    """
+
     def __init__(self,
                  name_scope,
                  pool_size=-1,
@@ -495,6 +756,102 @@ class Pool2D(layers.Layer):
 
 
 class FC(layers.Layer):
+    """
+    **Fully Connected Layer**
+
+    This function creates a fully connected layer in the network. It can take
+    one or multiple tensors as its inputs(input can be a list of Variable, see
+    Args in detail). It creates a variable called weights for each input tensor,
+    which represents a fully connected weight matrix from each input unit to
+    each output unit. The fully connected layer multiplies each input tensor
+    with its corresponding weight to produce an output Tensor with shape [M, `size`],
+    where M is batch size. If multiple input tensors are given, the results of
+    multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
+    is not None, a bias variable will be created and added to the output.
+    Finally, if activation is not None, it will be applied to the output as well.
+
+    When the input is single tensor:
+
+    .. math::
+
+        Out = Act({XW + b})
+
+    When the input are multiple tensors:
+
+    .. math::
+
+        Out = Act({\sum_{i=0}^{N-1}X_iW_i + b})
+
+    In the above equation:
+
+    * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
+    * :math:`X_i`: The i-th input tensor.
+    * :math:`W_i`: The i-th weights matrix corresponding i-th input tensor.
+    * :math:`b`: The bias parameter created by this layer (if needed).
+    * :math:`Act`: The activation function.
+    * :math:`Out`: The output tensor.
+
+    See below for an example.
+
+    .. code-block:: text
+
+        Given:
+            data_1.data = [[[0.1, 0.2],
+                           [0.3, 0.4]]]
+            data_1.shape = (1, 2, 2) # 1 is batch_size
+
+            data_2 = [[[0.1, 0.2, 0.3]]]
+            data_2.shape = (1, 1, 3)
+
+            out = fluid.layers.fc(input=[data_1, data_2], size=2)
+
+        Then:
+            out.data = [[0.18669507, 0.1893476]]
+            out.shape = (1, 2)
+
+    Args:
+        input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of
+            the input tensor(s) is at least 2.
+        size(int): The number of output units in this layer.
+        num_flatten_dims (int, default 1): The fc layer can accept an input tensor with more than
+            two dimensions. If this happens, the multidimensional tensor will first be flattened
+            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
+            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
+            dimensions will be flatten to form the first dimension of the final matrix (height of
+            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
+            form the second dimension of the final matrix (width of the matrix). For example, suppose
+            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
+            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
+        param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable
+            parameters/weights of this layer.
+        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
+            of this layer. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
+        act (str, default None): Activation to be applied to the output of this layer.
+        is_test(bool): A flag indicating whether execution is in test phase.
+        name (str, default None): The name of this layer.
+
+    Returns:
+        Variable: The transformation result.
+
+    Raises:
+        ValueError: If rank of the input tensor is less than 2.
+
+    Examples:
+        .. code-block:: python
+
+          # when input is single tensor
+          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+          fc = fluid.FC("fc", size=1000, act="tanh")
+          fc_res = fc(data)
+
+          # when input are multiple tensors
+          data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32")
+          data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32")
+          fc = fluid.FC("fc", size=1000, act="tanh")
+          fc_res = fc([data_1, data_2])
+    """
+
     def __init__(self,
                  name_scope,
                  size,
@@ -522,7 +879,7 @@ class FC(layers.Layer):
         assert isinstance(value, Parameter)
         self.__w[i] = value
 
-    def _build_once(self, input):
+    def build_once(self, input):
         i = 0
         for inp, param in self._helper.iter_inputs_and_params(input,
                                                               self._param_attr):
@@ -591,6 +948,91 @@ class FC(layers.Layer):
 
 
 class BatchNorm(layers.Layer):
+    """
+    **Batch Normalization Layer**
+
+    Can be used as a normalizer function for conv2d and fully_connected operations.
+    The required data format for this layer is one of the following:
+
+    1. NHWC `[batch, in_height, in_width, in_channels]`
+
+    2. NCHW `[batch, in_channels, in_height, in_width]`
+
+    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
+    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
+    for more details.
+
+    :math:`input` is the input features over a mini-batch.
+
+    ..  math::
+
+        \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\
+        \ mini-batch\ mean \\\\
+        \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\
+        \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
+
+
+    When use_global_stats = True, the :math:`\\mu_{\\beta}`
+    and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch.
+    They are global (or running) statistics. (It usually got from the
+    pre-trained model.)
+    The training and testing (or inference) have the same behavior:
+
+    ..  math::
+
+        \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\
+        \\sigma_{\\beta}^{2} + \\epsilon}}  \\\\
+        y_i &\\gets \\gamma \\hat{x_i} + \\beta
+
+    Args:
+        input(variable): The rank of input variable can be 2, 3, 4, 5.
+        act(string, Default None): Activation type, linear|relu|prelu|...
+        is_test (bool, Default False): A flag indicating whether it is in
+            test phrase or not.
+        momentum(float, Default 0.9): The value used for the moving_mean and
+            moving_var computation. The updated formula is:
+            :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
+            :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
+            Default is 0.9.
+        epsilon(float, Default 1e-05): A value added to the denominator for
+            numerical stability. Default is 1e-5.
+        param_attr(ParamAttr|None): The parameter attribute for Parameter `scale`
+             of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm
+             will create ParamAttr as param_attr. If the Initializer of the param_attr
+             is not set, the parameter is initialized with Xavier. Default: None.
+        bias_attr(ParamAttr|None): The parameter attribute for the bias of batch_norm.
+             If it is set to None or one attribute of ParamAttr, batch_norm
+             will create ParamAttr as bias_attr. If the Initializer of the bias_attr
+             is not set, the bias is initialized zero. Default: None.
+        data_layout(string, default NCHW): NCHW|NHWC
+        in_place(bool, Default False): Make the input and output of batch norm reuse memory.
+        name(string, Default None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+        moving_mean_name(string, Default None): The name of moving_mean which store the global Mean.
+        moving_variance_name(string, Default None): The name of the moving_variance which store the global Variance.
+        do_model_average_for_mean_and_var(bool, Default False): Do model average for mean and variance or not.
+        fuse_with_relu (bool): if True, this OP performs relu after batch norm.
+        use_global_stats(bool, Default False): Whether to use global mean and
+            variance. In inference or test mode, set use_global_stats to true
+            or is_test to true, and the behavior is equivalent.
+            In train mode, when setting use_global_stats True, the global mean
+            and variance are also used during train period.
+
+    Returns:
+        Variable: A tensor variable which is the result after applying batch normalization on the input.
+
+    Examples:
+
+        .. code-block:: python
+            fc = fluid.FC('fc', size=200, param_attr='fc1.w')
+            hidden1 = fc(x)
+            batch_norm = fluid.BatchNorm("batch_norm", 10)
+            hidden2 = batch_norm(hidden1)
+    """
+
     def __init__(self,
                  name_scope,
                  num_channels,
@@ -629,7 +1071,7 @@ class BatchNorm(layers.Layer):
             dtype=self._dtype,
             default_initializer=Constant(1.0))
         if use_global_stats and self._param_attr.learning_rate == 0.:
-            self._scale._stop_gradient = True
+            self._scale.stop_gradient = True
 
         self._bias = self.create_parameter(
             attr=self._param_attr,
@@ -637,7 +1079,7 @@ class BatchNorm(layers.Layer):
             dtype=self._dtype,
             is_bias=True)
         if use_global_stats and self._param_attr.learning_rate == 0.:
-            self._bias._stop_gradient = True
+            self._bias.stop_gradient = True
 
         self._mean = self.create_parameter(
             attr=ParamAttr(
@@ -647,7 +1089,7 @@ class BatchNorm(layers.Layer):
                 do_model_average=do_model_average_for_mean_and_var),
             shape=param_shape,
             dtype=self._dtype)
-        self._mean._stop_gradient = True
+        self._mean.stop_gradient = True
 
         self._variance = self.create_parameter(
             attr=ParamAttr(
@@ -657,7 +1099,7 @@ class BatchNorm(layers.Layer):
                 do_model_average=do_model_average_for_mean_and_var),
             shape=param_shape,
             dtype=self._dtype)
-        self._variance._stop_gradient = True
+        self._variance.stop_gradient = True
 
         self._in_place = in_place
         self._momentum = momentum
@@ -666,7 +1108,7 @@ class BatchNorm(layers.Layer):
         self._fuse_with_relu = fuse_with_relu
         self._use_global_stats = use_global_stats
 
-    def _build_once(self, input):
+    def build_once(self, input):
         pass
 
     def forward(self, input):
@@ -747,7 +1189,7 @@ class Embedding(layers.Layer):
 
           dict_size = len(dataset.ids)
           input = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32')
-          embedding = fluid.dygraph.Embedding(size=[dict_size, 16])
+          embedding = fluid.Embedding(size=[dict_size, 16])
           fc = embedding(input)
     """
 
@@ -797,70 +1239,70 @@ class Embedding(layers.Layer):
 
 
 class LayerNorm(layers.Layer):
-    def __init__(self,
-                 name_scope,
-                 scale=True,
-                 shift=True,
-                 begin_norm_axis=1,
-                 epsilon=1e-05,
-                 param_attr=None,
-                 bias_attr=None,
-                 act=None):
-        """
-        ${comment}
+    """
+    ${comment}
 
-        The formula is as follows:
+    The formula is as follows:
 
-        ..  math::
+    ..  math::
 
-            \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
+        \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} a_i
 
-            \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}
+        \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}(a_i - \\mu)^2}
 
-            h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
+        h & = f(\\frac{g}{\\sigma}(a - \\mu) + b)
 
-        * :math:`a`: the vector representation of the summed inputs to the neurons
-        in that layer.
+    * :math:`a`: the vector representation of the summed inputs to the neurons
+    in that layer.
 
-        * :math:`H`: the number of hidden units in a layers
+    * :math:`H`: the number of hidden units in a layers
 
-        * :math:`g`: the trainable scale parameter.
+    * :math:`g`: the trainable scale parameter.
 
-        * :math:`b`: the trainable bias parameter.
+    * :math:`b`: the trainable bias parameter.
 
-        Args:
-            input(Variable): The input tensor variable.
-            scale(bool): Whether to learn the adaptive gain :math:`g` after
-                normalization. Default True.
-            shift(bool): Whether to learn the adaptive bias :math:`b` after
-                normalization. Default True.
-            begin_norm_axis(int): The normalization will be performed along
-                dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
-                Default 1.
-            epsilon(float): The small value added to the variance to prevent
-                division by zero. Default 1e-05.
-            param_attr(ParamAttr|None): The parameter attribute for the learnable
-                gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
-                omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
-                a default :code:`ParamAttr` would be added as scale. The
-                :attr:`param_attr` is initialized as 1 if it is added. Default None.
-            bias_attr(ParamAttr|None): The parameter attribute for the learnable
-                bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
-                omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
-                a default :code:`ParamAttr` would be added as bias. The
-                :attr:`bias_attr` is initialized as 0 if it is added. Default None.
-            act(str): Activation to be applied to the output of layer normalizaiton.
-                      Default None.
-        Returns:
-            ${y_comment}
+    Args:
+        input(Variable): The input tensor variable.
+        scale(bool): Whether to learn the adaptive gain :math:`g` after
+            normalization. Default True.
+        shift(bool): Whether to learn the adaptive bias :math:`b` after
+            normalization. Default True.
+        begin_norm_axis(int): The normalization will be performed along
+            dimensions from :attr:`begin_norm_axis` to :attr:`rank(input)`.
+            Default 1.
+        epsilon(float): The small value added to the variance to prevent
+            division by zero. Default 1e-05.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            gain :math:`g`. If :attr:`scale` is False, :attr:`param_attr` is
+            omitted. If :attr:`scale` is True and :attr:`param_attr` is None,
+            a default :code:`ParamAttr` would be added as scale. The
+            :attr:`param_attr` is initialized as 1 if it is added. Default None.
+        bias_attr(ParamAttr|None): The parameter attribute for the learnable
+            bias :math:`b`. If :attr:`shift` is False, :attr:`bias_attr` is
+            omitted. If :attr:`shift` is True and :attr:`param_attr` is None,
+            a default :code:`ParamAttr` would be added as bias. The
+            :attr:`bias_attr` is initialized as 0 if it is added. Default None.
+        act(str): Activation to be applied to the output of layer normalizaiton.
+                  Default None.
+    Returns:
+        ${y_comment}
 
-        Examples:
+    Examples:
 
-            >>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
-            >>>                          dtype='float32')
-            >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
-        """
+        >>> data = fluid.layers.data(name='data', shape=[3, 32, 32],
+        >>>                          dtype='float32')
+        >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
+    """
 
+    def __init__(self,
+                 name_scope,
+                 scale=True,
+                 shift=True,
+                 begin_norm_axis=1,
+                 epsilon=1e-05,
+                 param_attr=None,
+                 bias_attr=None,
+                 act=None):
         super(LayerNorm, self).__init__(name_scope)
         self._scale = scale
         self._shift = shift
@@ -870,7 +1312,7 @@ class LayerNorm(layers.Layer):
         self._bias_attr = bias_attr
         self._act = act
 
-    def _build_once(self, input):
+    def build_once(self, input):
         self._dtype = self._helper.input_dtype(input)
         input_shape = input.shape
         param_shape = [
@@ -1232,7 +1674,7 @@ class NCE(layers.Layer):
             'remote_prefetch': remote_prefetch
         }
 
-    def _build_once(self, input, label, sample_weight=None):
+    def build_once(self, input, label, sample_weight=None):
         assert isinstance(input, Variable)
         assert isinstance(label, Variable)
 
@@ -1318,7 +1760,7 @@ class PRelu(layers.Layer):
             raise ValueError('mode should be one of all, channel, element.')
         self._alpha_shape = [1]
 
-    def _build_once(self, input):
+    def build_once(self, input):
         if self._mode == 'channel':
             self._alpha_shape = [1, input.shape[1], 1, 1]
         elif self._mode == 'element':
@@ -1396,7 +1838,7 @@ class BilinearTensorProduct(layers.Layer):
         self._name = name
         self._inputs = dict()
 
-    def _build_once(self, x, y):
+    def build_once(self, x, y):
         self._dtype = self._helper.input_dtype(x)
 
         param_shape = [self._size, x.shape[1], y.shape[1]]
@@ -1572,7 +2014,7 @@ class Conv2DTranspose(layers.Layer):
         self._output_size = output_size
         self._op_type = 'conv2d_transpose'
 
-    def _build_once(self, input):
+    def build_once(self, input):
         input_channel = input.shape[1]
         if (input_channel == self._groups and
                 self._num_filters == input_channel and not self._use_cudnn):
@@ -1686,7 +2128,7 @@ class SequenceConv(layers.Layer):
                  bias_attr=None,
                  param_attr=None,
                  act=None):
-        assert not _in_dygraph_mode(
+        assert not in_dygraph_mode(
         ), "SequenceConv is not supported by dynamic graph mode yet!"
         super(SequenceConv, self).__init__(name_scope)
         self._num_filters = num_filters
@@ -1696,7 +2138,7 @@ class SequenceConv(layers.Layer):
         self._bias_attr = bias_attr
         self._param_attr = param_attr
 
-    def _build_once(self, input):
+    def build_once(self, input):
         self._dtype = self._helper.input_dtype(input)
         filter_shape = [self._filter_size * input.shape[1], self._num_filters]
         self._filter_param = self.create_parameter(
@@ -1726,14 +2168,14 @@ class RowConv(layers.Layer):
                  future_context_size,
                  param_attr=None,
                  act=None):
-        assert not _in_dygraph_mode(
+        assert not in_dygraph_mode(
         ), "RowConv is not supported by dynamic graph mode yet!"
         super(RowConv, self).__init__(name_scope)
         self._act = act
         self._param_attr = param_attr
         self._future_context_size = future_context_size
 
-    def _build_once(self, input):
+    def build_once(self, input):
         self._dtype = self._helper.input_dtype(input)
         filter_shape = [self._future_context_size + 1, input.shape[1]]
         self._filter_param = self.create_parameter(
@@ -1796,7 +2238,7 @@ class GroupNorm(layers.Layer):
         if data_layout != 'NCHW':
             raise ValueError("unsupported data layout:" + data_layout)
 
-    def _build_once(self, input):
+    def build_once(self, input):
         self._dtype = self._helper.input_dtype(input)
         param_shape = [input.shape[1]]
         if self._bias_attr:
@@ -1849,7 +2291,7 @@ class SpectralNorm(layers.Layer):
         self._eps = eps
         self._dim = dim
 
-    def _build_once(self, weight):
+    def build_once(self, weight):
         self._dtype = self._helper.input_dtype(weight)
         input_shape = weight.shape
         h = input_shape[self._dim]
@@ -1904,7 +2346,7 @@ class TreeConv(layers.Layer):
         self._bias_attr = bias_attr
         self._param_attr = param_attr
 
-    def _build_once(self, nodes_vector, edge_set):
+    def build_once(self, nodes_vector, edge_set):
         assert isinstance(nodes_vector, Variable)
         assert isinstance(edge_set, Variable)
         self._dtype = self._helper.input_dtype(nodes_vector)
diff --git a/python/paddle/fluid/dygraph/tracer.py b/python/paddle/fluid/dygraph/tracer.py
index 94e212b139b2b375aa9f5252d396e90235ba33c1..9d2cbb4f03fdc807e1609f46eac44a0bb92af785 100644
--- a/python/paddle/fluid/dygraph/tracer.py
+++ b/python/paddle/fluid/dygraph/tracer.py
@@ -24,7 +24,9 @@ __all__ = ['Tracer']
 
 
 def release_op(op):
-    del framework._dygraph_tracer()._ops[op._trace_id]
+    del framework._dygraph_tracer()._ops[op._trace_id].inputs
+    del framework._dygraph_tracer()._ops[op._trace_id].outputs
+    del framework._dygraph_tracer()._ops[op._trace_id].backward_refs
 
 
 class Tracer(core.Tracer):
@@ -38,6 +40,7 @@ class Tracer(core.Tracer):
         self._ops = defaultdict()
         self._vars = defaultdict()
         self._trace_id = 0
+        self._train_mode = True
 
     def trace_var(self, name, var):
         self._vars[name] = var
@@ -46,15 +49,57 @@ class Tracer(core.Tracer):
         return list((item for name, item in six.iteritems(self._vars)
                      if isinstance(item, framework.Parameter)))
 
-    def trace_op(self, op, stop_gradient=False):
+    def trace_op(self, op, inputs, outputs, stop_gradient=False):
+        # TODO(minqiyang): remove this line after we take apart all
+        # backward grads and forward variables
+        if self._train_mode:
+            op.inputs = inputs
+            inps = defaultdict(list)
+            for k, vars in six.iteritems(inputs):
+                if isinstance(vars, framework.Variable):
+                    inps[k].append(vars._ivar)
+                elif isinstance(vars, list) or isinstance(vars, tuple):
+                    for var in vars:
+                        inps[k].append(var._ivar)
+
+            op.outputs = outputs
+            outs = defaultdict(list)
+            for k, vars in six.iteritems(outputs):
+                if isinstance(vars, framework.Variable):
+                    outs[k].append(vars._ivar)
+                elif isinstance(vars, list) or isinstance(vars, tuple):
+                    for var in vars:
+                        outs[k].append(var._ivar)
+        else:
+            inps = defaultdict(list)
+            for k, vars in six.iteritems(inputs):
+                if isinstance(vars, framework.Variable):
+                    op.previous_ops.append(vars.op)
+                    inps[k].append(vars._ivar)
+                elif isinstance(vars, list) or isinstance(vars, tuple):
+                    for var in vars:
+                        op.previous_ops.append(var.op)
+                        inps[k].append(var._ivar)
+
+            op.outputs = outputs
+            outs = defaultdict(list)
+            for k, vars in six.iteritems(outputs):
+                if isinstance(vars, framework.Variable):
+                    vars.op = op
+                    outs[k].append(vars._ivar)
+                elif isinstance(vars, list) or isinstance(vars, tuple):
+                    for var in vars:
+                        var.op = op
+                        outs[k].append(var._ivar)
+
         # record op's trace id
         op.iop._trace_id = self._trace_id
 
-        backward_refs = self.trace(op.iop, op.inputs, op.outputs, op.attrs,
+        backward_refs = self.trace(op.iop, inps, outs, op.attrs,
                                    framework._current_expected_place(),
                                    stop_gradient)
 
-        if not stop_gradient:
+        if not stop_gradient and self._train_mode:
             self._trace_id += 1
             self._ops[op.iop._trace_id] = op
 
@@ -65,10 +110,16 @@ class Tracer(core.Tracer):
                 # TODO(minqiyang): remove all inputs and outputs after separate
                 # var and grad
                 op.backward_refs = defaultdict(list)
-                for k, v in six.iteritems(op.inputs):
+                for k, v in six.iteritems(inputs):
                     if k in backward_refs:
-                        op.backward_refs[k] = op.inputs[k]
+                        op.backward_refs[k] = inputs[k]
 
-                for k, v in six.iteritems(op.outputs):
+                for k, v in six.iteritems(outputs):
                     if k in backward_refs:
-                        op.backward_refs[k] = op.outputs[k]
+                        op.backward_refs[k] = outputs[k]
+
+    def train_mode(self):
+        self._train_mode = True
+
+    def eval_mode(self):
+        self._train_mode = False
diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py
index e4666deb7fabe3628856269b6c665aacec1e9ee4..fa8b49a021294e8555e979459615b1956d9b2b55 100644
--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -712,10 +712,6 @@ class Executor(object):
         if dataset == None:
             raise RuntimeError("dataset is needed and should be initialized")
 
-        if self.place == paddle.fluid.CUDAPlace():
-            raise RuntimeError("infer_from_dataset is verified on CPUPlace"
-                               "We will open CUDAPlace in the future")
-
         scope, trainer = self._prepare_trainer(
             program=program,
             dataset=dataset,
@@ -796,10 +792,6 @@ class Executor(object):
         if dataset == None:
             raise RuntimeError("dataset is need and should be initialized")
 
-        if self.place == paddle.fluid.CUDAPlace():
-            raise RuntimeError("train_from_dataset is verified on CPUPlace"
-                               "We will open CUDAPlace in the future")
-
         scope, trainer = self._prepare_trainer(
             program=program,
             dataset=dataset,
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 7953d98bcbb826267fa21f6503e55049c8aff5ba..535909d710eb352bfc90ba42cc3b1894491e9ede 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -67,6 +67,7 @@ __all__ = [
     'cuda_places',
     'cpu_places',
     'cuda_pinned_places',
+    'in_dygraph_mode',
 ]
 
 EMPTY_VAR_NAME = core.kEmptyVarName()
@@ -79,7 +80,10 @@ _dygraph_tracer_ = None
 _dygraph_current_expected_place_ = None
 
 
-def _in_dygraph_mode():
+def in_dygraph_mode():
+    '''
+    Returns(bool): True if the program is running in dynamic graph mode
+    '''
     return _dygraph_tracer_ is not None
 
 
@@ -396,7 +400,7 @@ class Variable(object):
             if not isinstance(dtype, core.VarDesc.VarType):
                 dtype = convert_np_dtype_to_dtype_(dtype)
 
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             # record vars in tracer rather than blocks
             self._ivar = kwargs.get("ivar", None)
             if not self._ivar:
@@ -407,6 +411,7 @@ class Variable(object):
                     if persistable else False)
             if persistable:
                 _dygraph_tracer().trace_var(name, self)
+            self.op = None
         else:
             self.error_clip = error_clip
 
@@ -482,21 +487,21 @@ class Variable(object):
 
             self.block.vars[name] = self
             self.op = None
-            self.stop_gradient = stop_gradient
+            self._stop_gradient = stop_gradient
             self.is_data = is_data
 
-    def _numpy(self):
+    def numpy(self):
         new_ivar = self._ivar._copy_to(core.CPUPlace(), True)
         return np.array(new_ivar.value().get_tensor())
 
-    def _backward(self):
+    def backward(self):
         self._ivar._run_backward()
 
-    def _gradient(self):
+    def gradient(self):
         new_ivar = self._ivar._grad_ivar()._copy_to(core.CPUPlace(), True)
         return np.array(new_ivar.value().get_tensor())
 
-    def _clear_gradient(self):
+    def clear_gradient(self):
         self._ivar._clear_gradient()
 
     def __str__(self):
@@ -516,7 +521,7 @@ class Variable(object):
         Returns:
             str: The debug string.
         """
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             # TODO(panyx0718): add more dygraph debug info.
             return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype,
                                                      self.shape)
@@ -535,7 +540,7 @@ class Variable(object):
 
     __repr__ = __str__
 
-    def _set_desc(self, input):
+    def set_desc(self, input):
         """
         Set the variable description.
 
@@ -548,43 +553,43 @@ class Variable(object):
         self.desc = input
 
     @property
-    def _stop_gradient(self):
-        if _in_dygraph_mode():
+    def stop_gradient(self):
+        if in_dygraph_mode():
             return self._ivar.stop_gradient
         else:
-            return self.stop_gradient
+            return self._stop_gradient
 
-    @_stop_gradient.setter
-    def _stop_gradient(self, s):
-        if _in_dygraph_mode():
+    @stop_gradient.setter
+    def stop_gradient(self, s):
+        if in_dygraph_mode():
             self._ivar.stop_gradient = s
         else:
-            self.stop_gradient = s
+            self._stop_gradient = s
 
     @property
     def persistable(self):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             return self._ivar.persistable
         else:
             return self.desc.persistable()
 
     @persistable.setter
     def persistable(self, p):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             return self._ivar.persistable
         else:
             self.desc.set_persistable(p)
 
     @property
     def name(self):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             return self._ivar.name
         else:
             return cpt.to_text(self.desc.name())
 
     @name.setter
     def name(self, new_name):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             self._ivar.name = new_name
         else:
             self.desc.set_name(new_name)
@@ -592,14 +597,14 @@ class Variable(object):
     @property
     def shape(self):
         # convert to tuple, make it as same as numpy API.
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             return self._ivar.shape
         else:
             return tuple(self.desc.shape())
 
     @property
     def dtype(self):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             return self._ivar.dtype
         else:
             return self.desc.dtype()
@@ -611,7 +616,7 @@ class Variable(object):
 
     @property
     def type(self):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             return self._ivar.dtype
         else:
             return self.desc.type()
@@ -721,7 +726,7 @@ class Variable(object):
                 name=unique_name.generate(".".join(self.name)),
                 dtype=self.dtype,
                 persistable=self.persistable,
-                stop_gradient=self._stop_gradient, )
+                stop_gradient=self.stop_gradient, )
         else:
             return self
 
@@ -930,29 +935,12 @@ class Operator(object):
                  inputs=None,
                  outputs=None,
                  attrs=None):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             if type is None:
                 raise ValueError(
                     "`type` to initialized an Operator can not be None.")
             self.iop = core.OpBase(type)
-
-            # TODO(minqiyang): remove these lines after we take apart all
-            # backward grads and forward variables
-            self.inputs = defaultdict(list)
-            if inputs is not None:
-                for k, v in six.iteritems(inputs):
-                    if isinstance(v, Variable):
-                        self.inputs[k].append(v._ivar)
-                    elif isinstance(v, list) or isinstance(v, tuple):
-                        self.inputs[k].extend([var._ivar for var in v])
-
-            self.outputs = defaultdict(list)
-            if outputs is not None:
-                for k, v in six.iteritems(outputs):
-                    if isinstance(v, Variable):
-                        self.outputs[k].append(v._ivar)
-                    elif isinstance(v, list) or isinstance(v, tuple):
-                        self.outputs[k].extend([var._ivar for var in v])
+            self.previous_ops = []
 
             self.attrs = attrs if attrs else {}
         else:
@@ -1049,7 +1037,7 @@ class Operator(object):
                     for arg in out_args:
                         out_arg_names.append(cpt.to_text(arg.name))
                         # TODO(minqiyang): could we remove variable's op in static mode?
-                        if not _in_dygraph_mode():
+                        if not in_dygraph_mode():
                             arg.op = self
                     self.desc.set_output(out_proto.name, out_arg_names)
 
@@ -1095,7 +1083,7 @@ class Operator(object):
 
     @property
     def type(self):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             return self.iop.type
         else:
             return self.desc.type()
@@ -1638,20 +1626,23 @@ class Block(object):
         Returns:
             Operator: the append Operator.
         """
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             op = Operator(
                 block=self,
                 desc=None,
                 type=kwargs.get("type", None),
-                inputs=kwargs.get("inputs", None),
-                outputs=kwargs.get("outputs", None),
-                attrs=kwargs.get("attrs", None))
+                inputs=None,
+                outputs=None,
+                attrs=kwargs.get("attrs", {}))
 
             # record ops in tracer rather than blocks
             #
             # TODO(minqiyang): add op stop_gradient support in static mode too.
             # currently, we only support stop_gradient in dygraph mode.
-            _dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False))
+            _dygraph_tracer().trace_op(op,
+                                       kwargs.get("inputs", {}),
+                                       kwargs.get("outputs", {}),
+                                       kwargs.get("stop_gradient", False))
         else:
             op_desc = self.desc.append_op()
             op = Operator(
@@ -1710,15 +1701,19 @@ class Block(object):
         return self.ops[start:end]
 
     def _prepend_op(self, *args, **kwargs):
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             op = Operator(
                 self,
                 None,
                 type=kwargs.get("type", None),
-                inputs=kwargs.get("inputs", None),
-                outputs=kwargs.get("outputs", None),
-                attrs=kwargs.get("attrs", None))
-            _dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False))
+                inputs=None,
+                outputs=None,
+                attrs=kwargs.get("attrs", {}))
+
+            _dygraph_tracer().trace_op(op,
+                                       kwargs.get("inputs", {}),
+                                       kwargs.get("outputs", {}),
+                                       kwargs.get("stop_gradient", False))
         else:
             op_desc = self.desc._prepend_op()
             op = Operator(
diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py
index 528f7b3269eb90435d88cffadfa185cc664e430a..ffc7ae0172e26191264625d0a8bdd28dab69c833 100644
--- a/python/paddle/fluid/incubate/fleet/base/role_maker.py
+++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py
@@ -23,10 +23,10 @@ class RoleMakerBase(object):
     """
 
     def __init__(self):
-        self.role_maker_name_ = ""
-        self.trainer_endpoints_ = []
-        self.pserver_endpoints_ = []
-        self.role_is_generated_ = False
+        self._role_maker_name = ""
+        self._trainer_endpoints = []
+        self._pserver_endpoints = []
+        self._role_is_generated = False
 
     def _is_worker(self):
         """
@@ -45,20 +45,20 @@ class RoleMakerBase(object):
         return get local ip
         """
         import socket
-        self.ip_ = socket.gethostbyname(socket.gethostname())
-        return self.ip_
+        self._ip = socket.gethostbyname(socket.gethostname())
+        return self._ip
 
     def _get_trainer_endpoints(self):
         """
         return trainer endpoints
         """
-        return self.trainer_endpoints_
+        return self._trainer_endpoints
 
     def _get_pserver_endpoints(self):
         """
         return pserver endpoints
         """
-        return self.pserver_endpoints_
+        return self._pserver_endpoints
 
     def _generate_role(self):
         """
@@ -76,59 +76,59 @@ class MPIRoleMaker(RoleMakerBase):
     def __init__(self):
         super(MPIRoleMaker, self).__init__()
         from mpi4py import MPI
-        self.comm_ = MPI.COMM_WORLD
+        self._comm = MPI.COMM_WORLD
         self.MPI = MPI
-        self.ips_ = None
+        self._ips = None
 
     def _get_rank(self):
         """
         return rank
         """
-        self.rank_ = self.comm_.Get_rank()
-        return self.rank_
+        self._rank = self._comm.Get_rank()
+        return self._rank
 
     def _get_size(self):
         """
         return size
         """
-        self.size_ = self.comm_.Get_size()
-        return self.size_
+        self._size = self._comm.Get_size()
+        return self._size
 
     def _all_gather(self, obj):
         """
         all_gather(obj) will call MPI's allgather function
         """
         self._barrier_all()
-        return self.comm_.allgather(obj)
+        return self._comm.allgather(obj)
 
     def _worker_gather(self, obj):
         """
         worker_gather(obj) will call MPI's allgather function
         """
         if self._is_worker():
-            self.node_type_comm_.barrier()
-            return self.node_type_comm_.allgather(obj)
+            self._node_type_comm.barrier()
+            return self._node_type_comm.allgather(obj)
         return None
 
     def _barrier_all(self):
         """
         barrier_all() will call MPI's barrier_all function
         """
-        self.comm_.barrier()
+        self._comm.barrier()
 
     def _get_ips(self):
         """
         collect current distributed job's ip list
         """
-        if self.ips_ == None:
-            self.ips_ = self.comm_.allgather(self._get_local_ip())
-        return self.ips_
+        if self._ips == None:
+            self._ips = self._comm.allgather(self._get_local_ip())
+        return self._ips
 
     def _finalize(self):
         """
         finalize the current MPI instance.
         """
-        self.comm_.finalize()
+        pass
 
 
 class MPISymetricRoleMaker(MPIRoleMaker):
@@ -140,11 +140,11 @@ class MPISymetricRoleMaker(MPIRoleMaker):
 
     def __init__(self):
         super(MPISymetricRoleMaker, self).__init__()
-        self.node_type_ = None
-        self.proc_per_node_ = 2
+        self._node_type = None
+        self._proc_per_node = 2
 
     def _check_role_generation(self):
-        if not self.role_is_generated_:
+        if not self._role_is_generated:
             sys.stderr.write("generate_role() should be called first")
             sys.exit(-1)
             return False
@@ -163,7 +163,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         return whether current process is worker assigned by role maker
         """
         if self._check_role_generation():
-            return self.node_type_ == 1
+            return self._node_type == 1
         return False
 
     def _is_server(self):
@@ -171,7 +171,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         return whether current process is server assigned by role maker
         """
         if self._check_role_generation():
-            return self.node_type_ == 0
+            return self._node_type == 0
         return False
 
     def _worker_num(self):
@@ -197,7 +197,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         return the index of worker
         """
         if self._check_role_generation():
-            return self.rank_ / self.proc_per_node_
+            return self._rank / self._proc_per_node
         return 0
 
     def _server_index(self):
@@ -205,7 +205,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         return the index of server
         """
         if self._check_role_generation():
-            return self.rank_ / self.proc_per_node_
+            return self._rank / self._proc_per_node
         return 0
 
     def _barrier_worker(self):
@@ -214,7 +214,7 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         """
         if self._check_role_generation():
             if self._is_worker():
-                self.node_type_comm_.barrier()
+                self._node_type_comm.barrier()
 
     def _barrier_server(self):
         """
@@ -222,20 +222,20 @@ class MPISymetricRoleMaker(MPIRoleMaker):
         """
         if self._check_role_generation():
             if self._is_server():
-                self.node_type_comm_.barrier()
+                self._node_type_comm.barrier()
 
     def _generate_role(self):
         """
         generate currently process's role
         """
-        if not self.role_is_generated_:
+        if not self._role_is_generated:
             # TODO(guru4elephant): only allow to be called once
-            self.trainer_endpoints_ = self._get_ips()
-            self.pserver_endpoints_ = self._get_ips()
+            self._trainer_endpoints = self._get_ips()
+            self._pserver_endpoints = self._get_ips()
 
-            if 0 == self._get_rank() % self.proc_per_node_ % 2:
-                self.node_type_ = 0
+            if 0 == self._get_rank() % self._proc_per_node % 2:
+                self._node_type = 0
             else:
-                self.node_type_ = 1
-            self.node_type_comm_ = self.comm_.Split(self.node_type_)
-            self.role_is_generated_ = True
+                self._node_type = 1
+            self._node_type_comm = self._comm.Split(self._node_type)
+            self._role_is_generated = True
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py
index 044aa33c2b5b572aa40169e8c57936b105ba0121..4a7665b9bced9df5f3fb8a82bbcfd7a8feb6a24a 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py
@@ -64,9 +64,9 @@ class Fleet(object):
 
     def __init__(self):
         self._opt_info = None  # for fleet only
-        self.role_maker_ = None
-        self.local_ip_ = 0
-        self.is_initialized_ = False
+        self._role_maker = None
+        self._local_ip = 0
+        self._is_initialized = False
 
     def init(self):
         # TODO(guru4elephant)
@@ -78,22 +78,22 @@ class Fleet(object):
             current node's role, e.g. worker, server, etc.
         """
         if not self.is_initialized_:
-            self.role_maker_ = MPISymetricRoleMaker()
-            self.role_maker_._generate_role()
+            self._role_maker = MPISymetricRoleMaker()
+            self._role_maker._generate_role()
             self._fleet_ptr = fluid.core.Fleet()
-            self.is_initialized_ = True
+            self._is_initialized = True
 
     def stop(self):
         """
         stop(): will be called after a user finishes his/her training task. Fleet instance will be
             destroyed when stop() is called.
         """
-        self.role_maker_._barrier_worker()
-        if self.role_maker_._is_first_worker():
+        self._role_maker._barrier_worker()
+        if self._role_maker._is_first_worker():
             self._fleet_ptr.stop_server()
-        self.role_maker_._barrier_worker()
-        self.role_maker_._barrier_all()
-        self.role_maker_._finalize()
+        self._role_maker._barrier_worker()
+        self._role_maker._barrier_all()
+        self._role_maker._finalize()
 
     def init_pserver(self):
         """
@@ -110,31 +110,38 @@ class Fleet(object):
                 sys.exit(-1)
             self._fleet_ptr.init_server(self._dist_desc_str,
                                         self.role_maker_._get_rank())
-            self.local_ip_ = self._fleet_ptr.run_server()
+            self._local_ip = self._fleet_ptr.run_server()
             # barrier_all for init_server
-            self.role_maker_._barrier_all()
-            self.all_ips_ = self.role_maker_._all_gather(self.local_ip_)
+            self._role_maker._barrier_all()
+            self._all_ips = self._role_maker._all_gather(self.local_ip_)
 
-            self._fleet_ptr.gather_servers(self.all_ips_,
-                                           self.role_maker_._get_size())
+            self._fleet_ptr.gather_servers(self._all_ips,
+                                           self._role_maker._get_size())
             # barrier_all for init_worker, wait all workers start
-            self.role_maker_._barrier_all()
+            self._role_maker._barrier_all()
         else:
             print("You should run DistributedOptimizer.minimize() first")
             sys.exit(-1)
 
-    def init_worker(self, programs):
+    def init_worker(self, programs, scopes=None):
         """
         init_worker(): will be called by user. When a user knows current process is_server(), he/she
                     should call init_worker() to initialize global information about worker and connect
-                    worker with pserver.
+                    worker with pserver. You should run startup program before init_worker.
 
         Args:
             programs(Program|list): a Program or a list of Programs
-
+            scopes(Scope|list): a Scope or  a list of Scopes, default None.
         """
         if not isinstance(programs, list):
             programs = [programs]
+        if scopes is None:
+            scopes = [fluid.global_scope()] * len(programs)
+        if len(scopes) != len(programs):
+            print(
+                "You should make sure len(scopes) == len(programs) or set scopes None"
+            )
+            sys.exit(-1)
         if self._opt_info:
             if "fleet_desc" in self._opt_info:
                 self._dist_desc_str = text_format.MessageToString(
@@ -144,23 +151,23 @@ class Fleet(object):
                 print("You should run DistributedOptimizer.minimize() first")
                 sys.exit(-1)
             # barrier_all for init_server, wait for server starts
-            self.role_maker_._barrier_all()
-            self.all_ips_ = self.role_maker_._all_gather(self.local_ip_)
-            self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_,
-                                        self.role_maker_._get_size(),
-                                        self.role_maker_._get_rank())
+            self._role_maker._barrier_all()
+            self._all_ips = self._role_maker._all_gather(self.local_ip_)
+            self._fleet_ptr.init_worker(self._dist_desc_str, self._all_ips,
+                                        self._role_maker._get_size(),
+                                        self._role_maker._get_rank())
             # barrier_all for init_worker
-            self.role_maker_._barrier_all()
+            self._role_maker._barrier_all()
             # prepare for client to client communication
             info = self._fleet_ptr.get_clients_info()
-            all_info = self.role_maker_._worker_gather(info[0])
+            all_info = self._role_maker._worker_gather(info[0])
             self._fleet_ptr.gather_clients(all_info)
             self._fleet_ptr.create_client2client_connection()
             # barrier for init model
-            self.role_maker_._barrier_worker()
-            if self.role_maker_._is_first_worker():
+            self._role_maker._barrier_worker()
+            if self._role_maker._is_first_worker():
                 tables = self._dist_desc.trainer_param.dense_table
-                for prog in programs:
+                for prog, scope in zip(programs, scopes):
                     prog_id = str(id(prog))
                     prog_conf = self._opt_info['program_configs'][prog_id]
                     prog_tables = {}
@@ -174,12 +181,18 @@ class Fleet(object):
                             continue
                         var_name_list = []
                         for i in range(0, len(table.dense_variable_name)):
-                            var_name_list.append(table.dense_variable_name[i])
-                    self._fleet_ptr.init_model(prog.desc,
-                                               int(table.table_id),
-                                               var_name_list)
+                            var_name = table.dense_variable_name[i]
+                            if scope.find_var(var_name) is None:
+                                print("var " + var_name +
+                                      " not found in scope, " +
+                                      "you should run startup program first")
+                                sys.exit(-1)
+                            var_name_list.append(var_name)
+                        self._fleet_ptr.init_model(scope,
+                                                   int(table.table_id),
+                                                   var_name_list)
             # barrier for init model done
-            self.role_maker_._barrier_worker()
+            self._role_maker._barrier_worker()
         else:
             print("You should run DistributedOptimizer.minimize() first")
             sys.exit(-1)
@@ -188,39 +201,39 @@ class Fleet(object):
         """
         return the number of current job's worker num
         """
-        return self.role_maker_._worker_num()
+        return self._role_maker._worker_num()
 
     def get_server_num(self):
         """
         return the number of current job's server num
         """
-        return self.role_maker_._server_num()
+        return self._role_maker._server_num()
 
     def get_worker_index(self):
         """
         return the mpi rank of current worker
         """
-        return self.role_maker_._worker_index()
+        return self._role_maker._worker_index()
 
     def is_worker(self):
         """
         return whether current node is a worker
         """
-        return self.role_maker_._is_worker()
+        return self._role_maker._is_worker()
 
     def is_server(self):
         """
         return whether current node is pserver
         """
-        return self.role_maker_._is_server()
+        return self._role_maker._is_server()
 
     def init_pserver_model(self):
         """
         init pserver model called from pserver
         """
-        if self.role_maker_._is_first_worker():
+        if self._role_maker._is_first_worker():
             self._fleet_ptr.init_model()
-        self.role_maker_._barrier_worker()
+        self._role_maker._barrier_worker()
 
     def save_pserver_model(self, save_path):
         """
@@ -228,6 +241,40 @@ class Fleet(object):
         """
         self._fleet_ptr.save_model(save_path)
 
+    def split_filelist(self, filelist):
+        """
+        split filelist before distributed training,
+        for example, filelist is [a, b, c ,d, e]  and trainer_num = 2,
+        then trainer 0 gets [a, b, c] and trainer 1 gets [d, e]
+
+        Example:
+            >>> all_filelist = ["a.txt", "b.txt", "c.txt"]
+            >>> my_filelist = fleet.split_filelist(all_filelist)
+            >>> dataset = fluid.DatasetFactory().create_dataset()
+            >>> dataset.set_filelist(my_filelist)
+
+        Args:
+            filelist(list): list of filename, can be local or hdfs/afs.
+
+        Returns:
+            list of filename which belongs to this trainer.
+        """
+        file_num = len(filelist)
+        trainer_id = self.get_worker_index()
+        trainer_num = self.get_worker_num()
+        if trainer_num > file_num:
+            raise ValueError("trainer_num should be <= file_num : "
+                             "%s > %s" % (trainer_num, file_num))
+        # get interval of filelist, it's [ )
+        start = 0
+        end = 0
+        for i in range(0, trainer_id + 1):
+            length = file_num / trainer_num + (i < (file_num % trainer_num))
+            start = end
+            end += length
+        my_filelist = filelist[start:end]
+        return my_filelist
+
     def _set_opt_info(self, opt_info):
         """
         this function saves the result from DistributedOptimizer.minimize()
@@ -324,3 +371,4 @@ save_pserver_model = fleet_instance.save_pserver_model
 worker_num = fleet_instance.get_worker_num
 server_num = fleet_instance.get_server_num
 worker_index = fleet_instance.get_worker_index
+split_filelist = fleet_instance.split_filelist
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/node.py
index 60035b6e8da3e40158f8be0bafdd911f6bd6f543..641c294c4a6edeb3d9823b4152b0ea158c8faa80 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/node.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/node.py
@@ -42,13 +42,13 @@ class DownpourServer(Server):
     """
 
     def __init__(self):
-        self.server_ = pslib.ServerParameter()
-        self.server_.downpour_server_param.service_param.start_server_port = 0
-        self.server_.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer"
-        self.server_.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient"
-        self.server_.downpour_server_param.service_param.service_class = "DownpourPsService"
-        self.server_.downpour_server_param.service_param.start_server_port = 0
-        self.server_.downpour_server_param.service_param.server_thread_num = 12
+        self._server = pslib.ServerParameter()
+        self._server.downpour_server_param.service_param.start_server_port = 0
+        self._server.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer"
+        self._server.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient"
+        self._server.downpour_server_param.service_param.service_class = "DownpourPsService"
+        self._server.downpour_server_param.service_param.start_server_port = 0
+        self._server.downpour_server_param.service_param.server_thread_num = 12
 
     def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
                          slot_value_var):
@@ -62,7 +62,7 @@ class DownpourServer(Server):
         Returns:
             return None 
         """
-        table = self.server_.downpour_server_param.downpour_table_param.add()
+        table = self._server.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
         table.table_class = "DownpourSparseTable"
         table.type = pslib.PS_SPARSE_TABLE
@@ -123,7 +123,7 @@ class DownpourServer(Server):
         Returns:
             return None 
         """
-        table = self.server_.downpour_server_param.downpour_table_param.add()
+        table = self._server.downpour_server_param.downpour_table_param.add()
         table.table_id = table_id
         table.table_class = "DownpourDenseTable"
         table.type = pslib.PS_DENSE_TABLE
@@ -140,7 +140,7 @@ class DownpourServer(Server):
         """
         Return downpour server program_desc
         """
-        return self.server_
+        return self._server
 
 
 class DownpourWorker(Worker):
@@ -155,7 +155,7 @@ class DownpourWorker(Worker):
 
     def __init__(self, window):
         self.window = window
-        self.worker_ = pslib.DownpourTrainerParameter()
+        self._worker = pslib.DownpourTrainerParameter()
 
     def add_sparse_table(self, table_id, learning_rate, slot_key_vars,
                          slot_value_vars):
@@ -187,7 +187,7 @@ class DownpourWorker(Worker):
         Returns:
             return None 
         """
-        table = self.worker_.dense_table.add()
+        table = self._worker.dense_table.add()
         table.table_id = table_id
         table.dense_variable_name.extend(
             filter(lambda x: x.find("embedding") == -1,
@@ -200,4 +200,4 @@ class DownpourWorker(Worker):
         """
         Return downpour worker program_desc
         """
-        return self.worker_
+        return self._worker
diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py
index 94f79e77e72bfa2d0a09502722ef36d474b610b2..ba1f2c8f6ba43bcdb8d4240e33210370e5a454f6 100644
--- a/python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py
+++ b/python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py
@@ -24,9 +24,9 @@ from .node import DownpourWorker, DownpourServer
 
 class DistributedOptimizerImplBase(object):
     def __init__(self, optimizer):
-        self.optimizer_ = optimizer
-        self.learning_rate_ = optimizer._learning_rate
-        self.regularization_ = optimizer.regularization
+        self._optimizer = optimizer
+        self._learning_rate = optimizer._learning_rate
+        self._regularization = optimizer.regularization
 
     def minimize(self,
                  losses,
@@ -41,7 +41,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
         # todo(guru4elephant): add more optimizers here as argument
         # todo(guru4elephant): make learning_rate as a variable
         super(DistributedAdam, self).__init__(optimizer)
-        self.window_ = 1
+        self._window = 1
         self.type = "downpour"
         self.data_norm_name = [
             ".batch_size", ".batch_square_sum", ".batch_sum",
@@ -79,9 +79,9 @@ class DistributedAdam(DistributedOptimizerImplBase):
         server = DownpourServer()
         worker = DownpourWorker(self.window_)
         sparse_table_index = 0
-        server.add_sparse_table(sparse_table_index, self.learning_rate_,
+        server.add_sparse_table(sparse_table_index, self._learning_rate,
                                 prefetch_slots, prefetch_slots_emb)
-        worker.add_sparse_table(sparse_table_index, self.learning_rate_,
+        worker.add_sparse_table(sparse_table_index, self._learning_rate,
                                 prefetch_slots, prefetch_slots_emb)
         dense_table_index = 1
         program_configs = {}
@@ -124,9 +124,9 @@ class DistributedAdam(DistributedOptimizerImplBase):
                         data_norm_grads.append(i[1])
                 if not is_data_norm_data:
                     grads.append(i[1])
-            server.add_dense_table(dense_table_index, self.learning_rate_,
+            server.add_dense_table(dense_table_index, self._learning_rate,
                                    params, grads)
-            worker.add_dense_table(dense_table_index, self.learning_rate_,
+            worker.add_dense_table(dense_table_index, self._learning_rate,
                                    params, grads)
             program_configs[program_id]["pull_dense"] = [dense_table_index]
             program_configs[program_id]["push_dense"] = [dense_table_index]
@@ -135,9 +135,9 @@ class DistributedAdam(DistributedOptimizerImplBase):
             if len(data_norm_params) != 0 and len(data_norm_grads) != 0:
                 dense_table_index += 1
                 server.add_data_norm_table(dense_table_index,
-                                           self.learning_rate_,
+                                           self._learning_rate,
                                            data_norm_params, data_norm_grads)
-                worker.add_dense_table(dense_table_index, self.learning_rate_,
+                worker.add_dense_table(dense_table_index, self._learning_rate,
                                        data_norm_params, data_norm_grads)
                 #program_config.pull_dense_table_id.extend([dense_table_index])
                 #program_config.push_dense_table_id.extend([dense_table_index])
diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py
index 6aff93dceaf5cfd299bdc9f68246ed579f248f3c..da2591b98058a2283275cc222194e89240e87ae1 100644
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -165,7 +165,7 @@ class ConstantInitializer(Initializer):
                 'force_cpu': self._force_cpu or force_init_on_cpu()
             },
             stop_gradient=True)
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
             var.op = op
         return op
 
@@ -245,7 +245,7 @@ class UniformInitializer(Initializer):
                 attrs={"in_dtype": out_var.dtype,
                        "out_dtype": var.dtype})
 
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
             var.op = op
         return op
 
@@ -324,7 +324,7 @@ class NormalInitializer(Initializer):
                 outputs={"Out": var},
                 attrs={"in_dtype": out_var.dtype,
                        "out_dtype": var.dtype})
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
             var.op = op
         return op
 
@@ -403,7 +403,7 @@ class TruncatedNormalInitializer(Initializer):
                 outputs={"Out": var},
                 attrs={"in_dtype": out_var.dtype,
                        "out_dtype": var.dtype})
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
             var.op = op
         return op
 
@@ -509,7 +509,7 @@ class XavierInitializer(Initializer):
                     "seed": self._seed
                 },
                 stop_gradient=True)
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
             var.op = op
         return op
 
@@ -610,7 +610,7 @@ class MSRAInitializer(Initializer):
                     "seed": self._seed
                 },
                 stop_gradient=True)
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
             var.op = op
         return op
 
@@ -709,7 +709,7 @@ class BilinearInitializer(Initializer):
                 'shape': list(shape),
                 value_name: values
             })
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
             var.op = op
         return op
 
@@ -768,7 +768,7 @@ class NumpyArrayInitializer(Initializer):
                 value_name: values
             },
             stop_gradient=True)
-        if not framework._in_dygraph_mode():
+        if not framework.in_dygraph_mode():
             var.op = op
         return op
 
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py
index 7eb912645e5077d35a2d11d7d09a033d28345e15..11e3c4938bef4a3c97a724798e2f7273c25f06ed 100644
--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 import copy
 import six
 
-from .framework import Parameter, dtype_is_floating, _in_dygraph_mode
+from .framework import Parameter, dtype_is_floating, in_dygraph_mode
 from . import unique_name
 from paddle.fluid.initializer import Constant, Xavier
 from .param_attr import ParamAttr
diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py
index 869a5f54e9cdf5740c5e216917d92880d7d61e2d..9eed00b16185d00f30dfd75f03e31fb45cf9567c 100644
--- a/python/paddle/fluid/layer_helper_base.py
+++ b/python/paddle/fluid/layer_helper_base.py
@@ -17,7 +17,7 @@ from __future__ import print_function
 import copy
 import numpy as np
 
-from .framework import Variable, default_main_program, default_startup_program, _in_dygraph_mode, _current_expected_place
+from .framework import Variable, default_main_program, default_startup_program, in_dygraph_mode, _current_expected_place
 from . import unique_name
 from .param_attr import ParamAttr, WeightNormParamAttr
 from . import core
@@ -54,7 +54,7 @@ class LayerHelperBase(object):
         Return Variable construct from value
         """
         if isinstance(value, np.ndarray):
-            assert _in_dygraph_mode(
+            assert in_dygraph_mode(
             ), "to_variable could only be called in dygraph mode"
 
             if not block:
@@ -302,7 +302,7 @@ class LayerHelperBase(object):
             param = self._create_weight_normalize(attr, shape, dtype)
             WeightNormParamAttr.params_with_weight_norm.append(param)
             return param
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             # In dygraph mode, we want the returned parameter to be
             # initialized so that it can be used imperatively.
             return self.main_program.global_block().create_parameter(
@@ -370,7 +370,7 @@ class LayerHelperBase(object):
                initializer: initializer to use
         """
         assert isinstance(var, Variable)
-        if _in_dygraph_mode():
+        if in_dygraph_mode():
             initializer(var, var.block)
         else:
             self.startup_program.global_block().create_var(
diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py
index a5e513ed5e35d530dd07c49339995461da8454a1..2df63d723e6ce91d3819c5e4301b9d5682158d79 100644
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -29,7 +29,8 @@ from functools import reduce
 
 __all__ = [
     'While', 'Switch', 'increment', 'array_write', 'create_array', 'less_than',
-    'equal', 'array_read', 'array_length', 'IfElse', 'DynamicRNN', 'StaticRNN',
+    'less_equal', 'greater_than', 'greater_equal', 'equal', 'not_equal',
+    'array_read', 'array_length', 'IfElse', 'DynamicRNN', 'StaticRNN',
     'reorder_lod_tensor_by_rank', 'Print', 'is_empty'
 ]
 
@@ -189,6 +190,7 @@ def Print(input,
             'print_tensor_lod': print_tensor_lod,
             'print_phase': print_phase.upper()
         })
+    return input
 
 
 class BlockGuard(object):
@@ -267,8 +269,44 @@ class StaticRNN(object):
     """
     StaticRNN class.
 
-    StaticRNN class is used to create a StaticRNN. The RNN will have its
-    own parameters like inputs, outputs, memories, status and length.
+    The StaticRNN can process a batch of sequence data. The length of each
+    sample sequence must be equal. The StaticRNN will have its own parameters
+    like inputs, outputs, memories. **Note that the first dimension of inputs
+    represents sequence length, and all the sequence length of inputs must be
+    the same. And the meaning of each axis of input and output are the same.**
+
+    Examples:
+        >>> import paddle.fluid as fluid
+        >>> import paddle.fluid.layers as layers
+        >>>
+        >>> vocab_size, hidden_size=10000, 200
+        >>> x = layers.data(name="x", shape=[-1, 1, 1], dtype='int64')
+        >>> x_emb = layers.embedding(
+        >>>         input=x,
+        >>>         size=[vocab_size, hidden_size],
+        >>>         dtype='float32',
+        >>>         is_sparse=False)
+        >>> x_emb = layers.transpose(x_emb, perm=[1, 0, 2])
+        >>>
+        >>> rnn = fluid.layers.StaticRNN()
+        >>> with rnn.step():
+        >>>    word = rnn.step_input(x_emb)
+        >>>    prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word)
+        >>>    hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu')
+        >>>    rnn.update_memory(prev, hidden)  # set prev to hidden
+        >>>    rnn.step_output(hidden)
+        >>>
+        >>> result = rnn()
+
+    The StaticRNN will unfold sequence into time steps. Users need to define
+    how to process each time step during the :code:`with` step.
+
+    The :code:`memory` is used as a staging data cross time step. The initial
+    value of memory can be a variable that is filled with a constant value or
+    a specified variable.
+
+    The StaticRNN can mark multiple variables as its output. Use `rnn()` to
+    get the output sequence.
     """
     BEFORE_RNN_BLOCK = 0
     IN_RNN_BLOCK = 1
@@ -284,6 +322,9 @@ class StaticRNN(object):
         self.seq_len = None
 
     def step(self):
+        """
+        The block for user to define operators in RNN.
+        """
         return BlockGuardWithCompletion(self)
 
     def _assert_in_rnn_block_(self, method):
@@ -298,13 +339,28 @@ class StaticRNN(object):
                init_batch_dim_idx=0,
                ref_batch_dim_idx=1):
         """
+        Create a memory variable for static rnn.
+
+        If the :code:`init` is not None, :code:`memory` will be initialized by
+        this Variable. If the :code:`init` is None, :code:`shape` and :code:`batch_ref`
+        must be set, and this function will initialize a :code:`init` Variable.
+
         Args:
-            init: boot memory, if not set, a shape, batch_ref must be provided
-            shape: shape of the boot memory
-            batch_ref: batch size reference variable
-            init_value: the init value of boot memory
-            init_batch_dim_idx: the index of batch size in init's dimension
-            ref_batch_dim_idx: the index of batch size in batch_ref's dimension
+            init(Variable|None): The initialized variable. If it is not set,
+                :code:`shape` and :code:`batch_ref` must be provided.
+                Default: None.
+            shape(list|tuple): The shape of the boot memory. NOTE the shape
+                does not contain batch_size. Default: None.
+            batch_ref(Variable|None): The batch size reference Variable.
+                Default: None.
+            init_value(float): the init value of boot memory. Default: 0.0.
+            init_batch_dim_idx(int): the batch_size axis of the
+                :code:`init` Variable. Default: 0.
+            ref_batch_dim_idx(int): the batch_size axis of the
+                :code:`batch_ref` Variable. Default: 1.
+
+        Returns:
+            The memory variable.
         """
         self._assert_in_rnn_block_('memory')
         if init is None:
@@ -343,6 +399,16 @@ class StaticRNN(object):
             return pre_mem
 
     def step_input(self, x):
+        """
+        Mark a sequence as a StaticRNN input.
+
+        Args:
+            x(Variable): The input sequence, the shape of x
+                should be [seq_len, ...].
+
+        Returns:
+            The current time step in the input sequence.
+        """
         self._assert_in_rnn_block_('step_input')
         if not isinstance(x, Variable):
             raise TypeError("step input takes a Variable")
@@ -357,6 +423,15 @@ class StaticRNN(object):
         return ipt
 
     def step_output(self, o):
+        """
+        Mark a sequence as a StaticRNN output.
+
+        Args:
+            o(Variable): The output sequence.
+
+        Returns:
+            None.
+        """
         self._assert_in_rnn_block_('step_output')
         if not isinstance(o, Variable):
             raise TypeError("step output takes a Variable")
@@ -376,10 +451,30 @@ class StaticRNN(object):
         self.outputs.append(out_var)
 
     def output(self, *outputs):
+        """
+        Mark the StaticRNN output variables.
+
+        Args:
+            outputs: The output Variables.
+
+        Returns:
+            None
+        """
         for each in outputs:
             self.step_output(each)
 
     def update_memory(self, mem, var):
+        """
+        Update the memory from ex_mem to new_mem. NOTE that the shape and data
+        type of :code:`ex_mem` and :code:`new_mem` must be same.
+
+        Args:
+            mem(Variable): the memory variable.
+            var(Variable): the plain variable generated in RNN block.
+
+        Returns:
+            None
+        """
         if not isinstance(mem, Variable) or not isinstance(var, Variable):
             raise TypeError("update memory should take variables")
         self.memories[mem.name].mem = var
@@ -419,6 +514,9 @@ class StaticRNN(object):
         for m in self.memories:
             local_inputs.add(m)
 
+        # NOTE(zcd): the params have two categories of variables.
+        #   - the variables that are the out of StaticRnn.
+        #   - the variables that are the parameters of some layers, for example, conv2d.
         params = list()
         for op in rnn_block.ops:
             assert isinstance(op, Operator)
@@ -435,17 +533,19 @@ class StaticRNN(object):
         inlinks = [parent_block.var(i.name) for i in self.inputs]
         outlinks = self.outputs
 
+        # NOTE(zcd): the states maybe empty in some case.
         boot_memories = []
         pre_memories = []
         memories = []
         for _, mem in six.iteritems(self.memories):
             boot_memories.append(mem.init)
             pre_memories.append(mem.pre_mem.name)
+            assert mem.mem is not None, "%s should be updated in every step." % (
+                mem.init.name)
             mem_var = rnn_block.var(mem.mem.name)
             assert isinstance(mem_var, Variable)
             new_mem = self.helper.create_variable_for_type_inference(
                 dtype=mem_var.dtype)
-
             rnn_block.append_op(
                 type='rnn_memory_helper',
                 inputs={'X': [mem_var]},
@@ -464,6 +564,7 @@ class StaticRNN(object):
             outputs={'outputs': outlinks,
                      'step_scopes': [step_scope]},
             attrs={
+                'has_states': len(pre_memories) > 0,
                 'ex_states': pre_memories,
                 'states': memories,
                 'sub_block': rnn_block
@@ -872,6 +973,114 @@ def less_than(x, y, force_cpu=None, cond=None):
     return cond
 
 
+@templatedoc()
+def less_equal(x, y, cond=None):
+    """
+    This layer returns the truth value of :math:`x <= y` elementwise, which is equivalent to the overloaded operator `<=`.
+
+    Args:
+        x(Variable): First operand of *less_equal*
+        y(Variable): Second operand of *less_equal*
+        cond(Variable|None): Optional output variable to store the result of *less_equal*
+
+    Returns:
+        Variable: The tensor variable storing the output of *less_equal*.
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.less_equal(x=label, y=limit)
+    """
+    helper = LayerHelper("less_equal", **locals())
+    if cond is None:
+        cond = helper.create_variable_for_type_inference(dtype='bool')
+        cond.stop_gradient = True
+
+    attrs = dict()
+    if force_init_on_cpu():
+        attrs['force_cpu'] = force_init_on_cpu()
+
+    helper.append_op(
+        type='less_equal',
+        inputs={'X': [x],
+                'Y': [y]},
+        outputs={'Out': [cond]},
+        attrs=attrs)
+    return cond
+
+
+@templatedoc()
+def greater_than(x, y, cond=None):
+    """
+    This layer returns the truth value of :math:`x > y` elementwise, which is equivalent to the overloaded operator `>`.
+
+    Args:
+        x(Variable): First operand of *greater_than*
+        y(Variable): Second operand of *greater_than*
+        cond(Variable|None): Optional output variable to store the result of *greater_than*
+
+    Returns:
+        Variable: The tensor variable storing the output of *greater_than*.
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.greater_than(x=label, y=limit)
+    """
+    helper = LayerHelper("greater_than", **locals())
+    if cond is None:
+        cond = helper.create_variable_for_type_inference(dtype='bool')
+        cond.stop_gradient = True
+
+    attrs = dict()
+    if force_init_on_cpu():
+        attrs['force_cpu'] = force_init_on_cpu()
+
+    helper.append_op(
+        type='greater_than',
+        inputs={'X': [x],
+                'Y': [y]},
+        outputs={'Out': [cond]},
+        attrs=attrs)
+    return cond
+
+
+@templatedoc()
+def greater_equal(x, y, cond=None):
+    """
+    This layer returns the truth value of :math:`x >= y` elementwise, which is equivalent to the overloaded operator `>=`.
+
+    Args:
+        x(Variable): First operand of *greater_equal*
+        y(Variable): Second operand of *greater_equal*
+        cond(Variable|None): Optional output variable to store the result of *greater_equal*
+
+    Returns:
+        Variable: The tensor variable storing the output of *greater_equal*.
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.greater_equal(x=label, y=limit)
+    """
+    helper = LayerHelper("greater_equal", **locals())
+    if cond is None:
+        cond = helper.create_variable_for_type_inference(dtype='bool')
+        cond.stop_gradient = True
+
+    attrs = dict()
+    if force_init_on_cpu():
+        attrs['force_cpu'] = force_init_on_cpu()
+
+    helper.append_op(
+        type='greater_equal',
+        inputs={'X': [x],
+                'Y': [y]},
+        outputs={'Out': [cond]},
+        attrs=attrs)
+    return cond
+
+
 def equal(x, y, cond=None):
     """
     This layer returns the truth value of :math:`x == y` elementwise.
@@ -900,6 +1109,34 @@ def equal(x, y, cond=None):
     return cond
 
 
+def not_equal(x, y, cond=None):
+    """
+    This layer returns the truth value of :math:`x != y` elementwise, which is equivalent to the overloader operator `!=`.
+
+    Args:
+        x(Variable): First operand of *not_equal*
+        y(Variable): Second operand of *not_equal*
+        cond(Variable|None): Optional output variable to store the result of *not_equal*
+
+    Returns:
+        Variable: The tensor variable storing the output of *not_equal*.
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.not_equal(x=label, y=limit)
+    """
+    helper = LayerHelper("not_equal", **locals())
+    if cond is None:
+        cond = helper.create_variable_for_type_inference(dtype='bool')
+        cond.stop_gradient = True
+
+    helper.append_op(
+        type='not_equal', inputs={'X': [x],
+                                  'Y': [y]}, outputs={'Out': [cond]})
+    return cond
+
+
 def array_read(array, i):
     """
     This function performs the operation to read the data in as an
diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py
index 0a1ddbc1dba51692e75fa76856dd689b77ab9f35..09a573e8381e7402f6f617e6045110a93ba4f589 100644
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -509,14 +509,14 @@ def polygon_box_transform(input, name=None):
 
 @templatedoc(op_type="yolov3_loss")
 def yolov3_loss(x,
-                gtbox,
-                gtlabel,
+                gt_box,
+                gt_label,
                 anchors,
                 anchor_mask,
                 class_num,
                 ignore_thresh,
                 downsample_ratio,
-                gtscore=None,
+                gt_score=None,
                 use_label_smooth=True,
                 name=None):
     """
@@ -524,12 +524,12 @@ def yolov3_loss(x,
 
     Args:
         x (Variable): ${x_comment}
-        gtbox (Variable): groud truth boxes, should be in shape of [N, B, 4],
+        gt_box (Variable): groud truth boxes, should be in shape of [N, B, 4],
                           in the third dimenstion, x, y, w, h should be stored 
                           and x, y, w, h should be relative value of input image.
                           N is the batch number and B is the max box number in 
                           an image.
-        gtlabel (Variable): class id of ground truth boxes, shoud be in shape
+        gt_label (Variable): class id of ground truth boxes, shoud be in shape
                             of [N, B].
         anchors (list|tuple): ${anchors_comment}
         anchor_mask (list|tuple): ${anchor_mask_comment}
@@ -537,7 +537,7 @@ def yolov3_loss(x,
         ignore_thresh (float): ${ignore_thresh_comment}
         downsample_ratio (int): ${downsample_ratio_comment}
         name (string): the name of yolov3 loss. Default None.
-        gtscore (Variable): mixup score of ground truth boxes, shoud be in shape
+        gt_score (Variable): mixup score of ground truth boxes, shoud be in shape
                             of [N, B]. Default None.
         use_label_smooth (bool): ${use_label_smooth_comment}
 
@@ -558,13 +558,13 @@ def yolov3_loss(x,
       .. code-block:: python
 
           x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
-          gtbox = fluid.layers.data(name='gtbox', shape=[6, 4], dtype='float32')
-          gtlabel = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32')
-          gtscore = fluid.layers.data(name='gtscore', shape=[6], dtype='float32')
+          gt_box = fluid.layers.data(name='gt_box', shape=[6, 4], dtype='float32')
+          gt_label = fluid.layers.data(name='gt_label', shape=[6], dtype='int32')
+          gt_score = fluid.layers.data(name='gt_score', shape=[6], dtype='float32')
           anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
           anchor_mask = [0, 1, 2]
-          loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel,
-                                          gtscore=gtscore, anchors=anchors, 
+          loss = fluid.layers.yolov3_loss(x=x, gt_box=gt_box, gt_label=gt_label,
+                                          gt_score=gt_score, anchors=anchors, 
                                           anchor_mask=anchor_mask, class_num=80,
                                           ignore_thresh=0.7, downsample_ratio=32)
     """
@@ -572,11 +572,11 @@ def yolov3_loss(x,
 
     if not isinstance(x, Variable):
         raise TypeError("Input x of yolov3_loss must be Variable")
-    if not isinstance(gtbox, Variable):
+    if not isinstance(gt_box, Variable):
         raise TypeError("Input gtbox of yolov3_loss must be Variable")
-    if not isinstance(gtlabel, Variable):
+    if not isinstance(gt_label, Variable):
         raise TypeError("Input gtlabel of yolov3_loss must be Variable")
-    if gtscore is not None and not isinstance(gtscore, Variable):
+    if gt_score is not None and not isinstance(gt_score, Variable):
         raise TypeError("Input gtscore of yolov3_loss must be Variable")
     if not isinstance(anchors, list) and not isinstance(anchors, tuple):
         raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
@@ -602,11 +602,11 @@ def yolov3_loss(x,
 
     inputs = {
         "X": x,
-        "GTBox": gtbox,
-        "GTLabel": gtlabel,
+        "GTBox": gt_box,
+        "GTLabel": gt_label,
     }
-    if gtscore:
-        inputs["GTScore"] = gtscore
+    if gt_score:
+        inputs["GTScore"] = gt_score
 
     attrs = {
         "anchors": anchors,
@@ -1542,7 +1542,7 @@ def multi_box_head(inputs,
         .. code-block:: python
 
           mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head(
-            inputs=[conv1, conv2, conv3, conv4, conv5, conv5],
+            inputs=[conv1, conv2, conv3, conv4, conv5, conv6],
             image=images,
             num_classes=21,
             min_ratio=20,
diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py
index b7d1eeba80d93d549a019455087bb7cc1d2a1083..a67c8058f2c42713738420e81316452e15acb697 100644
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -35,8 +35,8 @@ from ..dygraph import learning_rate_scheduler as imperate_lr
 
 __all__ = [
     'exponential_decay', 'natural_exp_decay', 'inverse_time_decay',
-    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS',
-    'cosine_decay', 'linear_lr_warmup'
+    'polynomial_decay', 'piecewise_decay', 'noam_decay', 'cosine_decay',
+    'linear_lr_warmup'
 ]
 
 
@@ -349,24 +349,26 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
     training progresses. By using this function, the learning rate will be decayed by
     following cosine decay strategy.
 
-    decayed_lr = learning_rate * 0.5 * (math.cos(epoch * math.pi / epochs) + 1)
+    .. math::
+
+	decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1)
     
     Args:
         learning_rate(Variable|float): The initial learning rate.
         step_each_epoch(int): the number of steps in an epoch.
         epochs(int): the number of epochs.
 
-     Returns:
-        Variable: The decayed learning rate.
-
-     Examples:
+    Returns:
+	Variable: The decayed learning rate.
 
-    ..code-block:: python
+    Examples:
+	.. code-block:: python
 
-  	base_lr = 0.1
-	lr = fluid.layers.cosine_decay(
-	learning_rate = base_lr, step_each_epoch=10000, epochs=120)
+  	    base_lr = 0.1
+	    lr = fluid.layers.cosine_decay(
+	    learning_rate = base_lr, step_each_epoch=10000, epochs=120)
     """
+
     with default_main_program()._lr_schedule_guard():
         if imperative_base.enabled():
             decay = imperate_lr.CosineDecay(learning_rate, step_each_epoch,
@@ -381,50 +383,6 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
             return decayed_lr
 
 
-def append_LARS(params_grads, learning_rate, weight_decay):
-    """
-    Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for
-    each layer.
-
-    Args:
-        learning_rate: A learning rate Variable. This
-          is the global learning rate for LARS.
-        weight_decay: A Python `float` number.
-
-    Returns:
-        The decayed learning rate
-    Examples:
-        .. code-block:: python
-
-            learning_rate *= local_gw_ratio * sqrt(sumsq(param))
-                        / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param)))
-    """
-
-    assert not imperative_base.enabled(
-    ), "append_LARS is NOT supported in dygraph mode now"
-
-    def _balanced_weight(param_norm, grad_norm):
-        if weight_decay == 1.0:
-            return grad_norm + param_norm
-        else:
-            return grad_norm + weight_decay * param_norm
-
-    for param, grad in params_grads:
-        with param.block.program.optimized_guard(
-            [param, grad]), name_scope("optimizer"):
-            param_lr = param.optimize_attr['learning_rate']
-            param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param)))
-            grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad)))
-            if type(param_lr) == float and param_lr == 1.0:
-                decayed_lr = learning_rate * param_norm \
-                    / _balanced_weight(param_norm, grad_norm)
-            else:
-                decayed_lr = learning_rate * param_lr * param_norm \
-                    / _balanced_weight(param_norm, grad_norm)
-            # set back param local learning rate
-            param.optimize_attr['learning_rate'] = decayed_lr
-
-
 def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr):
     """
     Applies linear learning rate warmup before the normal learning rate
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index a5d4d3947ab9b9699f6fc8ac0d7f088ede345290..6bf437b4dfbd06847122d64dada14ad3accc8181 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -23,7 +23,7 @@ import os
 import inspect
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant, NumpyArrayInitializer
-from ..framework import Variable, OpProtoHolder, _in_dygraph_mode
+from ..framework import Variable, OpProtoHolder, in_dygraph_mode
 from ..dygraph import base
 from ..param_attr import ParamAttr
 from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_
@@ -73,6 +73,8 @@ __all__ = [
     'reduce_max',
     'reduce_min',
     'reduce_prod',
+    'reduce_all',
+    'reduce_any',
     'sequence_first_step',
     'sequence_last_step',
     'sequence_slice',
@@ -159,6 +161,7 @@ __all__ = [
     'sum',
     'slice',
     'shape',
+    'rank',
     'logical_and',
     'logical_or',
     'logical_xor',
@@ -193,6 +196,7 @@ __all__ = [
     'npair_loss',
     'pixel_shuffle',
     'fsp_matrix',
+    'continuous_value_model',
 ]
 
 kIgnoreIndex = -100
@@ -481,7 +485,7 @@ def dynamic_lstm(input,
             forward, _ = fluid.layers.dynamic_lstm(
                 input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
     """
-    assert _in_dygraph_mode(
+    assert in_dygraph_mode(
     ) is not True, "please use lstm instead of dynamic_lstm in dygraph mode!"
     assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
     helper = LayerHelper('lstm', **locals())
@@ -867,7 +871,7 @@ def dynamic_lstmp(input,
                                                      proj_activation="tanh")
     """
 
-    assert _in_dygraph_mode(
+    assert in_dygraph_mode(
     ) is not True, "please use lstm instead of dynamic_lstmp in dygraph mode!"
 
     assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
@@ -1041,7 +1045,7 @@ def dynamic_gru(input,
             hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
     """
 
-    assert _in_dygraph_mode(
+    assert in_dygraph_mode(
     ) is not True, "please use gru instead of dynamic_gru in dygraph mode!"
 
     helper = LayerHelper('gru', **locals())
@@ -1760,7 +1764,7 @@ def sequence_conv(input,
         Variable: output of sequence_conv
     """
 
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper('sequence_conv', **locals())
     dtype = helper.input_dtype()
@@ -1821,7 +1825,7 @@ def sequence_softmax(input, use_cudnn=False, name=None):
                               dtype='float32', lod_level=1)
              x_sequence_softmax = fluid.layers.sequence_softmax(input=x)
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper('sequence_softmax', **locals())
     dtype = helper.input_dtype()
@@ -2315,7 +2319,7 @@ def sequence_pool(input, pool_type, is_test=False):
              last_x = fluid.layers.sequence_pool(input=x, pool_type='last')
              first_x = fluid.layers.sequence_pool(input=x, pool_type='first')
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper('sequence_pool', **locals())
     dtype = helper.input_dtype()
@@ -2356,7 +2360,7 @@ def sequence_concat(input, name=None):
 
            out = fluid.layers.sequence_concat(input=[seq1, seq2, seq3])
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper('sequence_concat', **locals())
     out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
@@ -2485,7 +2489,7 @@ def sequence_slice(input, offset, length, name=None):
              subseqs = fluid.layers.sequence_slice(input=seqs, offset=offset,
                                                    length=length)
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper("sequence_slice", **locals())
     dtype = helper.input_dtype()
@@ -3307,7 +3311,7 @@ def layer_norm(input,
         >>>                          dtype='float32')
         >>> x = fluid.layers.layer_norm(input=data, begin_norm_axis=1)
     """
-    assert _in_dygraph_mode(
+    assert in_dygraph_mode(
     ) is not True, "please use FC instead of fc in dygraph mode!"
     helper = LayerHelper('layer_norm', **locals())
     dtype = helper.input_dtype()
@@ -3946,7 +3950,7 @@ def sequence_expand(x, y, ref_level=-1, name=None):
                              dtype='float32', lod_level=1)
             out = layers.sequence_expand(x=x, y=y, ref_level=0)
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper('sequence_expand', input=x, **locals())
     dtype = helper.input_dtype()
@@ -4014,7 +4018,7 @@ def sequence_expand_as(x, y, name=None):
                              dtype='float32', lod_level=1)
             out = layers.sequence_expand_as(x=x, y=y)
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper('sequence_expand_as', input=x, **locals())
     dtype = helper.input_dtype()
@@ -4062,7 +4066,7 @@ def sequence_pad(x, pad_value, maxlen=None, name=None):
             out = fluid.layers.sequence_pad(x=x, pad_value=pad_value)
     """
 
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper('sequence_pad', input=x, **locals())
     dtype = helper.input_dtype()
@@ -4130,7 +4134,7 @@ def sequence_unpad(x, length, name=None):
             out = fluid.layers.sequence_unpad(x=x, length=len)
     """
 
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper('sequence_unpad', input=x, **locals())
     dtype = helper.input_dtype()
@@ -4738,6 +4742,106 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
     return out
 
 
+def reduce_all(input, dim=None, keep_dim=False, name=None):
+    """
+    Computes the ``logical and`` of tensor elements over the given dimension.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (list|int|None): The dimension along which the logical and is computed.
+            If :attr:`None`, compute the logical and over all elements of
+            :attr:`input` and return a Tensor variable with a single element,
+            otherwise must be in the range :math:`[-rank(input), rank(input))`.
+            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+        
+            # x is a bool Tensor variable with following elements:
+            #    [[True, False]
+            #     [True, True]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_all(x)  # False 
+            fluid.layers.reduce_all(x, dim=0)  # [True, False]
+            fluid.layers.reduce_all(x, dim=-1)  # [False, True]
+            fluid.layers.reduce_all(x, dim=1,
+                                     keep_dim=True)  # [[False], [True]]
+
+    """
+    helper = LayerHelper('reduce_all', **locals())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
+    helper.append_op(
+        type='reduce_all',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim != None else [0],
+            'keep_dim': keep_dim,
+            'reduce_all': True if dim == None else False
+        })
+    return out
+
+
+def reduce_any(input, dim=None, keep_dim=False, name=None):
+    """
+    Computes the ``logical or`` of tensor elements over the given dimension.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (list|int|None): The dimension along which the logical or is computed.
+            If :attr:`None`, compute the logical or over all elements of
+            :attr:`input` and return a Tensor variable with a single element,
+            otherwise must be in the range :math:`[-rank(input), rank(input))`.
+            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a bool Tensor variable with following elements:
+            #    [[True, False]
+            #     [False, False]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_any(x)  # True
+            fluid.layers.reduce_any(x, dim=0)  # [True, False]
+            fluid.layers.reduce_any(x, dim=-1)  # [True, False]
+            fluid.layers.reduce_any(x, dim=1,
+                                     keep_dim=True)  # [[True], [False]]
+
+    """
+    helper = LayerHelper('reduce_any', **locals())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
+    helper.append_op(
+        type='reduce_any',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim != None else [0],
+            'keep_dim': keep_dim,
+            'reduce_all': True if dim == None else False
+        })
+    return out
+
+
 def split(input, num_or_sections, dim=-1, name=None):
     """
     Split the input tensor into multiple sub-tensors.
@@ -4819,7 +4923,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
             the dimension to normalization is rank(X) + axis. -1 is the
             last dimension.
         epsilon(float): The epsilon value is used to avoid division by zero, \
-            the defalut value is 1e-10.
+            the defalut value is 1e-12.
         name(str|None): A name for this layer(optional). If set None, the layer \
             will be named automatically.
 
@@ -5305,7 +5409,7 @@ def sequence_reshape(input, new_dim):
             x = fluid.layers.data(shape=[5, 20], dtype='float32', lod_level=1)
             x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10)
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper('sequence_reshape', **locals())
     out = helper.create_variable_for_type_inference(helper.input_dtype())
@@ -5617,12 +5721,21 @@ def hsigmoid(input,
         raise ValueError(
             "num_classes must not be less than 2 with default tree")
 
+    if (not is_custom) and (is_sparse):
+        print("Sparse mode should not be used without custom tree")
+        is_sparse = False
+
+    if (not is_custom) and ((path_table is not None) or
+                            (path_code is not None)):
+        raise ValueError(
+            "only num_classes should be passed without custom tree")
+
     if (is_custom) and (path_code is None):
-        raise ValueError("path_code should not be None with costum tree")
+        raise ValueError("path_code should not be None with custom tree")
     elif (is_custom) and (path_table is None):
-        raise ValueError("path_table should not be None with costum tree")
+        raise ValueError("path_table should not be None with custom tree")
     elif (is_custom) and (num_classes is None):
-        raise ValueError("num_classes should not be None with costum tree")
+        raise ValueError("num_classes should not be None with custom tree")
     else:
         pass
 
@@ -5841,7 +5954,7 @@ def im2sequence(input,
                 input=layer, stride=[1, 1], filter_size=[2, 2])
 
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
 
     if isinstance(filter_size, int):
@@ -6165,6 +6278,8 @@ def sampled_softmax_with_cross_entropy(logits,
     sampled_label = helper.create_variable_for_type_inference(dtype='int64')
     sampled_softlabel = helper.create_variable_for_type_inference(
         dtype=logits.dtype)
+    logits_dim = helper.create_variable_for_type_inference(dtype=logits.dtype)
+    labels_dim = helper.create_variable_for_type_inference(dtype=label.type)
 
     helper.append_op(
         type='sample_logits',
@@ -6178,7 +6293,9 @@ def sampled_softmax_with_cross_entropy(logits,
             'Samples': samples,
             'Probabilities': probabilities,
             'SampledLabels': sampled_label,
-            'SampledLogits': sampled_logits
+            'SampledLogits': sampled_logits,
+            'LogitsDim': logits_dim,
+            'LabelsDim': labels_dim
         },
         attrs={
             'use_customized_samples': use_customized_samples,
@@ -6277,7 +6394,7 @@ def one_hot(input, depth):
     Examples:
         .. code-block:: python
 
-            label = layers.data(name="label", shape=[1], dtype="float32")
+            label = layers.data(name="label", shape=[1], dtype="int64")
             one_hot_label = layers.one_hot(input=label, depth=10)
     """
     helper = LayerHelper("one_hot", **locals())
@@ -6485,7 +6602,7 @@ def squeeze(input, axes, name=None):
             x = layers.data(name='x', shape=[5, 1, 10])
             y = layers.sequeeze(input=x, axes=[1])
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "squeeze layer is not supported in dygraph mode yet.")
     helper = LayerHelper("squeeze", **locals())
     out = helper.create_variable_for_type_inference(dtype=input.dtype)
@@ -7138,10 +7255,10 @@ def image_resize(input,
         out_shape(list|tuple|Variable|None): Output shape of image resize
                                     layer, the shape is (out_h, out_w).
                                     Default: None
-        scale(float|None): The multiplier for the input height or width.
-                         At least one of out_shape or scale must be set.
-                         And out_shape has a higher priority than scale.
-                         Default: None
+        scale(float|None): The multiplier for the input height or width. At
+             least one of :attr:`out_shape` or :attr:`scale` must be set. 
+             And :attr:`out_shape` has a higher priority than :attr:`scale`. 
+             Default: None.
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
         resample(str): The resample method. It supports 'BILINEAR' and 'NEAREST'
@@ -7179,6 +7296,7 @@ def image_resize(input,
                     or 'NEAREST' currently.
         ValueError: One of out_shape and scale must not be None.
         ValueError: out_shape length should be 2.
+        ValueError: scale should be greater than zero.
         TypeError: align_corners shoule be a bool value
         ValueError: align_mode can only be '0' or '1'
 
@@ -7210,26 +7328,36 @@ def image_resize(input,
     def _is_list_or_turple_(data):
         return (isinstance(data, list) or isinstance(data, tuple))
 
-    out_h = 0
-    out_w = 0
     inputs = {"X": input}
+    attrs = {
+        "out_h": 0,
+        "out_w": 0,
+        "interp_method": resample_type,
+        "align_corners": align_corners,
+        "align_mode": align_mode
+    }
+
     if out_shape is not None:
         if isinstance(out_shape, Variable):
             warnings.warn("out_shape as Variable type is deprecated, \
                     it is recommended to use actual_shape instead of \
                     out_shape to specify output shape dynamically.")
             inputs['OutSize'] = out_shape
-        elif not (_is_list_or_turple_(out_shape)):
-            raise TypeError("out_shape should be a list or tuple or Variable.")
-        elif len(out_shape) != 2:
-            raise ValueError("out_shape length should be 2.")
-
-        out_shape = list(map(int, out_shape))
-        out_h = out_shape[0]
-        out_w = out_shape[1]
+        else:
+            if not (_is_list_or_turple_(out_shape)):
+                raise TypeError(
+                    "out_shape should be a list or tuple or Variable.")
+            if len(out_shape) != 2:
+                raise ValueError("out_shape length should be 2.")
+
+            out_shape = list(map(int, out_shape))
+            attrs['out_h'] = out_shape[0]
+            attrs['out_w'] = out_shape[1]
+
     else:
-        out_h = int(input.shape[2] * scale)
-        out_w = int(input.shape[3] * scale)
+        if scale <= 0:
+            raise ValueError("scale should be greater than zero.")
+        attrs['scale'] = float(scale)
 
     if isinstance(actual_shape, Variable):
         inputs["OutSize"] = actual_shape
@@ -7241,13 +7369,7 @@ def image_resize(input,
         type='{}_interp'.format(resample_type),
         inputs=inputs,
         outputs={"Out": out},
-        attrs={
-            "out_h": out_h,
-            "out_w": out_w,
-            "interp_method": resample_type,
-            "align_corners": align_corners,
-            "align_mode": align_mode
-        })
+        attrs=attrs)
     return out
 
 
@@ -7315,11 +7437,14 @@ def resize_bilinear(input,
     Args:
         input(${x_type}): ${x_comment}.
 
-        out_shape(${out_size_type}): ${out_size_comment}.
+        out_shape(list|tuple|Variable|None): Output shape of resize bilinear
+                                    layer, the shape is (out_h, out_w).
+                                    Default: None
 
         scale(float|None): The multiplier for the input height or width. At
-             least one of out_shape or scale must be set. And out_shape has
-             a higher priority than scale. Default: None.
+             least one of :attr:`out_shape` or :attr:`scale` must be set. 
+             And :attr:`out_shape` has a higher priority than :attr:`scale`. 
+             Default: None.
 
         name(str|None): The output variable name.
         actual_shape(Variable): An optional input to specify output shape
@@ -7406,11 +7531,14 @@ def resize_nearest(input,
     Args:
         input(${x_type}): ${x_comment}.
 
-        out_shape(${out_size_type}): ${out_size_comment}.
+        out_shape(list|tuple|Variable|None): Output shape of resize nearest
+                                    layer, the shape is (out_h, out_w).
+                                    Default: None
 
         scale(float|None): The multiplier for the input height or width. At
-             least one of out_shape or scale must be set. And out_shape has
-             a higher priority than scale. Default: None.
+             least one of :attr:`out_shape` or :attr:`scale` must be set. 
+             And :attr:`out_shape` has a higher priority than :attr:`scale`. 
+             Default: None.
 
         name(str|None): The output variable name.
         actual_shape(Variable): An optional input to specify output shape
@@ -7620,7 +7748,7 @@ def sequence_scatter(input, index, updates, name=None):
             output = fluid.layers.sequence_scatter(input, index, updates)
 
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper('sequence_scatter', **locals())
     dtype = helper.input_dtype()
@@ -8710,7 +8838,7 @@ def sequence_enumerate(input, win_size, pad_value=0, name=None):
             x = fluid.layers.data(shape[30, 1], dtype='int32', lod_level=1)
             out = fluid.layers.sequence_enumerate(input=x, win_size=3, pad_value=0)
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper('sequence_enumerate', **locals())
     out = helper.create_variable_for_type_inference(
@@ -8751,7 +8879,7 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None):
         Variable: The output sequence mask.
 
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
 
     helper = LayerHelper('sequence_mask', **locals())
@@ -9226,11 +9354,37 @@ def shape(input):
     return out
 
 
+def rank(input):
+    """
+    **Rank Layer**
+
+    Returns the number of dimensions for a tensor, which is a 0-D int32 Tensor.
+
+    Args:
+        input (Variable): The input variable.
+
+    Returns:
+        Variable: The rank of the input variable.
+
+    Examples:
+        .. code-block:: python
+
+            input = layers.data(
+                name="input", shape=[3, 100, 100], dtype="float32")
+            rank = layers.rank(input) # 4
+    """
+
+    ndims = len(input.shape)
+    out = assign(np.array(ndims, 'int32'))
+
+    return out
+
+
 def _elementwise_op(helper):
     op_type = helper.layer_type
     x = helper.kwargs.get('x', None)
     y = helper.kwargs.get('y', None)
-    if _in_dygraph_mode():
+    if in_dygraph_mode():
         x = base.to_variable(x)
         y = base.to_variable(y)
 
@@ -9803,7 +9957,7 @@ def sequence_reverse(x, name=None):
     Returns:
         out(${y_type}): ${y_comment}
     """
-    assert not _in_dygraph_mode(), (
+    assert not in_dygraph_mode(), (
         "sequence layer is not supported in dygraph mode yet.")
     helper = LayerHelper("sequence_reverse", **locals())
     if name is None:
@@ -10991,7 +11145,7 @@ def pixel_shuffle(x, upscale_factor):
 
     Returns:
 
-        Out(Variable): the pixel shuffle result is a tensor variable with the same shape and the same type as the input.
+        Out(Variable): Reshaped tensor according to the new dimension.
 
     Raises:
 
@@ -11062,3 +11216,54 @@ def fsp_matrix(x, y):
         input_param_name='x'))
     helper.append_op(type='fsp', inputs={'X': x, 'Y': y}, outputs={'Out': out})
     return out
+
+
+def continuous_value_model(input, cvm, use_cvm=True):
+    """
+
+    **continuous_value_model layers**
+
+    continuous value model(cvm). Now, it only considers show and click value in CTR project.
+    We assume that input is an embedding vector with cvm_feature, whose shape is [N * D] (D is 2 + embedding dim).
+    If use_cvm is True, it will log(cvm_feature), and output shape is [N * D].
+    If use_cvm is False, it will remove cvm_feature from input, and output shape is [N * (D - 2)].
+    
+    This layer accepts a tensor named input which is ID after embedded(lod level is 1), cvm is a show_click info.
+
+    Args:
+
+        input (Variable): a 2-D LodTensor with shape [N x D], where N is the batch size, D is 2 + the embedding dim. lod level = 1.
+        cvm (Variable):   a 2-D Tensor with shape [N x 2], where N is the batch size, 2 is show and click.
+        use_cvm  (bool):  use cvm or not. if use cvm, the output dim is the same as input
+                          if don't use cvm, the output dim is input dim - 2(remove show and click)
+                          (cvm op is a customized op, which input is a sequence has embedd_with_cvm default, so we need an op named cvm to decided whever use it or not.)
+
+    Returns:
+
+        Variable: A 2-D LodTensor with shape [N x D], if use cvm, D is equal to input dim, if don't use cvm, D is equal to input dim - 2. 
+
+    Examples:
+
+        .. code-block:: python
+
+          input = fluid.layers.data(name="input", shape=[-1, 1], lod_level=1, append_batch_size=False, dtype="int64")#, stop_gradient=False)
+          label = fluid.layers.data(name="label", shape=[-1, 1], append_batch_size=False, dtype="int64")
+          embed = fluid.layers.embedding(
+                            input=input,
+                            size=[100, 11],
+                            dtype='float32')
+          ones = fluid.layers.fill_constant_batch_size_like(input=label, shape=[-1, 1], dtype="int64", value=1)
+          show_clk = fluid.layers.cast(fluid.layers.concat([ones, label], axis=1), dtype='float32')
+          show_clk.stop_gradient = True
+          input_with_cvm = fluid.layers.continuous_value_model(embed, show_clk, True)
+
+    """
+    helper = LayerHelper('cvm', **locals())
+    out = helper.create_variable(dtype=input.dtype)
+    helper.append_op(
+        type='cvm',
+        inputs={'X': [input],
+                'CVM': [cvm]},
+        outputs={'Y': [out]},
+        attrs={"use_cvm": use_cvm})
+    return out
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index f018bb8af8cc9f7ed965c86d5aff40352014c393..f06c0abaf901950d072cf857696bc7479fb7b52f 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -27,6 +27,7 @@ __activations_noattr__ = [
     'tanh_shrink',
     'softshrink',
     'sqrt',
+    'rsqrt',
     'abs',
     'ceil',
     'floor',
@@ -81,8 +82,8 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0):
 
     Examples:
         .. code-block:: python
-
-        result = fluid.layers.uniform_random(shape=[32, 784])
+     
+            result = fluid.layers.uniform_random(shape=[32, 784])
     """
 
     locals_var = locals().keys()
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 80450119f44e93aae4b483983484ea18be5b2035..d1681580bebc454d26be518180b649bfb3c76e4e 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -24,26 +24,11 @@ from .layer_function_generator import templatedoc
 import numpy
 
 __all__ = [
-    'create_tensor',
-    'create_parameter',
-    'create_global_var',
-    'cast',
-    'tensor_array_to_tensor',
-    'concat',
-    'sums',
-    'assign',
-    'fill_constant_batch_size_like',
-    'fill_constant',
-    'argmin',
-    'argmax',
-    'argsort',
-    'ones',
-    'zeros',
-    'reverse',
-    'has_inf',
-    'has_nan',
-    'isfinite',
-    'range',
+    'create_tensor', 'create_parameter', 'create_global_var', 'cast',
+    'tensor_array_to_tensor', 'concat', 'sums', 'assign',
+    'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
+    'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite',
+    'range', 'linspace', 'zeros_like'
 ]
 
 
@@ -826,3 +811,76 @@ def range(start, end, step, dtype):
                 'Step': step},
         outputs={'Out': [out]})
     return out
+
+
+def linspace(start, stop, num, dtype):
+    """
+    Return fixed number of evenly spaced values within a given interval.
+
+    First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy.
+
+    Args:
+        start(float|Variable): First entry in the sequence. It is a float scalar, or a tensor of shape [1] with type 'float32'|'float64'.
+        stop(float|Variable): Last entry in the sequence. It is a float scalar, or a tensor of shape [1] with type 'float32'|'float64'.
+        num(int|Variable): Number of entry in the sequence. It is an int scalar, or a tensor of shape [1] with type int32.
+        dtype(string): 'float32'|'float64', the data type of the output tensor.
+
+    Returns:
+        Variable: The tensor variable storing a 1-D tensor. 
+
+    Examples:
+        .. code-block:: python
+
+             data = fluid.layers.linspace(0, 10, 5, 'float32') # [0.0,  2.5,  5.0,  7.5, 10.0]
+             data = fluid.layers.linspace(0, 10, 1, 'float32') # [0.0]
+
+    """
+    helper = LayerHelper("linspace", **locals())
+
+    if not isinstance(start, Variable):
+        start = fill_constant([1], dtype, start)
+    if not isinstance(stop, Variable):
+        stop = fill_constant([1], dtype, stop)
+    if not isinstance(num, Variable):
+        num = fill_constant([1], 'int32', num)
+
+    out = helper.create_variable_for_type_inference(dtype=start.dtype)
+
+    helper.append_op(
+        type='linspace',
+        inputs={'Start': start,
+                'Stop': stop,
+                'Num': num},
+        outputs={'Out': [out]})
+    return out
+
+
+def zeros_like(x, out=None):
+    """
+    **zeros_like**
+
+    This function creates a zeros tensor which has identical shape and dtype 
+    with `x`.
+
+    Args:
+        x(Variable): The input tensor which specifies shape and dtype.
+        out(Variable): The output tensor.
+
+    Returns:
+        Variable: The tensor variable storing the output.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', dtype='float32', shape=[3], append_batch_size=False)
+          data = fluid.layers.zeros_like(x) # [0.0, 0.0, 0.0]
+
+    """
+
+    helper = LayerHelper("zeros_like", **locals())
+    if out is None:
+        out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='fill_zeros_like', inputs={'X': [x]}, outputs={'Out': [out]})
+    out.stop_gradient = True
+    return out
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index 7e6e37116fe23f26eb14dd0573dbe031aec98dd8..28126b72a429714dfe66ae709e31d99d843fab74 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -55,7 +55,7 @@ class Optimizer(object):
     """
 
     def __init__(self, learning_rate, regularization=None, name=None):
-        if framework._in_dygraph_mode():
+        if framework.in_dygraph_mode():
             if not isinstance(learning_rate, float) and \
                     not isinstance(learning_rate, LearningRateDecay):
                 raise TypeError(
@@ -205,7 +205,7 @@ class Optimizer(object):
             name = self._name + "_" + name
         if (name in self._accumulators and
                 param.name in self._accumulators[name]):
-            if framework._in_dygraph_mode():
+            if framework.in_dygraph_mode():
                 return self._accumulators[name][param.name]
             raise Exception("Accumulator {} already exists for parameter {}".
                             format(name, param.name))
@@ -275,15 +275,26 @@ class Optimizer(object):
         self._create_global_learning_rate()
 
         optimize_ops = []
-        for param_and_grad in parameters_and_grads:
-            if param_and_grad[1] is None:
-                continue
-            with param_and_grad[0].block.program._optimized_guard(
-                    param_and_grad), name_scope("optimizer"):
-                if param_and_grad[0].trainable is True:
-                    optimize_op = self._append_optimize_op(global_block,
-                                                           param_and_grad)
-                    optimize_ops.append(optimize_op)
+        if framework.in_dygraph_mode():
+            for param_and_grad in parameters_and_grads:
+                if param_and_grad[1] is None:
+                    continue
+                with param_and_grad[0].block.program._optimized_guard(
+                        param_and_grad):
+                    if param_and_grad[0].trainable is True:
+                        optimize_op = self._append_optimize_op(global_block,
+                                                               param_and_grad)
+                        optimize_ops.append(optimize_op)
+        else:
+            for param_and_grad in parameters_and_grads:
+                if param_and_grad[1] is None:
+                    continue
+                with param_and_grad[0].block.program._optimized_guard(
+                        param_and_grad), name_scope("optimizer"):
+                    if param_and_grad[0].trainable is True:
+                        optimize_op = self._append_optimize_op(global_block,
+                                                               param_and_grad)
+                        optimize_ops.append(optimize_op)
 
         # Get custom finish ops for subclasses
         # FIXME: Need to fix this once we figure out how to handle dependencies
@@ -363,7 +374,7 @@ class Optimizer(object):
             See examples in `apply_gradients`.
         """
         self._dtype = loss.dtype
-        if framework._in_dygraph_mode():
+        if framework.in_dygraph_mode():
             if parameter_list is not None:
                 parameters = parameter_list
             else:
@@ -448,7 +459,7 @@ class Optimizer(object):
         Returns:
             list: A list of operators appended to the current program.
         """
-        if framework._in_dygraph_mode():
+        if framework.in_dygraph_mode():
             with program_guard(framework.default_main_program(),
                                framework.default_startup_program()):
                 optimize_ops = self._create_optimization_pass(params_grads)
@@ -628,16 +639,16 @@ class DGCMomentumOptimizer(MomentumOptimizer):
 
     Original paper is https://arxiv.org/abs/1712.01887
 
-    DGC reduce the communication bandwidth by sending only the important gradients (sparse update):\
+    DGC reduces the communication bandwidth by sending only the important gradients (sparse update):\
         only gradients larger than a threshold are transmitted.
 
-    To avoid losing information, DGC accumulate the rest of the gradients locally.
+    To avoid losing information, DGC accumulates the rest of the gradients locally.
 
     Eventually, these gradients become large enough to be transmitted.
 
-    Thus, DGC send the large gradients immediately but eventually send all of the gradients over time.
+    Thus, DGC sends the large gradients immediately but eventually send all of the gradients over time.
 
-    To ensure no loss of accuracy, DGC employs momentum correc-tionandlocal gradient clipping on top of the gradient sparsification to maintain model performance.
+    To ensure no loss of accuracy, DGC employs momentum correction and local gradient clipping on top of the gradient sparsification to maintain model performance.
 
     DGC also uses momentum factor masking and warmup training to overcome the staleness problem caused by reduced communication.
 
@@ -652,7 +663,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
         learning_rate (float|Variable): the learning rate used to update parameters. \
             Can be a float value or a Variable with one float value as data element.
         momentum (float): Momentum factor.
-        rampup_begin_step (int): The begining step from which gradient compression is implemented.
+        rampup_begin_step (int): The beginning step from which gradient compression is implemented.
         rampup_step (int): How long it use the sparsity periods. Default is 1.
             for example: If the sparsity is [0.75, 0.9375, 0.984375, 0.996, 0.999], and the rampup_step is 5, \
                 it will use 0.75 at 0 step, and 0.9375 at 1 step, and so on. And when reach sparsity array ends, \
@@ -660,9 +671,9 @@ class DGCMomentumOptimizer(MomentumOptimizer):
         sparsity (list[float]): Get top important element from gradient tensor, the ratio is (1 - current sparsity).
         use_nesterov (bool): Enables Nesterov momentum. True means use nesterov.
         local_grad_clip_norm (float): Clip norm value if needed.
-        num_trainers: The number of training node.
+        num_trainers: The number of training nodes.
         regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
-        name: A optional name prefix.
+        name: An optional name prefix.
 
     Examples:
         .. code-block:: python
@@ -740,19 +751,19 @@ class DGCMomentumOptimizer(MomentumOptimizer):
 
         # step counter
         self._global_step_var = self._add_auto_increment_var(
-            counter_name='__g_dgc_counter__', begin=0)
+            counter_name=core.dgc.kDGCCounterName(), begin=0)
 
         # rampup begin step var for all_reduce_op_handle
         self._rampup_begin_step_var = tensor.create_global_var(
             shape=[1],
             dtype=core.VarDesc.VarType.FP32,
             persistable=True,
-            name='__g_rampup_begin_step__',
+            name=core.dgc.kDGCRampUpBeginStepName(),
             value=self._rampup_begin_step * 1.0,
             force_cpu=True)
 
         for param_var, grad_var in param_and_grads:
-            var_numel = reduce(lambda x, y: x * y, param_var.shape)
+            var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
             if var_numel < 16384 or \
                 param_var.type == core.VarDesc.VarType.SELECTED_ROWS  or \
                 grad_var.type == core.VarDesc.VarType.SELECTED_ROWS  or  \
@@ -763,20 +774,20 @@ class DGCMomentumOptimizer(MomentumOptimizer):
                 shape=param_var.shape,
                 dtype=param_var.dtype,
                 persistable=True,
-                name=param_var.name + "__dgc_u__",
+                name=param_var.name + core.dgc.kDGCUName(),
                 value=0.0)
             v_var = tensor.create_global_var(
                 shape=param_var.shape,
                 dtype=param_var.dtype,
                 persistable=True,
-                name=param_var.name + "__dgc_v__",
+                name=param_var.name + core.dgc.kDGCVName(),
                 value=0.0)
 
             k_var = tensor.create_global_var(
                 shape=[1],
                 dtype=param_var.dtype,
                 persistable=True,
-                name=param_var.name + "__dgc_k__",
+                name=param_var.name + core.dgc.kDGCKName(),
                 value=0.0,
                 force_cpu=True)
 
@@ -784,7 +795,7 @@ class DGCMomentumOptimizer(MomentumOptimizer):
                 shape=[1],
                 dtype=param_var.dtype,
                 persistable=True,
-                name=param_var.name + "__dgc_encoded__",
+                name=param_var.name + core.dgc.kDGCEncodedName(),
                 value=0.0,
                 force_cpu=False)
 
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 6b88e7a99fd78f6a7670ba55bc678e85d229ddf4..6bf91887dcdbbc817e9560aa5575fa19ace3767a 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -104,10 +104,13 @@ class ParallelExecutor(object):
         self._scope = scope if scope is not None else executor.global_scope()
 
         if main_program is not None and main_program._enable_dgc:
-            assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce
+            assert num_trainers > 1, "dgc is not useful when num_trainers <= 1"
+            assert build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce, "dgc \
+                only used for allreduce"
+
             assert num_trainers * len(
                 self._places) > 1, "dgc is not useful for single card training"
-            assert use_cuda
+            assert use_cuda, "dgc only used under cuda"
 
         main_program = main_program if main_program is not None \
             else framework.default_main_program()
@@ -123,6 +126,7 @@ class ParallelExecutor(object):
             exec_strategy=exec_strategy,
             share_vars_from=share_vars_from._compiled_program
             if share_vars_from else None)
+
         self._place = core.CUDAPlace(0) if use_cuda else core.CPUPlace()
         self._exe = executor.Executor(self._place)
         self._compiled_program._compile(place=self._place, scope=self._scope)
diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt
index ee734f3c782adb5196a03aca5718377009a5b4e7..999a765b6dc32323a24f9069f11134360dbadcb8 100644
--- a/python/paddle/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/CMakeLists.txt
@@ -6,4 +6,6 @@ foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
 
-add_subdirectory(high-level-api)
+if(WITH_HIGH_LEVEL_API_TEST)
+  add_subdirectory(high-level-api)
+endif()
diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
index efa5ee2d06af3d31e7d84122dd7eea37d6dcf3a3..c034709fbdc2aa315ca995a42c278b261e6283a4 100644
--- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt
@@ -1,16 +1,28 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*_new_api.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-# default test
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
+# This test is buggy
+# py_test(test_understand_sentiment_dynamic_rnn SRCS
+# 	test_understand_sentiment_dynamic_rnn.py SERIAL)
+LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn_new_api)
 
-add_subdirectory(fit_a_line)
-add_subdirectory(recognize_digits)
-add_subdirectory(image_classification)
-add_subdirectory(understand_sentiment)
-add_subdirectory(label_semantic_roles)
-add_subdirectory(word2vec)
-add_subdirectory(recommender_system)
-add_subdirectory(machine_translation)
+if(NOT APPLE)
+    # default test
+    foreach(src ${TEST_OPS})
+        py_test(${src} SRCS ${src}.py)
+    endforeach()
+else()
+    foreach(src ${TEST_OPS})
+        if(${src} STREQUAL "test_image_classification_vgg_new_api")
+            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
+        elseif(${src} STREQUAL "test_image_classification_resnet_new_api")
+            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
+        elseif(${src} STREQUAL "test_recognize_digits_conv_new_api")
+            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
+        elseif(${src} STREQUAL "test_recognize_digits_mlp_new_api")
+            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
+        elseif()
+            py_test(${src} SRCS ${src}.py)
+        endif()
+    endforeach()
+endif()
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py b/python/paddle/fluid/tests/book/high-level-api/cifar10_small_test_set.py
similarity index 87%
rename from python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
rename to python/paddle/fluid/tests/book/high-level-api/cifar10_small_test_set.py
index 48c0f3d3611547308b5d4460748d3aab765f5805..6f24ec45aa6f27814e489b8dce49fe69f62d4f10 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
+++ b/python/paddle/fluid/tests/book/high-level-api/cifar10_small_test_set.py
@@ -88,3 +88,19 @@ def train10(batch_size=None):
         paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
         'data_batch',
         batch_size=batch_size)
+
+
+def test10(batch_size=None):
+    """
+    CIFAR-10 test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'test_batch',
+        batch_size=batch_size)
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt
deleted file mode 100644
index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-# default test
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
deleted file mode 100644
index 91c1d17eb5391ea37a41a886594cc71c6e6c56bd..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-if(NOT APPLE)
-    # default test
-    foreach(src ${TEST_OPS})
-        py_test(${src} SRCS ${src}.py)
-    endforeach()
-else()
-    foreach(src ${TEST_OPS})
-        if(${src} STREQUAL "test_image_classification_vgg")
-            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
-        elseif(${src} STREQUAL "test_image_classification_resnet")
-            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
-        elseif()
-            py_test(${src} SRCS ${src}.py)
-        endif()
-    endforeach()
-endif()
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt
deleted file mode 100644
index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-# default test
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt
deleted file mode 100644
index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-# default test
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
deleted file mode 100644
index f9c6d60540fcb6f8a73fdc4e68471448e16cbdc2..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-# default test
-if(NOT APPLE)
-    foreach(src ${TEST_OPS})
-        py_test(${src} SRCS ${src}.py)
-    endforeach()
-else()
-    foreach(src ${TEST_OPS})
-        if(${src} STREQUAL "test_recognize_digits_conv")
-            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
-        elseif(${src} STREQUAL "test_recognize_digits_mlp")
-            message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src})
-        else()
-            py_test(${src} SRCS ${src}.py)
-        endif()
-    endforeach()
-endif()
diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt
deleted file mode 100644
index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-# default test
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/test_fit_a_line_new_api.py
similarity index 100%
rename from python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
rename to python/paddle/fluid/tests/book/high-level-api/test_fit_a_line_new_api.py
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/test_image_classification_resnet_new_api.py
similarity index 100%
rename from python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
rename to python/paddle/fluid/tests/book/high-level-api/test_image_classification_resnet_new_api.py
diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/test_image_classification_vgg_new_api.py
similarity index 96%
rename from python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
rename to python/paddle/fluid/tests/book/high-level-api/test_image_classification_vgg_new_api.py
index 82294d4b26fe64e6cddc81f9ba3480caf5b51620..0a27aa0fcfece36f1a8ae5ad0477d75a15fd88da 100644
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/test_image_classification_vgg_new_api.py
@@ -89,9 +89,11 @@ def train(use_cuda, train_program, parallel, params_dirname):
             cifar10_small_test_set.train10(batch_size=10), buf_size=128 * 10),
         batch_size=BATCH_SIZE,
         drop_last=False)
-
+    # Use only part of the test set data validation program
     test_reader = paddle.batch(
-        paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False)
+        cifar10_small_test_set.test10(BATCH_SIZE),
+        batch_size=BATCH_SIZE,
+        drop_last=False)
 
     def event_handler(event):
         if isinstance(event, EndStepEvent):
diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/test_label_semantic_roles_new_api.py
similarity index 100%
rename from python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
rename to python/paddle/fluid/tests/book/high-level-api/test_label_semantic_roles_new_api.py
diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/test_machine_translation_new_api.py
similarity index 100%
rename from python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
rename to python/paddle/fluid/tests/book/high-level-api/test_machine_translation_new_api.py
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_conv_new_api.py
similarity index 100%
rename from python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
rename to python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_conv_new_api.py
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_mlp_new_api.py
similarity index 100%
rename from python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
rename to python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_mlp_new_api.py
diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/test_recommender_system_new_api.py
similarity index 100%
rename from python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
rename to python/paddle/fluid/tests/book/high-level-api/test_recommender_system_new_api.py
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_conv_new_api.py
similarity index 100%
rename from python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
rename to python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_conv_new_api.py
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_dynamic_rnn_new_api.py
similarity index 100%
rename from python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
rename to python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_dynamic_rnn_new_api.py
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_stacked_lstm_new_api.py
similarity index 100%
rename from python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
rename to python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_stacked_lstm_new_api.py
diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_word2vec_new_api.py
similarity index 100%
rename from python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
rename to python/paddle/fluid/tests/book/high-level-api/test_word2vec_new_api.py
diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt
deleted file mode 100644
index d71147a85e77ea6dc5b6391aa169abd9b02a0aa1..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-# This test is buggy
-# py_test(test_understand_sentiment_dynamic_rnn SRCS
-# 	test_understand_sentiment_dynamic_rnn.py SERIAL)
-LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn)
-
-# default test
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt
deleted file mode 100644
index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
-string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
-
-# default test
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py
index 7d1b869cf5991dc5ef960ff4d72289979aae158a..e1c4c2eca08d4652ecda8e2579d342818c803f4a 100644
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
@@ -474,17 +474,17 @@ class TestYoloDetection(unittest.TestCase):
         program = Program()
         with program_guard(program):
             x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
-            gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32')
-            gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32')
-            gtscore = layers.data(name='gtscore', shape=[10], dtype='float32')
+            gt_box = layers.data(name='gt_box', shape=[10, 4], dtype='float32')
+            gt_label = layers.data(name='gt_label', shape=[10], dtype='int32')
+            gt_score = layers.data(name='gt_score', shape=[10], dtype='float32')
             loss = layers.yolov3_loss(
                 x,
-                gtbox,
-                gtlabel, [10, 13, 30, 13], [0, 1],
+                gt_box,
+                gt_label, [10, 13, 30, 13], [0, 1],
                 10,
                 0.7,
                 32,
-                gtscore=gtscore,
+                gt_score=gt_score,
                 use_label_smooth=False)
 
             self.assertIsNotNone(loss)
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 0291bc25ed3145a580582fa30a965fd76047daba..1d2db8187224ed5721a2e0c051315264d5d3ff3b 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -25,7 +25,6 @@ endif()
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
-list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
 
 list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test
@@ -74,7 +73,6 @@ list(REMOVE_ITEM TEST_OPS test_dgc_op)
 list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl)
 list(REMOVE_ITEM TEST_OPS test_dist_transformer)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
-list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
 list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op)
 list(REMOVE_ITEM TEST_OPS test_nearest_interp_op)
 list(REMOVE_ITEM TEST_OPS test_imperative_resnet)
@@ -90,22 +88,24 @@ py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=$
 py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op SERIAL)
 py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op SERIAL)
 py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS
-  FLAGS_cudnn_deterministic=1)
+    FLAGS_cudnn_deterministic=1 SERIAL)
 py_test_modules(test_imperative_mnist MODULES test_imperative_mnist ENVS
-  FLAGS_cudnn_deterministic=1)
+    FLAGS_cudnn_deterministic=1 SERIAL)
 py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS
-  FLAGS_cudnn_deterministic=1 SERIAL)
+    FLAGS_cudnn_deterministic=1 SERIAL)
 
 if(WITH_DISTRIBUTE)
     py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
     set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
+    if(WITH_DGC)
+        py_test_modules(test_dgc_op MODULES test_dgc_op)
+    endif()
     if(NOT APPLE)
         set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200)
         set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200)
         py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext)
-        py_test_modules(test_dgc_op MODULES test_dgc_op)
         set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000)
-        py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl)
+        py_test_modules(test_dist_se_resnext_nccl MODULES test_dist_se_resnext_nccl SERIAL)
         set_tests_properties(test_dist_se_resnext_nccl PROPERTIES TIMEOUT 1000)
         # FIXME(typhoonzero): add these tests back
         # py_test_modules(test_dist_transformer MODULES test_dist_transformer)
@@ -118,16 +118,13 @@ endif()
 py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SERIAL)
 py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL)
 set_tests_properties(test_parallel_executor_fetch_feed PROPERTIES TIMEOUT 450)
+set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 740)
 py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executor_transformer SERIAL)
 py_test_modules(test_layers MODULES test_layers ENVS FLAGS_cudnn_deterministic=1)
 if(NOT WIN32)
     py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL)
 endif()
 
-if(NOT APPLE)
-    py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
-endif()
-
 if(CMAKE_BUILD_TYPE STREQUAL "Debug")
     # change the timeout from 600 to 2200, because in debug mode, this test need more time.
     set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 2200)
diff --git a/python/paddle/fluid/tests/unittests/fake_reader.py b/python/paddle/fluid/tests/unittests/fake_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..34a256e15dd2f3a8a83aaba4e178efe52c8d8547
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/fake_reader.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import six
+
+
+def fake_imdb_reader(word_dict_size,
+                     sample_num,
+                     lower_seq_len=100,
+                     upper_seq_len=200,
+                     class_dim=2):
+    def __reader__():
+        for _ in six.moves.range(sample_num):
+            length = np.random.random_integers(
+                low=lower_seq_len, high=upper_seq_len, size=[1])[0]
+            ids = np.random.random_integers(
+                low=0, high=word_dict_size - 1, size=[length]).astype('int64')
+            label = np.random.random_integers(
+                low=0, high=class_dim - 1, size=[1]).astype('int64')[0]
+            yield ids, label
+
+    return __reader__
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
index 738715dd70181988028adff1c50be3a52199c312..34837d8a638490a0d66414fa453703250216f4db 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py
@@ -21,6 +21,9 @@ from paddle.fluid.op import Operator
 from paddle.fluid.tests.unittests.test_elementwise_mul_op import *
 
 
+# TODO(LeoZhao-Intel): re-enable this case
+# https://github.com/PaddlePaddle/Paddle/issues/16764
+@unittest.skip("Not supported well on avx2.")
 class TestElementwiseMulMKLDNNOp_BroadcastNCHW16c(ElementwiseMulOp):
     def init_input_output(self):
         x = np.random.rand(1, 16, 2, 2).astype(self.dtype)
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 18ed02a72275437fa6106e57c0383e17647d9700..723aafb171271ed248c93665a21089029a30a836 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -29,7 +29,8 @@ __all__ = ['TestParallelExecutorBase']
 
 
 class TestParallelExecutorBase(unittest.TestCase):
-    def check_network_convergence(self,
+    @classmethod
+    def check_network_convergence(cls,
                                   method,
                                   use_cuda=True,
                                   memory_opt=True,
diff --git a/python/paddle/fluid/tests/unittests/simple_nets.py b/python/paddle/fluid/tests/unittests/simple_nets.py
new file mode 100644
index 0000000000000000000000000000000000000000..20ec6c34c3d5fd4d62e5ffed3bdfe4734f9587ca
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/simple_nets.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.fluid as fluid
+import numpy as np
+
+
+def simple_fc_net(use_feed=None):
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    hidden = img
+    for _ in range(4):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='relu',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def fc_with_batchnorm(use_feed=None):
+    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    hidden = img
+    for _ in range(2):
+        hidden = fluid.layers.fc(
+            hidden,
+            size=200,
+            act='relu',
+            bias_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.Constant(value=1.0)))
+
+        hidden = fluid.layers.batch_norm(input=hidden)
+
+    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    loss = fluid.layers.mean(loss)
+    return loss
+
+
+def init_data(batch_size=32, img_shape=[784], label_range=9):
+    np.random.seed(5)
+    assert isinstance(img_shape, list)
+    input_shape = [batch_size] + img_shape
+    img = np.random.random(size=input_shape).astype(np.float32)
+    label = np.array(
+        [np.random.randint(0, label_range) for _ in range(batch_size)]).reshape(
+            (-1, 1)).astype("int64")
+    return img, label
diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py
index d587715d607c6da16da5c009db16322e8cd7d176..4d66b7a989732e37c48c73b9617943874ad07bba 100644
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
@@ -192,6 +192,23 @@ class TestSqrt(TestActivation):
         self.check_grad(['X'], 'Out', max_relative_error=0.007)
 
 
+class TestRsqrt(TestActivation):
+    def setUp(self):
+        self.op_type = "rsqrt"
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [2, 3]).astype(self.dtype)
+        out = 1.0 / np.sqrt(x)
+
+        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
+        self.outputs = {'Out': out}
+
+    def test_check_grad(self):
+        if self.dtype == np.float16:
+            return
+        self.check_grad(['X'], 'Out', max_relative_error=0.0005)
+
+
 class TestAbs(TestActivation):
     def setUp(self):
         self.op_type = "abs"
diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py
index 9cb88d4a8553f3b750f6cf3b24115b4d188ed1d6..04a36f7cafe7b4445125c4e9bd58f6d30d6c71aa 100644
--- a/python/paddle/fluid/tests/unittests/test_base_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_base_layer.py
@@ -18,7 +18,7 @@ import numpy as np
 import paddle.fluid as fluid
 
 
-class L1(fluid.dygraph.Layer):
+class L1(fluid.Layer):
     def __init__(self, prefix):
         super(L1, self).__init__(prefix)
         self._param_attr = fluid.ParamAttr(
@@ -32,7 +32,7 @@ class L1(fluid.dygraph.Layer):
         return self.w1 + self.w2
 
 
-class L2(fluid.dygraph.Layer):
+class L2(fluid.Layer):
     def __init__(self, prefix):
         super(L2, self).__init__(prefix)
         self.layer1 = L1(self.full_name())
@@ -42,7 +42,7 @@ class L2(fluid.dygraph.Layer):
         return self.layer1() + self.layer2()
 
 
-class L3(fluid.dygraph.Layer):
+class L3(fluid.Layer):
     def __init__(self, prefix):
         super(L3, self).__init__(prefix)
         self.layer1 = L2(self.full_name())
@@ -59,7 +59,7 @@ class TestBaseLayer(unittest.TestCase):
             ret = l()
             self.assertEqual(l.w1.name, "test_one_level/L1_0.w_0")
             self.assertEqual(l.w2.name, "test_one_level/L1_0.w_1")
-            self.assertTrue(np.allclose(ret._numpy(), 0.2 * np.ones([2, 2])))
+            self.assertTrue(np.allclose(ret.numpy(), 0.2 * np.ones([2, 2])))
 
     def test_three_level(self):
         with fluid.dygraph.guard():
@@ -72,7 +72,7 @@ class TestBaseLayer(unittest.TestCase):
             self.assertEqual(names[3], "test_three_level/L3_0/L2_0/L1_1.w_1")
             self.assertEqual(names[4], "test_three_level/L3_0/L2_1/L1_0.w_0")
             self.assertEqual(names[5], "test_three_level/L3_0/L2_1/L1_0.w_1")
-            self.assertTrue(np.allclose(ret._numpy(), 0.8 * np.ones([2, 2])))
+            self.assertTrue(np.allclose(ret.numpy(), 0.8 * np.ones([2, 2])))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
index f60ed1d79ae5778f751d6101fde386ae3a90c0f7..963a17e7d697512e871a97ef24cb1c4ba37a7547 100644
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
@@ -91,17 +91,26 @@ class TestBilinearInterpOp(OpTest):
         self.op_type = "bilinear_interp"
         input_np = np.random.random(self.input_shape).astype("float32")
 
-        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
-                                       self.out_size, self.actual_shape,
-                                       self.align_corners, self.align_mode)
+        if self.scale > 0:
+            out_h = int(self.input_shape[2] * self.scale)
+            out_w = int(self.input_shape[3] * self.scale)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size,
+                                       self.actual_shape, self.align_corners,
+                                       self.align_mode)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
         if self.actual_shape is not None:
             self.inputs['OutSize'] = self.actual_shape
+
         self.attrs = {
             'out_h': self.out_h,
             'out_w': self.out_w,
+            'scale': self.scale,
             'interp_method': self.interp_method,
             'align_corners': self.align_corners,
             'align_mode': self.align_mode
@@ -119,6 +128,7 @@ class TestBilinearInterpOp(OpTest):
         self.input_shape = [2, 3, 4, 4]
         self.out_h = 2
         self.out_w = 2
+        self.scale = 0.
         self.out_size = np.array([3, 3]).astype("int32")
         self.align_corners = True
         self.align_mode = 1
@@ -130,6 +140,7 @@ class TestBilinearInterpCase1(TestBilinearInterpOp):
         self.input_shape = [4, 1, 7, 8]
         self.out_h = 1
         self.out_w = 1
+        self.scale = 0.
         self.align_corners = True
         self.align_mode = 1
 
@@ -140,6 +151,7 @@ class TestBilinearInterpCase2(TestBilinearInterpOp):
         self.input_shape = [3, 3, 9, 6]
         self.out_h = 12
         self.out_w = 12
+        self.scale = 0.
         self.align_corners = True
         self.align_mode = 1
 
@@ -150,6 +162,7 @@ class TestBilinearInterpCase3(TestBilinearInterpOp):
         self.input_shape = [1, 1, 128, 64]
         self.out_h = 64
         self.out_w = 128
+        self.scale = 0.
         self.align_corners = True
         self.align_mode = 1
 
@@ -160,6 +173,7 @@ class TestBilinearInterpCase4(TestBilinearInterpOp):
         self.input_shape = [4, 1, 7, 8]
         self.out_h = 1
         self.out_w = 1
+        self.scale = 0.
         self.out_size = np.array([2, 2]).astype("int32")
         self.align_corners = True
         self.align_mode = 1
@@ -171,6 +185,7 @@ class TestBilinearInterpCase5(TestBilinearInterpOp):
         self.input_shape = [3, 3, 9, 6]
         self.out_h = 12
         self.out_w = 12
+        self.scale = 0.
         self.out_size = np.array([11, 11]).astype("int32")
         self.align_corners = True
         self.align_mode = 1
@@ -182,6 +197,7 @@ class TestBilinearInterpCase6(TestBilinearInterpOp):
         self.input_shape = [1, 1, 128, 64]
         self.out_h = 64
         self.out_w = 128
+        self.scale = 0.
         self.out_size = np.array([65, 129]).astype("int32")
         self.align_corners = True
         self.align_mode = 1
@@ -193,6 +209,7 @@ class TestBilinearInterpActualShape(TestBilinearInterpOp):
         self.input_shape = [3, 2, 32, 16]
         self.out_h = 64
         self.out_w = 32
+        self.scale = 0.
         self.out_size = np.array([66, 40]).astype("int32")
         self.align_corners = True
         self.align_mode = 1
@@ -206,15 +223,25 @@ class TestBilinearInterpOpUint8(OpTest):
         self.op_type = "bilinear_interp"
         input_np = np.random.randint(
             low=0, high=256, size=self.input_shape).astype("uint8")
-        output_np = bilinear_interp_np(input_np, self.out_h, self.out_w,
-                                       self.out_size, self.actual_shape,
-                                       self.align_corners, self.align_mode)
+
+        if self.scale > 0:
+            out_h = int(self.input_shape[2] * self.scale)
+            out_w = int(self.input_shape[3] * self.scale)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = bilinear_interp_np(input_np, out_h, out_w, self.out_size,
+                                       self.actual_shape, self.align_corners,
+                                       self.align_mode)
         self.inputs = {'X': input_np}
         if self.out_size is not None:
             self.inputs['OutSize'] = self.out_size
+
         self.attrs = {
             'out_h': self.out_h,
             'out_w': self.out_w,
+            'scale': self.scale,
             'interp_method': self.interp_method,
             'align_corners': self.align_corners,
             'align_mode': self.align_mode
@@ -229,6 +256,7 @@ class TestBilinearInterpOpUint8(OpTest):
         self.input_shape = [1, 3, 9, 6]
         self.out_h = 10
         self.out_w = 9
+        self.scale = 0.
         self.align_corners = True
         self.align_mode = 1
 
@@ -239,6 +267,7 @@ class TestBilinearInterpCase1Uint8(TestBilinearInterpOpUint8):
         self.input_shape = [2, 3, 128, 64]
         self.out_h = 120
         self.out_w = 50
+        self.scale = 0.
         self.align_corners = True
         self.align_mode = 1
 
@@ -249,6 +278,7 @@ class TestBilinearInterpCase2Uint8(TestBilinearInterpOpUint8):
         self.input_shape = [4, 1, 7, 8]
         self.out_h = 5
         self.out_w = 13
+        self.scale = 0.
         self.out_size = np.array([6, 15]).astype("int32")
         self.align_corners = True
         self.align_mode = 1
@@ -272,5 +302,38 @@ class TestBilinearInterpWithMethod3(TestBilinearInterpOp):
         self.align_mode = 0
 
 
+class TestBilinearInterpScale1(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 16, 32]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 2.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpScale2(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 16, 32]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 1.
+        self.align_corners = True
+        self.align_mode = 1
+
+
+class TestBilinearInterpScale3(TestBilinearInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'bilinear'
+        self.input_shape = [2, 3, 16, 32]
+        self.out_h = 60
+        self.out_w = 25
+        self.scale = 1.5
+        self.align_corners = True
+        self.align_mode = 1
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cvm_op.py b/python/paddle/fluid/tests/unittests/test_cvm_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..67c310bd2f1155e4c5492e90a96cbdac9e8a3481
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_cvm_op.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from math import log
+from math import exp
+from op_test import OpTest
+import unittest
+
+
+class TestCVMOp(OpTest):
+    """
+        Test cvm op with discrete one-hot labels.
+    """
+
+    def setUp(self):
+        self.op_type = "cvm"
+        batch_size = 4
+        dims = 11
+        lod = [[1]]
+        self.inputs = {
+            'X': (np.random.uniform(0, 1, [1, dims]).astype("float32"), lod),
+            'CVM': np.array([[0.6, 0.4]]).astype("float32"),
+        }
+        self.attrs = {'use_cvm': False}
+        out = []
+        for index, emb in enumerate(self.inputs["X"][0]):
+            out.append(emb[2:])
+        self.outputs = {'Y': (np.array(out), lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py
index 8c705a095c768c861aac07249467cf75bb289b2d..4cfd99150562438d9ca64a2b0db215915e682d34 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset.py
@@ -29,7 +29,6 @@ class TestDataset(unittest.TestCase):
 
     def test_dataset_create(self):
         """ Testcase for dataset create. """
-        return
         try:
             dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
         except:
@@ -48,7 +47,6 @@ class TestDataset(unittest.TestCase):
 
     def test_dataset_config(self):
         """ Testcase for dataset configuration. """
-        return
         dataset = fluid.core.Dataset("MultiSlotDataset")
         dataset.set_thread_num(12)
         dataset.set_filelist(["a.txt", "b.txt", "c.txt"])
@@ -75,7 +73,6 @@ class TestDataset(unittest.TestCase):
         """
         Testcase for InMemoryDataset from create to run.
         """
-        return
         with open("test_in_memory_dataset_run_a.txt", "w") as f:
             data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
             data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
@@ -112,9 +109,10 @@ class TestDataset(unittest.TestCase):
         for i in range(2):
             try:
                 exe.train_from_dataset(fluid.default_main_program(), dataset)
-            except:
-                #self.assertTrue(False)
+            except ImportError as e:
                 pass
+            except Exception as e:
+                self.assertTrue(False)
 
         os.remove("./test_in_memory_dataset_run_a.txt")
         os.remove("./test_in_memory_dataset_run_b.txt")
@@ -123,7 +121,6 @@ class TestDataset(unittest.TestCase):
         """
         Testcase for QueueDataset from create to run.
         """
-        return
         with open("test_queue_dataset_run_a.txt", "w") as f:
             data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
             data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
@@ -156,15 +153,14 @@ class TestDataset(unittest.TestCase):
         for i in range(2):
             try:
                 exe.train_from_dataset(fluid.default_main_program(), dataset)
-            except:
-                #self.assertTrue(False)
+            except ImportError as e:
                 pass
+            except Exception as e:
+                self.assertTrue(False)
 
         os.remove("./test_queue_dataset_run_a.txt")
         os.remove("./test_queue_dataset_run_b.txt")
 
 
 if __name__ == '__main__':
-    #unittest.main()
-    import sys
-    sys.exit(0)
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
index 377014510b55633f697ef7bf2f5f597281e5f5a5..0fbf0d42f5dcc34947235d9bd1db6f8b1c07d59a 100644
--- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py
@@ -19,7 +19,7 @@ import time
 import six
 import unittest
 
-EPOCH_NUM = 60
+EPOCH_NUM = 20
 BATCH_SIZE = 32
 CLASS_NUM = 10
 
diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b8fdcc887beb4879b2ce1101184dabe6f819acf
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py
@@ -0,0 +1,201 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import contextlib
+import unittest
+import numpy as np
+import six
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+from paddle.fluid.optimizer import SGDOptimizer
+from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+from paddle.fluid.dygraph.base import to_variable
+from test_imperative_base import new_program_scope
+
+
+class SimpleImgConvPool(fluid.dygraph.Layer):
+    def __init__(self,
+                 name_scope,
+                 num_channels,
+                 num_filters,
+                 filter_size,
+                 pool_size,
+                 pool_stride,
+                 pool_padding=0,
+                 pool_type='max',
+                 global_pooling=False,
+                 conv_stride=1,
+                 conv_padding=0,
+                 conv_dilation=1,
+                 conv_groups=1,
+                 act=None,
+                 use_cudnn=False,
+                 param_attr=None,
+                 bias_attr=None):
+        super(SimpleImgConvPool, self).__init__(name_scope)
+
+        self._conv2d = Conv2D(
+            self.full_name(),
+            num_channels=num_channels,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=conv_stride,
+            padding=conv_padding,
+            dilation=conv_dilation,
+            groups=conv_groups,
+            param_attr=None,
+            bias_attr=None,
+            use_cudnn=use_cudnn)
+
+        self._pool2d = Pool2D(
+            self.full_name(),
+            pool_size=pool_size,
+            pool_type=pool_type,
+            pool_stride=pool_stride,
+            pool_padding=pool_padding,
+            global_pooling=global_pooling,
+            use_cudnn=use_cudnn)
+
+    def forward(self, inputs):
+        x = self._conv2d(inputs)
+        x = self._pool2d(x)
+        return x
+
+
+class MNIST(fluid.dygraph.Layer):
+    def __init__(self, name_scope):
+        super(MNIST, self).__init__(name_scope)
+
+        self._simple_img_conv_pool_1 = SimpleImgConvPool(
+            self.full_name(), 1, 20, 5, 2, 2, act="relu")
+
+        self._simple_img_conv_pool_2 = SimpleImgConvPool(
+            self.full_name(), 20, 50, 5, 2, 2, act="relu")
+
+        pool_2_shape = 50 * 4 * 4
+        SIZE = 10
+        scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
+        self._fc = FC(self.full_name(),
+                      10,
+                      param_attr=fluid.param_attr.ParamAttr(
+                          initializer=fluid.initializer.NormalInitializer(
+                              loc=0.0, scale=scale)),
+                      act="softmax")
+
+    def forward(self, inputs):
+        x = self._simple_img_conv_pool_1(inputs)
+        x = self._simple_img_conv_pool_2(x)
+        x = self._fc(x)
+        return x
+
+
+class TestDygraphMultiForward(unittest.TestCase):
+    def test_mnist_forward_float32(self):
+        seed = 90
+        epoch_num = 1
+        with fluid.dygraph.guard():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            mnist = MNIST("mnist")
+            sgd = SGDOptimizer(learning_rate=1e-3)
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+
+            dy_param_init_value = {}
+            mnist.eval()
+            for epoch in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+                    dy_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(128, 1)
+
+                    img = to_variable(dy_x_data)
+                    label = to_variable(y_data)
+                    label.stop_gradient = True
+
+                    cost = mnist(img)
+                    loss = fluid.layers.cross_entropy(cost, label)
+                    avg_loss = fluid.layers.mean(loss)
+
+                    dy_out = avg_loss.numpy()
+
+                    if epoch == 0 and batch_id == 0:
+                        for param in mnist.parameters():
+                            dy_param_init_value[param.name] = param.numpy()
+
+        with new_program_scope():
+            fluid.default_startup_program().random_seed = seed
+            fluid.default_main_program().random_seed = seed
+
+            exe = fluid.Executor(fluid.CPUPlace(
+            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
+
+            mnist = MNIST("mnist")
+            sgd = SGDOptimizer(learning_rate=1e-3)
+            train_reader = paddle.batch(
+                paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
+
+            img = fluid.layers.data(
+                name='pixel', shape=[1, 28, 28], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            cost = mnist(img)
+            loss = fluid.layers.cross_entropy(cost, label)
+            avg_loss = fluid.layers.mean(loss)
+
+            # initialize params and fetch them
+            static_param_init_value = {}
+            static_param_name_list = []
+            for param in mnist.parameters():
+                static_param_name_list.append(param.name)
+
+            out = exe.run(fluid.default_startup_program(),
+                          fetch_list=static_param_name_list)
+
+            for i in range(len(static_param_name_list)):
+                static_param_init_value[static_param_name_list[i]] = out[i]
+
+            for epoch in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+                    static_x_data = np.array(
+                        [x[0].reshape(1, 28, 28)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape([128, 1])
+
+                    fetch_list = [avg_loss.name]
+                    out = exe.run(
+                        fluid.default_main_program(),
+                        feed={"pixel": static_x_data,
+                              "label": y_data},
+                        fetch_list=fetch_list)
+
+                    static_out = out[0]
+
+        self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
+
+        for key, value in six.iteritems(static_param_init_value):
+            self.assertTrue(np.allclose(value, dy_param_init_value[key]))
+
+        self.assertTrue(np.allclose(static_out, dy_out))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
index 48fb93ec529bee32b9652a89ba7da3dc77f7853a..4b0195d307dc83f77ff04e89544d7bc751b8c011 100644
--- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
@@ -24,15 +24,15 @@ from paddle.fluid.layers.control_flow import max_sequence_len
 from paddle.fluid.layers.control_flow import lod_tensor_to_array
 from paddle.fluid.layers.control_flow import array_to_lod_tensor
 from paddle.fluid.layers.control_flow import shrink_memory
+from fake_reader import fake_imdb_reader
 
 
 class TestDynRNN(unittest.TestCase):
     def setUp(self):
-        self.word_dict = paddle.dataset.imdb.word_dict()
+        self.word_dict_len = 5147
         self.BATCH_SIZE = 2
-        self.train_data = paddle.batch(
-            paddle.dataset.imdb.train(self.word_dict),
-            batch_size=self.BATCH_SIZE)
+        reader = fake_imdb_reader(self.word_dict_len, self.BATCH_SIZE * 100)
+        self.train_data = paddle.batch(reader, batch_size=self.BATCH_SIZE)
 
     def test_plain_while_op(self):
         main_program = fluid.Program()
@@ -42,7 +42,7 @@ class TestDynRNN(unittest.TestCase):
             sentence = fluid.layers.data(
                 name='word', shape=[1], dtype='int64', lod_level=1)
             sent_emb = fluid.layers.embedding(
-                input=sentence, size=[len(self.word_dict), 32], dtype='float32')
+                input=sentence, size=[self.word_dict_len, 32], dtype='float32')
 
             label = fluid.layers.data(name='label', shape=[1], dtype='float32')
 
@@ -109,7 +109,7 @@ class TestDynRNN(unittest.TestCase):
             sentence = fluid.layers.data(
                 name='word', shape=[1], dtype='int64', lod_level=1)
             sent_emb = fluid.layers.embedding(
-                input=sentence, size=[len(self.word_dict), 32], dtype='float32')
+                input=sentence, size=[self.word_dict_len, 32], dtype='float32')
 
             rnn = fluid.layers.DynamicRNN()
 
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
index d4c043d9c76f21482f17b9bb20c4fde5ce7cc6e7..eb3832ca9ffb7ac9b4261de1036c85c93c6d0a81 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py
@@ -22,6 +22,8 @@ import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
 from paddle.fluid import compiler
+import numpy as np
+from fake_reader import fake_imdb_reader
 
 
 def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
@@ -35,16 +37,16 @@ def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2):
         )
         return
 
-    word_dict = paddle.dataset.imdb.word_dict()
-    train_reader = paddle.batch(
-        paddle.dataset.imdb.train(word_dict), batch_size=batch_size)
+    word_dict_size = 5147
+    reader = fake_imdb_reader(word_dict_size, batch_size * 40)
+    train_reader = paddle.batch(reader, batch_size=batch_size)
 
     data = fluid.layers.data(
         name="words", shape=[1], dtype="int64", lod_level=1)
 
     label = fluid.layers.data(name="label", shape=[1], dtype="int64")
 
-    cost = network(data, label, len(word_dict))
+    cost = network(data, label, word_dict_size)
     cost.persistable = True
     optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
     optimizer.minimize(cost)
diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py
index a84ff1fd6d46c30ad7aa72f1b29a8ae668b90e20..3fd582e4d5cb7cec1db0719160a4a795a30e54f1 100644
--- a/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py
+++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_no_need_buffer_vars_inference.py
@@ -18,20 +18,21 @@ import importlib
 
 fluid.core._set_eager_deletion_mode(0.0, 1.0, True)
 
+from test_bilinear_interp_op import *
+from test_concat_op import *
 from test_elementwise_add_op import *
 from test_elementwise_sub_op import *
-from test_concat_op import *
+from test_fill_constant_batch_size_like_op import *
+from test_fill_zeros_like2_op import *
 from test_gather_op import *
 from test_gaussian_random_batch_size_like_op import *
-from test_uniform_random_batch_size_like_op import *
-from test_fill_constant_batch_size_like_op import *
+from test_linear_chain_crf_op import *
 from test_lod_reset_op import *
-from test_scatter_op import *
+from test_lookup_table_op import *
 from test_mean_op import *
-from test_slice_op import *
-from test_linear_chain_crf_op import *
-from test_bilinear_interp_op import *
 from test_nearest_interp_op import *
+from test_pad2d_op import *
+from test_scatter_op import *
 from test_sequence_concat import *
 from test_seq_conv import *
 from test_seq_pool import *
@@ -41,8 +42,10 @@ from test_sequence_pad_op import *
 from test_sequence_unpad_op import *
 from test_sequence_scatter_op import *
 from test_sequence_slice_op import *
-from test_pad2d_op import *
-from test_fill_zeros_like2_op import *
+from test_slice_op import *
+from test_space_to_depth_op import *
+from test_squared_l2_distance_op import *
+from test_uniform_random_batch_size_like_op import *
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
index ca8669bbc6f3ea7b3f3340793712a221b0bf8c6a..0990045a8fd8775b90ddb6569c5c269ff57d6e38 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
 from parallel_executor_test_base import TestParallelExecutorBase
 import paddle.fluid as fluid
 import paddle.fluid.core as core
@@ -22,45 +22,6 @@ import unittest
 import os
 
 
-def simple_fc_net(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    hidden = img
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def fc_with_batchnorm(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    hidden = img
-    for _ in range(2):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-
-        hidden = fluid.layers.batch_norm(input=hidden)
-
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
 class TestMNIST(TestParallelExecutorBase):
     @classmethod
     def setUpClass(cls):
@@ -75,10 +36,10 @@ class TestMNIST(TestParallelExecutorBase):
         label = np.ones(shape=[32, 1], dtype='int64')
         return img, label
 
-    def _compare_fuse_all_reduce_ops(self, model, use_cuda, random_data=True):
+    def _compare_fuse_all_reduce_ops(self, model, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
-        img, label = self._init_data(random_data)
+        img, label = init_data()
 
         def _optimizer(learning_rate=1e-6):
             optimizer = fluid.optimizer.SGD(
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
index 763dfa2160d22c2d89cce834a839b5e2b5eaff55..552f94e769e5a8764dd8426d130fd879dc718b20 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
@@ -12,108 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
 from parallel_executor_test_base import TestParallelExecutorBase
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-import numpy as np
-import paddle
-import paddle.dataset.mnist as mnist
 import unittest
 import os
 
-MNIST_RECORDIO_FILE = "./mnist_test_pe.recordio"
-
-
-def simple_fc_net(use_feed):
-    if use_feed:
-        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    else:
-        reader = fluid.layers.open_files(
-            filenames=[MNIST_RECORDIO_FILE],
-            shapes=[[-1, 784], [-1, 1]],
-            lod_levels=[0, 0],
-            dtypes=['float32', 'int64'])
-        reader = fluid.layers.io.double_buffer(reader)
-        img, label = fluid.layers.read_file(reader)
-    hidden = img
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def fc_with_batchnorm(use_feed):
-    if use_feed:
-        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    else:
-        reader = fluid.layers.open_files(
-            filenames=[MNIST_RECORDIO_FILE],
-            shapes=[[-1, 784], [-1, 1]],
-            lod_levels=[0, 0],
-            dtypes=['float32', 'int64'])
-        reader = fluid.layers.io.double_buffer(reader)
-        img, label = fluid.layers.read_file(reader)
-
-    hidden = img
-    for _ in range(2):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-
-        hidden = fluid.layers.batch_norm(input=hidden)
-
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
 
 class TestMNIST(TestParallelExecutorBase):
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
-        # Convert mnist to recordio file
-        with fluid.program_guard(fluid.Program(), fluid.Program()):
-            reader = paddle.batch(mnist.train(), batch_size=4)
-            feeder = fluid.DataFeeder(
-                feed_list=[  # order is image and label
-                    fluid.layers.data(
-                        name='image', shape=[784]),
-                    fluid.layers.data(
-                        name='label', shape=[1], dtype='int64'),
-                ],
-                place=fluid.CPUPlace())
-            fluid.recordio_writer.convert_reader_to_recordio_file(
-                MNIST_RECORDIO_FILE, reader, feeder)
-
-    def _init_data(self, random=True):
-        np.random.seed(5)
-        if random:
-            img = np.random.random(size=[32, 784]).astype(np.float32)
-        else:
-            img = np.ones(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
-        return img, label
 
-    def _compare_fuse_elewise_add_act_ops(self,
-                                          model,
-                                          use_cuda,
-                                          random_data=True):
+    def _compare_fuse_elewise_add_act_ops(self, model, use_cuda):
         if use_cuda and not core.is_compiled_with_cuda():
             return
-        img, label = self._init_data(random_data)
+        img, label = init_data()
 
         def _optimizer(learning_rate=1e-6):
             optimizer = fluid.optimizer.SGD(
diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
index 93e67deaf3c9f7fe17296049137fbbe00374c6f1..510be19af406ba821ab8159abf071440ae3d1831 100644
--- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
@@ -11,78 +11,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
 from parallel_executor_test_base import TestParallelExecutorBase
 import paddle.fluid as fluid
 import paddle.fluid.core as core
-import numpy as np
-import paddle
-import paddle.dataset.mnist as mnist
 import unittest
 import os
 
 
-def simple_fc_net(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    hidden = img
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
-def fc_with_batchnorm(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-    hidden = img
-    for _ in range(2):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='relu',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-
-        hidden = fluid.layers.batch_norm(input=hidden)
-
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
 class TestFuseAdamOps(TestParallelExecutorBase):
     @classmethod
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
 
-    def _init_data(self, random=True):
-        np.random.seed(5)
-        if random:
-            img = np.random.random(size=[32, 784]).astype(np.float32)
-        else:
-            img = np.ones(shape=[32, 784], dtype='float32')
-        label = np.ones(shape=[32, 1], dtype='int64')
-        return img, label
-
     def _compare_fused_optimizer_ops(self,
                                      model,
                                      use_cuda,
-                                     random_data=True,
                                      optimizer=fluid.optimizer.Adam):
         if use_cuda and not core.is_compiled_with_cuda():
             return
-        img, label = self._init_data(random_data)
+        img, label = init_data()
         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
             model,
             feed_dict={"image": img,
@@ -111,7 +59,7 @@ class TestFuseAdamOps(TestParallelExecutorBase):
 
     def test_batchnorm_fc_with_fuse_op(self):
         self._compare_fused_optimizer_ops(fc_with_batchnorm, True)
-        # self._compare_fused_optimizer_ops(fc_with_batchnorm, False)
+        self._compare_fused_optimizer_ops(fc_with_batchnorm, False)
 
 
 class TestFuseSGDOps(TestFuseAdamOps):
diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
index e49239da6d3918211fbbc302d2c56818460b6d51..470187e6421173d1cb1213d06660331c164859c4 100644
--- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py
@@ -19,6 +19,8 @@ import numpy as np
 import paddle
 import paddle.fluid.core as core
 import paddle.fluid as fluid
+import six
+from fake_reader import fake_imdb_reader
 
 
 def bow_net(data,
@@ -48,11 +50,10 @@ def bow_net(data,
 
 class TestGradientClip(unittest.TestCase):
     def setUp(self):
-        self.word_dict = paddle.dataset.imdb.word_dict()
+        self.word_dict_len = 5147
         self.BATCH_SIZE = 2
-        self.train_data = paddle.batch(
-            paddle.dataset.imdb.train(self.word_dict),
-            batch_size=self.BATCH_SIZE)
+        reader = fake_imdb_reader(self.word_dict_len, self.BATCH_SIZE * 100)
+        self.train_data = paddle.batch(reader, batch_size=self.BATCH_SIZE)
 
     def get_places(self):
         places = [core.CPUPlace()]
@@ -131,7 +132,7 @@ class TestGradientClip(unittest.TestCase):
             data = fluid.layers.data(
                 name="words", shape=[1], dtype="int64", lod_level=1)
             label = fluid.layers.data(name="label", shape=[1], dtype="int64")
-            cost = bow_net(data, label, len(self.word_dict))
+            cost = bow_net(data, label, self.word_dict_len)
 
             fluid.clip.set_gradient_clip(
                 clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0))
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
index 8b3094cb2a3a633cf89e571e85e494ce7e887063..8404a57eb85a30edda6889150e588cab783be685 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py
@@ -18,11 +18,11 @@ import numpy as np
 
 import paddle.fluid as fluid
 from paddle.fluid import core
-from paddle.fluid.dygraph.nn import FC
+from paddle.fluid import FC
 from test_imperative_base import new_program_scope
 
 
-class MyLayer(fluid.dygraph.Layer):
+class MyLayer(fluid.Layer):
     def __init__(self, name_scope):
         super(MyLayer, self).__init__(name_scope)
 
@@ -34,7 +34,7 @@ class MyLayer(fluid.dygraph.Layer):
         return [x]
 
 
-class MyPyLayer(fluid.dygraph.PyLayer):
+class MyPyLayer(fluid.PyLayer):
     def __init__(self):
         super(MyPyLayer, self).__init__()
 
@@ -48,7 +48,7 @@ class MyPyLayer(fluid.dygraph.PyLayer):
         return np.array(dout) * (1 - np.square(np.array(out)))
 
 
-class MLP(fluid.dygraph.Layer):
+class MLP(fluid.Layer):
     def __init__(self, name_scope):
         super(MLP, self).__init__(name_scope)
         self._fc1 = FC(self.full_name(),
@@ -71,7 +71,7 @@ class MLP(fluid.dygraph.Layer):
         return x
 
 
-class SimpleRNNCell(fluid.dygraph.Layer):
+class SimpleRNNCell(fluid.Layer):
     def __init__(self, name_scope, step_input_size, hidden_size, output_size,
                  param_attr):
         super(SimpleRNNCell, self).__init__(name_scope)
@@ -81,7 +81,7 @@ class SimpleRNNCell(fluid.dygraph.Layer):
         self._dtype = core.VarDesc.VarType.FP32
         self.param_attr = param_attr
 
-    def _build_once(self, inputs, pre_hidden):
+    def build_once(self, inputs, pre_hidden):
         i2h_param_shape = [self.step_input_size, self.hidden_size]
         h2h_param_shape = [self.hidden_size, self.hidden_size]
         h2o_param_shape = [self.output_size, self.hidden_size]
@@ -159,7 +159,7 @@ class SimpleRNNCell(fluid.dygraph.Layer):
         return reduce_out, hidden
 
 
-class SimpleRNN(fluid.dygraph.Layer):
+class SimpleRNN(fluid.Layer):
     def __init__(self, name_scope):
         super(SimpleRNN, self).__init__(name_scope)
         self.seq_len = 4
@@ -200,22 +200,22 @@ class TestImperative(unittest.TestCase):
                 inputs.append(fluid.dygraph.base.to_variable(x))
             ret = fluid.layers.sums(inputs)
             loss = fluid.layers.reduce_sum(ret)
-            loss._backward()
-            self.assertTrue(np.allclose(ret._numpy(), x * 10))
-            self.assertTrue(np.allclose(inputs[0]._gradient(), x))
+            loss.backward()
+            self.assertTrue(np.allclose(ret.numpy(), x * 10))
+            self.assertTrue(np.allclose(inputs[0].gradient(), x))
 
     def test_layer(self):
         with fluid.dygraph.guard():
             cl = core.Layer()
             cl.forward([])
-            l = fluid.dygraph.Layer("l")
+            l = fluid.Layer("l")
             self.assertRaises(NotImplementedError, l.forward, [])
 
     def test_pylayer_func_id(self):
 
         with fluid.dygraph.guard():
 
-            class PyLayer1(fluid.dygraph.PyLayer):
+            class PyLayer1(fluid.PyLayer):
                 def __init__(self):
                     super(PyLayer1, self).__init__()
 
@@ -227,7 +227,7 @@ class TestImperative(unittest.TestCase):
                 def backward(input):
                     return input
 
-            class PyLayer2(fluid.dygraph.PyLayer):
+            class PyLayer2(fluid.PyLayer):
                 def __init__(self):
                     super(PyLayer2, self).__init__()
 
@@ -257,9 +257,9 @@ class TestImperative(unittest.TestCase):
             my_py_layer = MyPyLayer()
             var_inp = fluid.dygraph.base.to_variable(np_inp)
             outs = my_py_layer(var_inp)
-            dy_out = np.sum(outs[0]._numpy())
-            outs[0]._backward()
-            dy_grad = var_inp._gradient()
+            dy_out = np.sum(outs[0].numpy())
+            outs[0].backward()
+            dy_grad = var_inp.gradient()
 
         with new_program_scope():
             inp = fluid.layers.data(
@@ -287,9 +287,9 @@ class TestImperative(unittest.TestCase):
             l = MyLayer("my_layer")
             x = l(var_inp)[0]
             self.assertIsNotNone(x)
-            dy_out = x._numpy()
-            x._backward()
-            dy_grad = l._x_for_debug._gradient()
+            dy_out = x.numpy()
+            x.backward()
+            dy_grad = l._x_for_debug.gradient()
 
         with new_program_scope():
             inp = fluid.layers.data(
@@ -314,9 +314,9 @@ class TestImperative(unittest.TestCase):
             var_inp = fluid.dygraph.base.to_variable(np_inp)
             mlp = MLP("mlp")
             out = mlp(var_inp)
-            dy_out = out._numpy()
-            out._backward()
-            dy_grad = mlp._fc1._w._gradient()
+            dy_out = out.numpy()
+            out.backward()
+            dy_grad = mlp._fc1._w.gradient()
 
         with new_program_scope():
             inp = fluid.layers.data(
@@ -358,7 +358,7 @@ class TestImperative(unittest.TestCase):
                 x = fluid.layers.elementwise_add(inp1, inp2)
             else:
                 x = fluid.layers.elementwise_sub(inp1, inp2)
-            dygraph_result = x._numpy()
+            dygraph_result = x.numpy()
 
         # static graph
         with new_program_scope():
@@ -407,11 +407,11 @@ class TestImperative(unittest.TestCase):
             var_inp = fluid.layers.reshape(var_inp, shape=[1, 4, 3])
             simple_rnn = SimpleRNN("simple_rnn")
             outs, pre_hiddens = simple_rnn.forward(var_inp)
-            dy_out = outs[3]._numpy()
-            outs[3]._backward()
-            dy_grad_h2o = simple_rnn._cell._h2o_w._gradient()
-            dy_grad_h2h = simple_rnn._cell._h2h_w._gradient()
-            dy_grad_i2h = simple_rnn._cell._i2h_w._gradient()
+            dy_out = outs[3].numpy()
+            outs[3].backward()
+            dy_grad_h2o = simple_rnn._cell._h2o_w.gradient()
+            dy_grad_h2h = simple_rnn._cell._h2h_w.gradient()
+            dy_grad_i2h = simple_rnn._cell._i2h_w.gradient()
 
         with new_program_scope():
             inp = fluid.layers.data(
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
index a92b7d62fa598a3ec9b53bade2805cc033f4b9d9..00065917415f52d90f18da5a0b14591cecea737b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_checkpoint.py
@@ -18,11 +18,11 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+from paddle.fluid import Conv2D, Pool2D, FC
 from paddle.fluid.dygraph.base import to_variable
 
 
-class SimpleImgConvPool(fluid.dygraph.Layer):
+class SimpleImgConvPool(fluid.Layer):
     def __init__(self,
                  name_scope,
                  num_channels,
@@ -71,7 +71,7 @@ class SimpleImgConvPool(fluid.dygraph.Layer):
         return x
 
 
-class MNIST(fluid.dygraph.Layer):
+class MNIST(fluid.Layer):
     def __init__(self, name_scope):
         super(MNIST, self).__init__(name_scope)
 
@@ -99,7 +99,7 @@ class MNIST(fluid.dygraph.Layer):
 
 
 class TestDygraphCheckpoint(unittest.TestCase):
-    def save_load_persistables(self):
+    def test_save_load_persistables(self):
         seed = 90
         epoch_num = 1
 
@@ -125,33 +125,36 @@ class TestDygraphCheckpoint(unittest.TestCase):
 
                     img = to_variable(dy_x_data)
                     label = to_variable(y_data)
-                    label._stop_gradient = True
+                    label.stop_gradient = True
 
                     cost = mnist(img)
                     loss = fluid.layers.cross_entropy(cost, label)
                     avg_loss = fluid.layers.mean(loss)
 
-                    dy_out = avg_loss._numpy()
+                    dy_out = avg_loss.numpy()
 
-                    avg_loss._backward()
+                    avg_loss.backward()
                     sgd.minimize(avg_loss)
-                    fluid.dygraph.save_persistables(mnist, "save_dir")
+                    fluid.dygraph.save_persistables(mnist.state_dict(),
+                                                    "save_dir")
                     mnist.clear_gradients()
 
                     for param in mnist.parameters():
-                        dy_param_init_value[param.name] = param._numpy()
+                        dy_param_init_value[param.name] = param.numpy()
 
                     mnist.load_dict(
-                        fluid.dygraph.load_persistables(mnist, "save_dir"))
+                        fluid.dygraph.load_persistables(mnist.state_dict(),
+                                                        "save_dir"))
 
                     restore = mnist.parameters()
 
                     self.assertEqual(len(dy_param_init_value), len(restore))
                     for value in restore:
                         self.assertTrue(
-                            np.allclose(value, dy_param_init_value[value.name]))
-                        self.assertTrue(np.isfinite(value.all()))
-                        self.assertFalse(np.isnan(value.any()))
+                            np.allclose(value.numpy(), dy_param_init_value[
+                                value.name]))
+                        self.assertTrue(np.isfinite(value.numpy().all()))
+                        self.assertFalse(np.isnan(value.numpy().any()))
 
                     step += 1
 
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
index ccebd4a54727f383bd4e46ff57bfdc9381577d05..ca2cffa9c75cc851f0911cb0063f4e82bb2a41eb 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py
@@ -32,11 +32,11 @@ NUM_BATCHES = int(os.environ.get('NUM_BATCHES', 5))
 NUM_EPOCHES = int(os.environ.get('NUM_EPOCHES', 1))
 
 
-class DMF(fluid.dygraph.Layer):
+class DMF(fluid.Layer):
     def __init__(self, name_scope):
         super(DMF, self).__init__(name_scope)
-        self._user_latent = fluid.dygraph.FC(self.full_name(), 256)
-        self._item_latent = fluid.dygraph.FC(self.full_name(), 256)
+        self._user_latent = fluid.FC(self.full_name(), 256)
+        self._item_latent = fluid.FC(self.full_name(), 256)
 
         self._user_layers = []
         self._item_layers = []
@@ -45,13 +45,11 @@ class DMF(fluid.dygraph.Layer):
             self._user_layers.append(
                 self.add_sublayer(
                     'user_layer_%d' % i,
-                    fluid.dygraph.FC(
-                        self.full_name(), self._hid_sizes[i], act='relu')))
+                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
             self._item_layers.append(
                 self.add_sublayer(
                     'item_layer_%d' % i,
-                    fluid.dygraph.FC(
-                        self.full_name(), self._hid_sizes[i], act='relu')))
+                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
 
     def forward(self, users, items):
         users = self._user_latent(users)
@@ -63,19 +61,18 @@ class DMF(fluid.dygraph.Layer):
         return fluid.layers.elementwise_mul(users, items)
 
 
-class MLP(fluid.dygraph.Layer):
+class MLP(fluid.Layer):
     def __init__(self, name_scope):
         super(MLP, self).__init__(name_scope)
-        self._user_latent = fluid.dygraph.FC(self.full_name(), 256)
-        self._item_latent = fluid.dygraph.FC(self.full_name(), 256)
+        self._user_latent = fluid.FC(self.full_name(), 256)
+        self._item_latent = fluid.FC(self.full_name(), 256)
         self._match_layers = []
         self._hid_sizes = [128, 64]
         for i in range(len(self._hid_sizes)):
             self._match_layers.append(
                 self.add_sublayer(
                     'match_layer_%d' % i,
-                    fluid.dygraph.FC(
-                        self.full_name(), self._hid_sizes[i], act='relu')))
+                    fluid.FC(self.full_name(), self._hid_sizes[i], act='relu')))
         self._mat
 
     def forward(self, users, items):
@@ -88,7 +85,7 @@ class MLP(fluid.dygraph.Layer):
         return match_vec
 
 
-class DeepCF(fluid.dygraph.Layer):
+class DeepCF(fluid.Layer):
     def __init__(self, name_scope, num_users, num_items, matrix):
         super(DeepCF, self).__init__(name_scope)
         self._num_users = num_users
@@ -99,11 +96,11 @@ class DeepCF(fluid.dygraph.Layer):
             matrix.dtype,
             is_bias=False,
             default_initializer=fluid.initializer.NumpyArrayInitializer(matrix))
-        self._rating_matrix._stop_gradient = True
+        self._rating_matrix.stop_gradient = True
 
         self._mlp = MLP(self.full_name())
         self._dmf = DMF(self.full_name())
-        self._match_fc = fluid.dygraph.FC(self.full_name(), 1, act='sigmoid')
+        self._match_fc = fluid.FC(self.full_name(), 1, act='sigmoid')
 
     def forward(self, users, items):
         # users_emb = self._user_emb(users)
@@ -255,10 +252,10 @@ class TestDygraphDeepCF(unittest.TestCase):
                         fluid.layers.log_loss(prediction,
                                               to_variable(labels_np[
                                                   slice:slice + BATCH_SIZE])))
-                    loss._backward()
+                    loss.backward()
                     adam.minimize(loss)
                     deepcf.clear_gradients()
-                    dy_loss = loss._numpy()
+                    dy_loss = loss.numpy()
                     sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss))
 
         self.assertEqual(static_loss, dy_loss)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
index 58faa1cb85af9cedb70f3a12244cfeb44e0f4f52..5d773ec1c9db160cd63a28c634043037260e0b82 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py
@@ -22,12 +22,12 @@ import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.optimizer import SGDOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
+from paddle.fluid import Conv2D, Pool2D, FC
 from test_imperative_base import new_program_scope
 from paddle.fluid.dygraph.base import to_variable
 
 
-class Discriminator(fluid.dygraph.Layer):
+class Discriminator(fluid.Layer):
     def __init__(self, name_scope):
         super(Discriminator, self).__init__(name_scope)
         self._fc1 = FC(self.full_name(), size=32, act='elu')
@@ -38,7 +38,7 @@ class Discriminator(fluid.dygraph.Layer):
         return self._fc2(x)
 
 
-class Generator(fluid.dygraph.Layer):
+class Generator(fluid.Layer):
     def __init__(self, name_scope):
         super(Generator, self).__init__(name_scope)
         self._fc1 = FC(self.full_name(), size=64, act='elu')
@@ -150,7 +150,7 @@ class TestDygraphGAN(unittest.TestCase):
                     x=d_fake, label=to_variable(np.zeros([2, 1], np.float32))))
 
             d_loss = d_loss_real + d_loss_fake
-            d_loss._backward()
+            d_loss.backward()
             sgd.minimize(d_loss)
             discriminator.clear_gradients()
             generator.clear_gradients()
@@ -160,15 +160,15 @@ class TestDygraphGAN(unittest.TestCase):
             g_loss = fluid.layers.reduce_mean(
                 fluid.layers.sigmoid_cross_entropy_with_logits(
                     x=d_fake, label=to_variable(np.ones([2, 1], np.float32))))
-            g_loss._backward()
+            g_loss.backward()
             sgd.minimize(g_loss)
             for p in discriminator.parameters():
-                dy_params[p.name] = p._numpy()
+                dy_params[p.name] = p.numpy()
             for p in generator.parameters():
-                dy_params[p.name] = p._numpy()
+                dy_params[p.name] = p.numpy()
 
-            dy_g_loss = g_loss._numpy()
-            dy_d_loss = d_loss._numpy()
+            dy_g_loss = g_loss.numpy()
+            dy_d_loss = d_loss.numpy()
 
         self.assertEqual(dy_g_loss, static_g_loss)
         self.assertEqual(dy_d_loss, static_d_loss)
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
index a8fb9ecfe4be16b73ac2144259f25ed3859ece7e..234fcd60404286977309083257c24d941db77449 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py
@@ -15,14 +15,12 @@
 import contextlib
 import unittest
 import numpy as np
-import six
 import sys
 
 import paddle
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.optimizer import AdamOptimizer
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
 from test_imperative_base import new_program_scope
 from paddle.fluid.dygraph.base import to_variable
 
@@ -31,7 +29,7 @@ def gen_data():
     pass
 
 
-class GraphConv(fluid.dygraph.Layer):
+class GraphConv(fluid.Layer):
     def __init__(self, name_scope, in_features, out_features):
         super(GraphConv, self).__init__(name_scope)
 
@@ -50,7 +48,7 @@ class GraphConv(fluid.dygraph.Layer):
         return fluid.layers.matmul(adj, support) + self.bias
 
 
-class GCN(fluid.dygraph.Layer):
+class GCN(fluid.Layer):
     def __init__(self, name_scope, num_hidden):
         super(GCN, self).__init__(name_scope)
         self.gc = GraphConv(self.full_name(), num_hidden, 32)
@@ -134,10 +132,9 @@ class TestDygraphGNN(unittest.TestCase):
             loss = fluid.layers.reduce_sum(loss)
             adam = AdamOptimizer(learning_rate=1e-3)
             adam.minimize(loss)
-            self.assertEqual(static_loss, loss._numpy())
-            self.assertTrue(
-                np.allclose(static_weight, model.gc.weight._numpy()))
-            sys.stderr.write('%s %s\n' % (static_loss, loss._numpy()))
+            self.assertEqual(static_loss, loss.numpy())
+            self.assertTrue(np.allclose(static_weight, model.gc.weight.numpy()))
+            sys.stderr.write('%s %s\n' % (static_loss, loss.numpy()))
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
index 5ab01839fbc20bbd3c242878c4ea23a00f7b0dca..908237b88736da112b7001708bbca19b534baef1 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_mnist.py
@@ -117,6 +117,7 @@ class TestImperativeMnist(unittest.TestCase):
             train_reader = paddle.batch(
                 paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
 
+            mnist.train()
             dy_param_init_value = {}
             for epoch in range(epoch_num):
                 for batch_id, data in enumerate(train_reader()):
@@ -128,25 +129,25 @@ class TestImperativeMnist(unittest.TestCase):
 
                     img = to_variable(dy_x_data)
                     label = to_variable(y_data)
-                    label._stop_gradient = True
+                    label.stop_gradient = True
 
                     cost = mnist(img)
                     loss = fluid.layers.cross_entropy(cost, label)
                     avg_loss = fluid.layers.mean(loss)
 
-                    dy_out = avg_loss._numpy()
+                    dy_out = avg_loss.numpy()
 
                     if epoch == 0 and batch_id == 0:
                         for param in mnist.parameters():
-                            dy_param_init_value[param.name] = param._numpy()
+                            dy_param_init_value[param.name] = param.numpy()
 
-                    avg_loss._backward()
+                    avg_loss.backward()
                     sgd.minimize(avg_loss)
                     mnist.clear_gradients()
 
                     dy_param_value = {}
                     for param in mnist.parameters():
-                        dy_param_value[param.name] = param._numpy()
+                        dy_param_value[param.name] = param.numpy()
 
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
index 8b659a3e08e381dd6f55b666d9f5f1b172a51930..b9f93119e83159c5bc3052b0292168a9ef641d3e 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py
@@ -28,7 +28,7 @@ from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
 
 
-class MLP(fluid.dygraph.Layer):
+class MLP(fluid.Layer):
     def __init__(self, name_scope, param_attr=None, bias_attr=None):
         super(MLP, self).__init__(name_scope)
 
@@ -75,18 +75,18 @@ class TestImperativeOptimizerBase(unittest.TestCase):
 
                 cost = mlp(img)
                 avg_loss = fluid.layers.reduce_mean(cost)
-                dy_out = avg_loss._numpy()
+                dy_out = avg_loss.numpy()
 
                 if batch_id == 0:
                     for param in mlp.parameters():
-                        dy_param_init_value[param.name] = param._numpy()
+                        dy_param_init_value[param.name] = param.numpy()
 
-                avg_loss._backward()
+                avg_loss.backward()
                 optimizer.minimize(avg_loss)
                 mlp.clear_gradients()
                 dy_param_value = {}
                 for param in mlp.parameters():
-                    dy_param_value[param.name] = param._numpy()
+                    dy_param_value[param.name] = param.numpy()
 
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
index 82eb61ba654636ccc3c2acee8508dfabb62ee9cb..088d36be2327a91da0efc639d7f970ed9e43d151 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py
@@ -24,10 +24,9 @@ from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
 import numpy as np
 import six
-from paddle.fluid.backward import append_backward
 
 
-class SimpleLSTMRNN(fluid.dygraph.Layer):
+class SimpleLSTMRNN(fluid.Layer):
     def __init__(self,
                  name_scope,
                  hidden_size,
@@ -45,7 +44,7 @@ class SimpleLSTMRNN(fluid.dygraph.Layer):
         self.cell_array = []
         self.hidden_array = []
 
-    def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
+    def build_once(self, input_embedding, init_hidden=None, init_cell=None):
         self.weight_1_arr = []
         self.weight_2_arr = []
         self.bias_arr = []
@@ -132,7 +131,7 @@ class SimpleLSTMRNN(fluid.dygraph.Layer):
         return real_res, last_hidden, last_cell
 
 
-class PtbModel(fluid.dygraph.Layer):
+class PtbModel(fluid.Layer):
     def __init__(self,
                  name_scope,
                  hidden_size,
@@ -177,7 +176,7 @@ class PtbModel(fluid.dygraph.Layer):
             default_initializer=fluid.initializer.UniformInitializer(
                 low=-self.init_scale, high=self.init_scale))
 
-    def _build_once(self, input, label, init_hidden, init_cell):
+    def build_once(self, input, label, init_hidden, init_cell):
         pass
 
     def forward(self, input, label, init_hidden, init_cell):
@@ -260,13 +259,13 @@ class TestDygraphPtbRnn(unittest.TestCase):
                                                             init_cell)
                 if i == 0:
                     for param in ptb_model.parameters():
-                        dy_param_init[param.name] = param._numpy()
-                dy_loss._backward()
+                        dy_param_init[param.name] = param.numpy()
+                dy_loss.backward()
                 sgd.minimize(dy_loss)
                 ptb_model.clear_gradients()
                 if i == batch_num - 1:
                     for param in ptb_model.parameters():
-                        dy_param_updated[param.name] = param._numpy()
+                        dy_param_updated[param.name] = param.numpy()
 
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
@@ -334,11 +333,11 @@ class TestDygraphPtbRnn(unittest.TestCase):
                         static_param_updated[static_param_name_list[k -
                                                                     3]] = out[k]
 
-        self.assertTrue(np.array_equal(static_loss_value, dy_loss._numpy()))
+        self.assertTrue(np.array_equal(static_loss_value, dy_loss.numpy()))
         self.assertTrue(
-            np.array_equal(static_last_cell_value, last_cell._numpy()))
+            np.array_equal(static_last_cell_value, last_cell.numpy()))
         self.assertTrue(
-            np.array_equal(static_last_hidden_value, last_hidden._numpy()))
+            np.array_equal(static_last_hidden_value, last_hidden.numpy()))
         for key, value in six.iteritems(static_param_init):
             self.assertTrue(np.array_equal(value, dy_param_init[key]))
         for key, value in six.iteritems(static_param_updated):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
index 1d786d584632769e4318bcdeb24ef7ef8ea18597..d9ef08b3c491b24323bb1469165ed5482737013a 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py
@@ -21,7 +21,7 @@ import paddle
 import paddle.fluid as fluid
 from paddle.fluid import core
 from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
+from paddle.fluid import Conv2D, Pool2D, BatchNorm, FC
 from paddle.fluid.dygraph.base import to_variable
 from test_imperative_base import new_program_scope
 
@@ -68,7 +68,7 @@ def optimizer_setting(params):
     return optimizer
 
 
-class ConvBNLayer(fluid.dygraph.Layer):
+class ConvBNLayer(fluid.Layer):
     def __init__(self,
                  name_scope,
                  num_channels,
@@ -99,7 +99,7 @@ class ConvBNLayer(fluid.dygraph.Layer):
         return y
 
 
-class BottleneckBlock(fluid.dygraph.Layer):
+class BottleneckBlock(fluid.Layer):
     def __init__(self,
                  name_scope,
                  num_channels,
@@ -156,7 +156,7 @@ class BottleneckBlock(fluid.dygraph.Layer):
         return layer_helper.append_activation(y)
 
 
-class ResNet(fluid.dygraph.Layer):
+class ResNet(fluid.Layer):
     def __init__(self, name_scope, layers=50, class_dim=102):
         super(ResNet, self).__init__(name_scope)
 
@@ -247,7 +247,7 @@ class TestDygraphResnet(unittest.TestCase):
 
             dy_param_init_value = {}
             for param in resnet.parameters():
-                dy_param_init_value[param.name] = param._numpy()
+                dy_param_init_value[param.name] = param.numpy()
 
             for batch_id, data in enumerate(train_reader()):
                 if batch_id >= batch_num:
@@ -260,20 +260,20 @@ class TestDygraphResnet(unittest.TestCase):
 
                 img = to_variable(dy_x_data)
                 label = to_variable(y_data)
-                label._stop_gradient = True
+                label.stop_gradient = True
 
                 out = resnet(img)
                 loss = fluid.layers.cross_entropy(input=out, label=label)
                 avg_loss = fluid.layers.mean(x=loss)
 
-                dy_out = avg_loss._numpy()
+                dy_out = avg_loss.numpy()
 
                 if batch_id == 0:
                     for param in resnet.parameters():
                         if param.name not in dy_param_init_value:
-                            dy_param_init_value[param.name] = param._numpy()
+                            dy_param_init_value[param.name] = param.numpy()
 
-                avg_loss._backward()
+                avg_loss.backward()
 
                 dy_grad_value = {}
                 for param in resnet.parameters():
@@ -288,7 +288,7 @@ class TestDygraphResnet(unittest.TestCase):
 
                 dy_param_value = {}
                 for param in resnet.parameters():
-                    dy_param_value[param.name] = param._numpy()
+                    dy_param_value[param.name] = param.numpy()
 
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
index 69931f0849480b2569a31d04c7b0b0f9db0d61a3..3f3f92cde57c80fa4ba3d2f1389cc47efd74ca5b 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py
@@ -56,7 +56,7 @@ def optimizer_setting(params):
         #bd = [step * e for e in ls["epochs"]]
         #base_lr = params["lr"]
         #lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)]
-        optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+        optimizer = fluid.optimizer.SGD(learning_rate=0.01)
 
     return optimizer
 
@@ -109,7 +109,7 @@ class SqueezeExcitation(fluid.dygraph.Layer):
             size=num_channels,
             param_attr=fluid.ParamAttr(
                 initializer=fluid.initializer.Constant(value=0.05)),
-            act='relu')
+            act='sigmoid')
 
     def forward(self, input):
         y = self._pool(input)
@@ -316,6 +316,7 @@ class TestImperativeResneXt(unittest.TestCase):
 
         batch_size = train_parameters["batch_size"]
         batch_num = 2
+        epoch_num = 1
         with fluid.dygraph.guard():
             fluid.default_startup_program().random_seed = seed
             fluid.default_main_program().random_seed = seed
@@ -327,52 +328,54 @@ class TestImperativeResneXt(unittest.TestCase):
             random.seed = seed
             train_reader = paddle.batch(
                 paddle.dataset.flowers.train(use_xmap=False),
-                batch_size=batch_size)
+                batch_size=batch_size,
+                drop_last=True)
 
             dy_param_init_value = {}
             for param in se_resnext.parameters():
-                dy_param_init_value[param.name] = param._numpy()
-
-            for batch_id, data in enumerate(train_reader()):
-                if batch_id >= batch_num:
-                    break
-
-                dy_x_data = np.array(
-                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    batch_size, 1)
-
-                img = to_variable(dy_x_data)
-                label = to_variable(y_data)
-                label._stop_gradient = True
-
-                out = se_resnext(img)
-                loss = fluid.layers.cross_entropy(input=out, label=label)
-                avg_loss = fluid.layers.mean(x=loss)
-
-                dy_out = avg_loss._numpy()
-
-                if batch_id == 0:
+                dy_param_init_value[param.name] = param.numpy()
+            for epoch_id in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+
+                    if batch_id >= batch_num and batch_num != -1:
+                        break
+
+                    dy_x_data = np.array(
+                        [x[0].reshape(3, 224, 224)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(
+                            batch_size, 1)
+
+                    img = to_variable(dy_x_data)
+                    label = to_variable(y_data)
+                    label.stop_gradient = True
+
+                    out = se_resnext(img)
+                    loss = fluid.layers.cross_entropy(input=out, label=label)
+                    avg_loss = fluid.layers.mean(x=loss)
+
+                    dy_out = avg_loss.numpy()
+
+                    if batch_id == 0:
+                        for param in se_resnext.parameters():
+                            if param.name not in dy_param_init_value:
+                                dy_param_init_value[param.name] = param.numpy()
+                    avg_loss.backward()
+
+                    #dy_grad_value = {}
+                    #for param in se_resnext.parameters():
+                    #    if param.trainable:
+                    #        np_array = np.array(param._ivar._grad_ivar().value()
+                    #                            .get_tensor())
+                    #        dy_grad_value[param.name + core.grad_var_suffix()] = np_array
+
+                    optimizer.minimize(avg_loss)
+                    se_resnext.clear_gradients()
+
+                    dy_param_value = {}
                     for param in se_resnext.parameters():
-                        if param.name not in dy_param_init_value:
-                            dy_param_init_value[param.name] = param._numpy()
-
-                avg_loss._backward()
-
-                dy_grad_value = {}
-                for param in se_resnext.parameters():
-                    if param.trainable:
-                        np_array = np.array(param._ivar._grad_ivar().value()
-                                            .get_tensor())
-                        dy_grad_value[param.name + core.grad_var_suffix(
-                        )] = np_array
-
-                optimizer.minimize(avg_loss)
-                se_resnext.clear_gradients()
-
-                dy_param_value = {}
-                for param in se_resnext.parameters():
-                    dy_param_value[param.name] = param._numpy()
+                        dy_param_value[param.name] = param.numpy()
 
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
@@ -389,7 +392,8 @@ class TestImperativeResneXt(unittest.TestCase):
             random.seed = seed
             train_reader = paddle.batch(
                 paddle.dataset.flowers.train(use_xmap=False),
-                batch_size=batch_size)
+                batch_size=batch_size,
+                drop_last=True)
 
             img = fluid.layers.data(
                 name='pixel', shape=[3, 224, 224], dtype='float32')
@@ -415,37 +419,42 @@ class TestImperativeResneXt(unittest.TestCase):
 
             for i in range(len(static_param_name_list)):
                 static_param_init_value[static_param_name_list[i]] = out[i]
-
-            for batch_id, data in enumerate(train_reader()):
-                if batch_id >= batch_num:
-                    break
-
-                static_x_data = np.array(
-                    [x[0].reshape(3, 224, 224) for x in data]).astype('float32')
-                y_data = np.array([x[1] for x in data]).astype('int64').reshape(
-                    [batch_size, 1])
-
-                fetch_list = [avg_loss.name]
-                fetch_list.extend(static_param_name_list)
-                fetch_list.extend(static_grad_name_list)
-                out = exe.run(fluid.default_main_program(),
-                              feed={"pixel": static_x_data,
-                                    "label": y_data},
-                              fetch_list=fetch_list)
-
-                static_param_value = {}
-                static_grad_value = {}
-                static_out = out[0]
-                param_start_pos = 1
-                grad_start_pos = len(static_param_name_list) + param_start_pos
-                for i in range(param_start_pos,
-                               len(static_param_name_list) + param_start_pos):
-                    static_param_value[static_param_name_list[
-                        i - param_start_pos]] = out[i]
-                for i in range(grad_start_pos,
-                               len(static_grad_name_list) + grad_start_pos):
-                    static_grad_value[static_grad_name_list[
-                        i - grad_start_pos]] = out[i]
+            for epoch_id in range(epoch_num):
+                for batch_id, data in enumerate(train_reader()):
+                    if batch_id >= batch_num and batch_num != -1:
+                        break
+
+                    static_x_data = np.array(
+                        [x[0].reshape(3, 224, 224)
+                         for x in data]).astype('float32')
+                    y_data = np.array(
+                        [x[1] for x in data]).astype('int64').reshape(
+                            [batch_size, 1])
+
+                    fetch_list = [avg_loss.name]
+                    fetch_list.extend(static_param_name_list)
+                    fetch_list.extend(static_grad_name_list)
+                    out = exe.run(
+                        fluid.default_main_program(),
+                        feed={"pixel": static_x_data,
+                              "label": y_data},
+                        fetch_list=fetch_list)
+
+                    static_param_value = {}
+                    static_grad_value = {}
+                    static_out = out[0]
+                    param_start_pos = 1
+                    grad_start_pos = len(
+                        static_param_name_list) + param_start_pos
+                    for i in range(
+                            param_start_pos,
+                            len(static_param_name_list) + param_start_pos):
+                        static_param_value[static_param_name_list[
+                            i - param_start_pos]] = out[i]
+                    for i in range(grad_start_pos,
+                                   len(static_grad_name_list) + grad_start_pos):
+                        static_grad_value[static_grad_name_list[
+                            i - grad_start_pos]] = out[i]
         self.assertTrue(np.allclose(static_out, dy_out))
 
         self.assertEqual(len(dy_param_init_value), len(static_param_init_value))
@@ -454,12 +463,12 @@ class TestImperativeResneXt(unittest.TestCase):
             self.assertTrue(np.allclose(value, dy_param_init_value[key]))
             self.assertTrue(np.isfinite(value.all()))
             self.assertFalse(np.isnan(value.any()))
-
-        self.assertEqual(len(dy_grad_value), len(static_grad_value))
-        for key, value in six.iteritems(static_grad_value):
-            self.assertTrue(np.allclose(value, dy_grad_value[key]))
-            self.assertTrue(np.isfinite(value.all()))
-            self.assertFalse(np.isnan(value.any()))
+        # FIXME(Yancey1989): np.array(_ivar.value().get_tensor()) leads to memory lake
+        #self.assertEqual(len(dy_grad_value), len(static_grad_value))
+        #for key, value in six.iteritems(static_grad_value):
+        #    self.assertTrue(np.allclose(value, dy_grad_value[key]))
+        #    self.assertTrue(np.isfinite(value.all()))
+        #    self.assertFalse(np.isnan(value.any()))
 
         self.assertEqual(len(dy_param_value), len(static_param_value))
         for key, value in six.iteritems(static_param_value):
diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
index 89ae3c6a39d6277f590c8f2e02f7b0ae62a1cd4a..b24bab210a15528f308804c71732bd71eb6105a4 100644
--- a/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer.py
@@ -16,7 +16,8 @@ from __future__ import print_function
 
 import unittest
 import paddle.fluid as fluid
-from paddle.fluid.dygraph import Embedding, LayerNorm, FC, to_variable, Layer, guard
+from paddle.fluid import Embedding, LayerNorm, FC, Layer
+from paddle.fluid.dygraph import to_variable, guard
 from test_imperative_base import new_program_scope
 from paddle.fluid import core
 import numpy as np
@@ -116,7 +117,7 @@ class ModelHyperParams(object):
     # to process after each sub-layer
     postprocess_cmd = "da"  # dropout + residual connection
     # random seed used in dropout for CE.
-    dropout_seed = 1
+    dropout_seed = None
     # the flag indicating whether to share embedding and softmax weights.
     # vocabularies in source and target should be same for weight sharing.
     weight_sharing = True
@@ -166,15 +167,21 @@ def create_data(is_static=False):
         ]
     else:
         enc_inputs = [
-            to_variable(src_word_np), to_variable(src_pos_np),
-            to_variable(src_slf_attn_bias_np)
+            to_variable(
+                src_word_np, name='src_word'), to_variable(
+                    src_pos_np, name='src_pos'), to_variable(
+                        src_slf_attn_bias_np, name='src_slf_attn_bias')
         ]
         dec_inputs = [
-            to_variable(trg_word_np), to_variable(trg_pos_np),
-            to_variable(trg_slf_attn_bias_np), to_variable(trg_src_attn_bias_np)
+            to_variable(
+                trg_word_np, name='trg_word'), to_variable(
+                    trg_pos_np, name='trg_pos'), to_variable(
+                        trg_slf_attn_bias_np, name='trg_slf_attn_bias'),
+            to_variable(
+                trg_src_attn_bias_np, name='trg_src_attn_bias')
         ]
-        label = to_variable(lbl_word_np)
-        weight = to_variable(lbl_weight_np)
+        label = to_variable(lbl_word_np, name='lbl_word')
+        weight = to_variable(lbl_weight_np, name='lbl_weight')
         return enc_inputs, dec_inputs, label, weight
 
 
@@ -211,7 +218,7 @@ def make_all_inputs(input_fields):
 # The placeholder for batch_size in compile time. Must be -1 currently to be
 # consistent with some ops' infer-shape output in compile time, such as the
 # sequence_expand op used in beamsearch decoder.
-batch_size = 32
+batch_size = -1
 # The placeholder for squence length in compile time.
 seq_len = ModelHyperParams.max_length
 # Here list the data shapes and data types of all inputs.
@@ -302,60 +309,43 @@ use_py_reader = False
 # if we run sync mode
 sync = False
 
-if not core.is_compiled_with_cuda():
-    # how many batches we use
-    batch_num = 50
-else:
-    batch_num = 5
+# how many batches we use
+batch_num = 5
 
-np.random.seed = 1
+np.random.seed = 90
 src_word_np = np.random.randint(
     1,
     ModelHyperParams.src_vocab_size - 1,
-    size=(batch_size, seq_len, 1),
+    size=(TrainTaskConfig.batch_size, seq_len, 1),
     dtype='int64')
 src_pos_np = np.random.randint(
-    1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
-src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-                                       seq_len, seq_len).astype('float32')
+    1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64')
+src_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
+                                       ModelHyperParams.n_head, seq_len,
+                                       seq_len).astype('float32')
 
 trg_word_np = np.random.randint(
     1,
     ModelHyperParams.src_vocab_size - 1,
-    size=(batch_size, seq_len, 1),
+    size=(TrainTaskConfig.batch_size, seq_len, 1),
     dtype='int64')
 trg_pos_np = np.random.randint(
-    1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
-trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-                                       seq_len, seq_len).astype('float32')
-trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-                                       seq_len, seq_len).astype('float32')
+    1, seq_len, size=(TrainTaskConfig.batch_size, seq_len, 1), dtype='int64')
+trg_slf_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
+                                       ModelHyperParams.n_head, seq_len,
+                                       seq_len).astype('float32')
+trg_src_attn_bias_np = np.random.randn(TrainTaskConfig.batch_size,
+                                       ModelHyperParams.n_head, seq_len,
+                                       seq_len).astype('float32')
 
 lbl_word_np = np.random.randint(
     1,
     ModelHyperParams.src_vocab_size - 1,
-    size=(batch_size * seq_len, 1),
+    size=(TrainTaskConfig.batch_size * seq_len, 1),
     dtype='int64')
-lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
-
-# np.random.seed = 1
-# src_word_np = np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
-# src_pos_np = np.random.randint(
-#     1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
-# src_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-#                                        seq_len, seq_len).astype('float32')
-#
-# trg_word_np =  np.arange(0, 10).reshape([batch_size, seq_len, 1]).astype('int64')
-# trg_pos_np = np.random.randint(
-#     1, seq_len, size=(batch_size, seq_len, 1), dtype='int64')
-# trg_slf_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-#                                        seq_len, seq_len).astype('float32')
-# trg_src_attn_bias_np = np.random.randn(batch_size, ModelHyperParams.n_head,
-#                                        seq_len, seq_len).astype('float32')
-#
-# lbl_word_np =  np.arange(0, 10).reshape([batch_size * seq_len, 1]).astype('int64')
-# lbl_weight_np = np.random.randn(batch_size * seq_len, 1).astype('float32')
-#
+lbl_weight_np = np.random.randn(TrainTaskConfig.batch_size * seq_len,
+                                1).astype('float32')
+
 pos_inp1 = position_encoding_init(ModelHyperParams.max_length,
                                   ModelHyperParams.d_model)
 pos_inp2 = position_encoding_init(ModelHyperParams.max_length,
@@ -469,7 +459,7 @@ class MultiHeadAttentionLayer(Layer):
             x=v, shape=[0, 0, self._n_head, self._d_value], inplace=False)
         transpose_v = fluid.layers.transpose(x=reshaped_v, perm=[0, 2, 1, 3])
 
-        #scale dot product attention
+        # scale dot product attention
         product = fluid.layers.matmul(
             x=transpose_q,
             y=transpose_k,
@@ -742,7 +732,7 @@ class DecoderSubLayer(Layer):
         enc_attn_output_pp = self._multihead_attention_layer2(
             pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias)
         enc_attn_output = self._post_process_layer2(
-            slf_attn_output, enc_attn_output_pp, self._postprocess_cmd,
+            slf_attn_output_pp, enc_attn_output_pp, self._postprocess_cmd,
             self._prepostprcess_dropout)
         pre_process_rlt3 = self._pre_process_layer3(None, enc_attn_output,
                                                     self._preprocess_cmd,
@@ -993,16 +983,18 @@ class TestDygraphTransformer(unittest.TestCase):
                 enc_inputs, dec_inputs, label, weights = create_data()
                 dy_sum_cost, dy_avg_cost, dy_predict, dy_token_num = transformer(
                     enc_inputs, dec_inputs, label, weights)
+
                 if i == 0:
                     for param in transformer.parameters():
-                        dy_param_init[param.name] = param._numpy()
+                        dy_param_init[param.name] = param.numpy()
 
-                dy_avg_cost._backward()
+                dy_avg_cost.backward()
                 optimizer.minimize(dy_avg_cost)
                 transformer.clear_gradients()
+
                 if i == batch_num - 1:
                     for param in transformer.parameters():
-                        dy_param_updated[param.name] = param._numpy()
+                        dy_param_updated[param.name] = param.numpy()
 
         with new_program_scope():
             fluid.default_startup_program().random_seed = seed
@@ -1046,7 +1038,6 @@ class TestDygraphTransformer(unittest.TestCase):
             static_param_name_list = list()
             static_sum_cost, static_avg_cost, static_predict, static_token_num = transformer(
                 enc_inputs, dec_inputs, label, weights)
-
             optimizer.minimize(static_avg_cost)
             for param in transformer.parameters():
                 static_param_name_list.append(param.name)
@@ -1064,8 +1055,8 @@ class TestDygraphTransformer(unittest.TestCase):
                     static_sum_cost, static_avg_cost, static_predict,
                     static_token_num
                 ]
-                fetch_list.extend(static_param_name_list)
 
+                fetch_list.extend(static_param_name_list)
                 out = exe.run(fluid.default_main_program(),
                               feed=feed_dict,
                               fetch_list=fetch_list)
@@ -1079,13 +1070,14 @@ class TestDygraphTransformer(unittest.TestCase):
                                                                     4]] = out[k]
 
         self.assertTrue(
-            np.array_equal(static_avg_cost_value, dy_avg_cost._numpy()))
+            np.array_equal(static_avg_cost_value, dy_avg_cost.numpy()))
         self.assertTrue(
-            np.array_equal(static_sum_cost_value, dy_sum_cost._numpy()))
+            np.array_equal(static_sum_cost_value, dy_sum_cost.numpy()))
         self.assertTrue(
-            np.array_equal(static_predict_value, dy_predict._numpy()))
+            np.array_equal(static_predict_value, dy_predict.numpy()))
         self.assertTrue(
-            np.array_equal(static_token_num_value, dy_token_num._numpy()))
+            np.array_equal(static_token_num_value, dy_token_num.numpy()))
+
         for key, value in six.iteritems(static_param_init):
             self.assertTrue(np.array_equal(value, dy_param_init[key]))
         for key, value in six.iteritems(static_param_updated):
diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py
index 98b39256aad8435a1b54fe11fbb8d4677f18e99c..91f8bc5fd0a510dcc05cb7ba2397cad52be16af5 100644
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -114,7 +114,7 @@ class TestLayer(LayerTest):
             dy_ret = fc2(ret)
 
         self.assertTrue(np.array_equal(static_ret, static_ret2))
-        self.assertTrue(np.array_equal(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.array_equal(static_ret, dy_ret.numpy()))
 
     def test_layer_norm(self):
         inp = np.ones([3, 32, 32], dtype='float32')
@@ -142,7 +142,7 @@ class TestLayer(LayerTest):
             dy_ret = lm(base.to_variable(inp))
 
         self.assertTrue(np.allclose(static_ret, static_ret2))
-        self.assertTrue(np.allclose(dy_ret._numpy(), static_ret2))
+        self.assertTrue(np.allclose(dy_ret.numpy(), static_ret2))
 
     def test_relu(self):
         with self.static_graph():
@@ -156,7 +156,7 @@ class TestLayer(LayerTest):
             t = np.ones([3, 3], dtype='float32')
             dy_ret = layers.relu(base.to_variable(t))
 
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
 
     def test_matmul(self):
         with self.static_graph():
@@ -177,7 +177,7 @@ class TestLayer(LayerTest):
             t2 = np.ones([3, 3], dtype='float32')
             dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2))
 
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
 
     def test_conv2d(self):
         with self.static_graph():
@@ -204,7 +204,7 @@ class TestLayer(LayerTest):
                 'conv2d', num_channels=3, num_filters=3, filter_size=[2, 2])
             dy_ret = conv2d(base.to_variable(images))
 
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
         self.assertTrue(np.allclose(static_ret, static_ret2))
 
     def test_gru_unit(self):
@@ -246,7 +246,7 @@ class TestLayer(LayerTest):
 
         for i in range(len(static_ret)):
             self.assertTrue(np.allclose(static_ret[i], static_ret2[i]))
-            self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy()))
+            self.assertTrue(np.allclose(static_ret[i], dy_ret[i].numpy()))
 
     def test_elementwise_math(self):
         n = np.ones([3, 3], dtype='float32')
@@ -288,8 +288,8 @@ class TestLayer(LayerTest):
             ret = layers.elementwise_sub(ret, n5)
             dy_ret = layers.elementwise_mul(ret, n6)
         self.assertTrue(
-            np.allclose(static_ret, dy_ret._numpy()),
-            '%s vs %s' % (static_ret, dy_ret._numpy()))
+            np.allclose(static_ret, dy_ret.numpy()),
+            '%s vs %s' % (static_ret, dy_ret.numpy()))
 
     def test_elementwise_minmax(self):
         n = np.ones([3, 3], dtype='float32')
@@ -299,8 +299,8 @@ class TestLayer(LayerTest):
             min_ret = layers.elementwise_min(n, n2)
             max_ret = layers.elementwise_max(n, n2)
 
-        self.assertTrue(np.allclose(n, min_ret._numpy()))
-        self.assertTrue(np.allclose(n2, max_ret._numpy()))
+        self.assertTrue(np.allclose(n, min_ret.numpy()))
+        self.assertTrue(np.allclose(n2, max_ret.numpy()))
 
     def test_sequence_conv(self):
         inp_np = np.arange(12).reshape([3, 4]).astype('float32')
@@ -367,7 +367,7 @@ class TestLayer(LayerTest):
                 'conv2d_transpose', num_filters=10, output_size=28)
             dy_rlt = conv2d_transpose(base.to_variable(inp_np))
         self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
+        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
 
     def test_bilinear_tensor_product(self):
         inp_np_x = np.array([[1, 2, 3]]).astype('float32')
@@ -410,7 +410,7 @@ class TestLayer(LayerTest):
             dy_rlt = btp(base.to_variable(inp_np_x), base.to_variable(inp_np_y))
 
         self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
+        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
 
     def test_prelu(self):
         inp_np = np.ones([5, 200, 100, 100]).astype('float32')
@@ -451,7 +451,7 @@ class TestLayer(LayerTest):
             dy_rlt = prelu(base.to_variable(inp_np))
 
         self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
+        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
 
     def test_embeding(self):
         inp_word = np.array([[[1]]]).astype('int64')
@@ -484,7 +484,7 @@ class TestLayer(LayerTest):
             static_rlt3 = emb2(base.to_variable(inp_word))
 
         self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(static_rlt3._numpy(), static_rlt))
+        self.assertTrue(np.allclose(static_rlt3.numpy(), static_rlt))
 
     def test_nce(self):
         window_size = 5
@@ -598,7 +598,7 @@ class TestLayer(LayerTest):
             nce_loss3 = nce(embs3, words[label_word])
 
         self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(nce_loss3._numpy(), static_rlt))
+        self.assertTrue(np.allclose(nce_loss3.numpy(), static_rlt))
 
     def test_conv3d(self):
         with self.static_graph():
@@ -625,7 +625,7 @@ class TestLayer(LayerTest):
             conv3d = nn.Conv3D('conv3d', num_filters=3, filter_size=2)
             dy_ret = conv3d(base.to_variable(images))
 
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
         self.assertTrue(np.allclose(static_ret, static_ret2))
 
     def test_row_conv(self):
@@ -719,7 +719,7 @@ class TestLayer(LayerTest):
             groupNorm = nn.GroupNorm('GroupNorm', groups=2)
             dy_ret = groupNorm(base.to_variable(input))
 
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
         self.assertTrue(np.allclose(static_ret, static_ret2))
 
     def test_spectral_norm(self):
@@ -769,7 +769,7 @@ class TestLayer(LayerTest):
             spectralNorm = nn.SpectralNorm('SpectralNorm', dim=1, power_iters=2)
             dy_ret = spectralNorm(base.to_variable(input))
 
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
         self.assertTrue(np.allclose(static_ret, static_ret2))
 
     def test_tree_conv(self):
@@ -842,7 +842,7 @@ class TestLayer(LayerTest):
             dy_ret = treeConv(base.to_variable(vectors), base.to_variable(adj))
 
         self.assertTrue(np.allclose(static_ret, static_ret2))
-        self.assertTrue(np.allclose(static_ret, dy_ret._numpy()))
+        self.assertTrue(np.allclose(static_ret, dy_ret.numpy()))
 
     def test_conv3d_transpose(self):
         input_array = np.arange(0, 48).reshape(
@@ -872,7 +872,7 @@ class TestLayer(LayerTest):
                 use_cudnn=False)
             dy_rlt = conv3d_transpose(base.to_variable(input_array))
         self.assertTrue(np.allclose(static_rlt2, static_rlt))
-        self.assertTrue(np.allclose(dy_rlt._numpy(), static_rlt))
+        self.assertTrue(np.allclose(dy_rlt.numpy(), static_rlt))
 
 
 class TestBook(LayerTest):
@@ -907,7 +907,7 @@ class TestBook(LayerTest):
                 if isinstance(dy_result, tuple):
                     dy_result = dy_result[0]
 
-        self.assertTrue(np.array_equal(static_result[0], dy_result._numpy()))
+        self.assertTrue(np.array_equal(static_result[0], dy_result.numpy()))
 
     def _get_np_data(self, shape, dtype, append_batch_size=True):
         np.random.seed(self.seed)
@@ -1759,10 +1759,20 @@ class TestBook(LayerTest):
     def test_lod_reset(self):
         # TODO(minqiyang): dygraph do not support lod now
         with self.static_graph():
+            # case 1
             x = layers.data(name='x', shape=[10], dtype='float32')
             y = layers.data(
                 name='y', shape=[10, 20], dtype='float32', lod_level=2)
-            return (layers.lod_reset(x=x, y=y))
+            z = layers.lod_reset(x=x, y=y)
+            self.assertTrue(z.lod_level == 2)
+            # case 2
+            lod_tensor_in = layers.data(name='lod_in', shape=[1], dtype='int64')
+            z = layers.lod_reset(x=x, y=lod_tensor_in)
+            self.assertTrue(z.lod_level == 1)
+            # case 3
+            z = layers.lod_reset(x=x, target_lod=[1, 2, 3])
+            self.assertTrue(z.lod_level == 1)
+            return z
 
     def test_affine_grid(self):
         with self.static_graph():
@@ -1925,6 +1935,13 @@ class TestBook(LayerTest):
             out = layers.flatten(x, axis=1, name="flatten")
             return (out)
 
+    def test_linspace(self):
+        program = Program()
+        with program_guard(program):
+            out = layers.linspace(20, 10, 5, 'float64')
+            self.assertIsNotNone(out)
+        print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py
new file mode 100644
index 0000000000000000000000000000000000000000..eeecf178320327cc251f32bfe46c1622200339f4
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_linspace.py
@@ -0,0 +1,71 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLinspaceOpCommonCase(OpTest):
+    def setUp(self):
+        self.op_type = "linspace"
+        dtype = 'float32'
+        self.inputs = {
+            'Start': np.array([0]).astype(dtype),
+            'Stop': np.array([10]).astype(dtype),
+            'Num': np.array([11]).astype('int32')
+        }
+
+        self.outputs = {'Out': np.arange(0, 11).astype(dtype)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestLinspaceOpReverseCase(OpTest):
+    def setUp(self):
+        self.op_type = "linspace"
+        dtype = 'float32'
+        self.inputs = {
+            'Start': np.array([10]).astype(dtype),
+            'Stop': np.array([0]).astype(dtype),
+            'Num': np.array([11]).astype('int32')
+        }
+
+        self.outputs = {'Out': np.arange(10, -1, -1).astype(dtype)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestLinspaceOpNumOneCase(OpTest):
+    def setUp(self):
+        self.op_type = "linspace"
+        dtype = 'float32'
+        self.inputs = {
+            'Start': np.array([10]).astype(dtype),
+            'Stop': np.array([0]).astype(dtype),
+            'Num': np.array([1]).astype('int32')
+        }
+
+        self.outputs = {'Out': np.array(10, dtype=dtype)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
index 5bb2260ef7a143670dd75fc88769603d1437173d..eb82af75e4a2bf834c010aede79d50b0d73c98bc 100644
--- a/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_nearest_interp_op.py
@@ -73,7 +73,14 @@ class TestNearestInterpOp(OpTest):
         self.op_type = "nearest_interp"
         input_np = np.random.random(self.input_shape).astype("float32")
 
-        output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w,
+        if self.scale > 0:
+            out_h = int(self.input_shape[2] * self.scale)
+            out_w = int(self.input_shape[3] * self.scale)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = nearest_neighbor_interp_np(input_np, out_h, out_w,
                                                self.out_size, self.actual_shape,
                                                self.align_corners)
         self.inputs = {'X': input_np}
@@ -84,6 +91,7 @@ class TestNearestInterpOp(OpTest):
         self.attrs = {
             'out_h': self.out_h,
             'out_w': self.out_w,
+            'scale': self.scale,
             'interp_method': self.interp_method,
             'align_corners': self.align_corners,
         }
@@ -100,6 +108,7 @@ class TestNearestInterpOp(OpTest):
         self.input_shape = [2, 3, 4, 4]
         self.out_h = 2
         self.out_w = 2
+        self.scale = 0.
         self.out_size = np.array([3, 3]).astype("int32")
         self.align_corners = True
 
@@ -110,6 +119,7 @@ class TestNearestNeighborInterpCase1(TestNearestInterpOp):
         self.input_shape = [4, 1, 7, 8]
         self.out_h = 1
         self.out_w = 1
+        self.scale = 0.
         self.align_corners = True
 
 
@@ -119,6 +129,7 @@ class TestNearestNeighborInterpCase2(TestNearestInterpOp):
         self.input_shape = [3, 3, 9, 6]
         self.out_h = 12
         self.out_w = 12
+        self.scale = 0.
         self.align_corners = True
 
 
@@ -128,6 +139,7 @@ class TestNearestNeighborInterpCase3(TestNearestInterpOp):
         self.input_shape = [1, 1, 128, 64]
         self.out_h = 64
         self.out_w = 128
+        self.scale = 0.
         self.align_corners = True
 
 
@@ -137,6 +149,7 @@ class TestNearestNeighborInterpCase4(TestNearestInterpOp):
         self.input_shape = [4, 1, 7, 8]
         self.out_h = 1
         self.out_w = 1
+        self.scale = 0.
         self.out_size = np.array([2, 2]).astype("int32")
         self.align_corners = True
 
@@ -147,6 +160,7 @@ class TestNearestNeighborInterpCase5(TestNearestInterpOp):
         self.input_shape = [3, 3, 9, 6]
         self.out_h = 12
         self.out_w = 12
+        self.scale = 0.
         self.out_size = np.array([11, 11]).astype("int32")
         self.align_corners = True
 
@@ -157,6 +171,7 @@ class TestNearestNeighborInterpCase6(TestNearestInterpOp):
         self.input_shape = [1, 1, 128, 64]
         self.out_h = 64
         self.out_w = 128
+        self.scale = 0.
         self.out_size = np.array([65, 129]).astype("int32")
         self.align_corners = True
 
@@ -167,6 +182,7 @@ class TestNearestNeighborInterpActualShape(TestNearestInterpOp):
         self.input_shape = [3, 2, 32, 16]
         self.out_h = 64
         self.out_w = 32
+        self.scale = 0.
         self.out_size = np.array([66, 40]).astype("int32")
         self.align_corners = True
 
@@ -179,7 +195,15 @@ class TestNearestInterpOpUint8(OpTest):
         self.op_type = "nearest_interp"
         input_np = np.random.randint(
             low=0, high=256, size=self.input_shape).astype("uint8")
-        output_np = nearest_neighbor_interp_np(input_np, self.out_h, self.out_w,
+
+        if self.scale > 0:
+            out_h = int(self.input_shape[2] * self.scale)
+            out_w = int(self.input_shape[3] * self.scale)
+        else:
+            out_h = self.out_h
+            out_w = self.out_w
+
+        output_np = nearest_neighbor_interp_np(input_np, out_h, out_w,
                                                self.out_size, self.actual_shape,
                                                self.align_corners)
         self.inputs = {'X': input_np}
@@ -188,6 +212,7 @@ class TestNearestInterpOpUint8(OpTest):
         self.attrs = {
             'out_h': self.out_h,
             'out_w': self.out_w,
+            'scale': self.scale,
             'interp_method': self.interp_method,
             'align_corners': self.align_corners
         }
@@ -201,6 +226,7 @@ class TestNearestInterpOpUint8(OpTest):
         self.input_shape = [1, 3, 9, 6]
         self.out_h = 10
         self.out_w = 9
+        self.scale = 0.
         self.align_corners = True
 
 
@@ -210,6 +236,7 @@ class TestNearestNeighborInterpCase1Uint8(TestNearestInterpOpUint8):
         self.input_shape = [2, 3, 128, 64]
         self.out_h = 120
         self.out_w = 50
+        self.scale = 0.
         self.align_corners = True
 
 
@@ -219,6 +246,7 @@ class TestNearestNeighborInterpCase2Uint8(TestNearestInterpOpUint8):
         self.input_shape = [4, 1, 7, 8]
         self.out_h = 5
         self.out_w = 13
+        self.scale = 0.
         self.out_size = np.array([6, 15]).astype("int32")
         self.align_corners = True
 
@@ -228,5 +256,38 @@ class TestNearestInterpWithoutCorners(TestNearestInterpOp):
         self.align_corners = False
 
 
+class TestNearestNeighborInterpScale1(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 2.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpScale2(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 1.5
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+
+
+class TestNearestNeighborInterpScale3(TestNearestInterpOp):
+    def init_test_case(self):
+        self.interp_method = 'nearest'
+        self.input_shape = [3, 2, 32, 16]
+        self.out_h = 64
+        self.out_w = 32
+        self.scale = 1.
+        self.out_size = np.array([66, 40]).astype("int32")
+        self.align_corners = True
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
index 041c56fce11e6f6abb0a941a9e9c9ad1cb60ab42..e1b3c2cb6dca1149e0a0b995d35977d74e04e4fe 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
@@ -21,25 +21,8 @@ import os
 os.environ['FLAGS_enable_parallel_graph'] = str(1)
 import paddle.fluid.core as core
 import os
-import paddle.fluid as fluid
 from parallel_executor_test_base import TestParallelExecutorBase
-
-
-def simple_fc_net(use_feed):
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = img
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
+from simple_nets import simple_fc_net, init_data
 
 
 class TestMNIST(TestParallelExecutorBase):
@@ -47,19 +30,12 @@ class TestMNIST(TestParallelExecutorBase):
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)
 
-    def _init_data(self):
-        np.random.seed(5)
-        img = np.random.random(size=[32, 784]).astype(np.float32)
-        label = np.ones(shape=[32, 1], dtype='int64')
-        return img, label
-
     # simple_fc
     def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
         if use_cuda and not core.is_compiled_with_cuda():
             return
 
-        img, label = self._init_data()
-
+        img, label = init_data()
         self.check_network_convergence(
             simple_fc_net,
             feed_dict={"image": img,
@@ -75,8 +51,7 @@ class TestMNIST(TestParallelExecutorBase):
         if use_cuda and not core.is_compiled_with_cuda():
             return
 
-        img, label = self._init_data()
-
+        img, label = init_data()
         single_first_loss, single_last_loss = self.check_network_convergence(
             method=simple_fc_net,
             seed=1,
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
index 1f23fae92c9d8148efb25facb602cdc4d485865b..92a5c58c11773e97ca0bb5ff2c21cbc8df612d58 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
@@ -14,19 +14,22 @@
 
 from __future__ import print_function
 import os
-os.environ['FLAGS_fuse_parameter_memory_size'] = "131072"
-os.environ['FLAGS_fuse_parameter_groups_size'] = "3"
 
 import paddle.fluid as fluid
+fluid.core._set_fuse_parameter_group_size(3)
+fluid.core._set_fuse_parameter_memory_size(131072)
+
 import paddle.fluid.layers.ops as ops
 from paddle.fluid.initializer import init_on_cpu
 from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter
 import paddle.fluid.core as core
 from parallel_executor_test_base import TestParallelExecutorBase
+from simple_nets import init_data
 import unittest
 import math
 import numpy as np
-
+from functools import partial
+os.environ['CPU_NUM'] = str(4)
 # FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor
 # and Executor is different. Because, for ParallelExecutor, the dropout_op of
 # the neural net will be copied N copies(N is the number of device). This will
@@ -110,7 +113,6 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio):
     return fluid.layers.elementwise_add(x=short, y=scale, act='relu')
 
 
-batch_size = 12
 img_shape = [3, 224, 224]
 
 
@@ -178,53 +180,84 @@ def optimizer(learning_rate=0.01):
     return optimizer
 
 
+def _batch_size():
+    return 12
+
+
+def _iter(use_cuda):
+    if use_cuda:
+        return 10
+    return 2
+
+
+gpu_img, gpu_label = init_data(
+    batch_size=_batch_size(), img_shape=img_shape, label_range=999)
+cpu_img, cpu_label = init_data(
+    batch_size=_batch_size(), img_shape=img_shape, label_range=999)
+feed_dict_gpu = {"image": gpu_img, "label": gpu_label}
+feed_dict_cpu = {"image": cpu_img, "label": cpu_label}
+model = SE_ResNeXt50Small
+
+
+def _feed_dict(use_cuda):
+    if use_cuda:
+        return feed_dict_gpu
+    return feed_dict_cpu
+
+
+def _get_result_of_origin_model(use_cuda):
+    global remove_bn
+    global remove_dropout
+    remove_bn = True
+    remove_dropout = True
+    first_loss, last_loss = TestParallelExecutorBase.check_network_convergence(
+        model,
+        feed_dict=_feed_dict(use_cuda),
+        iter=_iter(use_cuda),
+        batch_size=_batch_size(),
+        use_cuda=use_cuda,
+        use_reduce=False,
+        optimizer=optimizer)
+
+    return first_loss, last_loss
+
+
+origin_cpu_first_loss, origin_cpu_last_loss = _get_result_of_origin_model(False)
+if core.is_compiled_with_cuda():
+    origin_gpu_first_loss, origin_gpu_last_loss = _get_result_of_origin_model(
+        True)
+
+
+def _get_origin_result(use_cuda):
+    if use_cuda:
+        assert core.is_compiled_with_cuda(), "Doesn't compiled with CUDA."
+        return origin_gpu_first_loss, origin_gpu_last_loss
+    return origin_cpu_first_loss, origin_cpu_last_loss
+
+
 class TestResnet(TestParallelExecutorBase):
-    @classmethod
-    def setUpClass(cls):
-        os.environ['CPU_NUM'] = str(4)
-        global remove_dropout
-        global remove_bn
-        remove_dropout = False
-        remove_bn = False
-
-    def _init_data(self, batch_size=2, random=True):
-        np.random.seed(5)
-        if random:
-            img = np.random.random(
-                size=[batch_size] + img_shape).astype(np.float32)
-        else:
-            img = np.ones(shape=[batch_size] + img_shape, dtype='float32')
-        label = [np.random.randint(0, 999) for _ in range(batch_size)]
-        label = np.array(label).astype(np.int64).reshape(-1, 1)
-        return img, label
-
-    def _compare_reduce_and_allreduce(self,
-                                      model,
-                                      use_cuda,
-                                      iter=20,
-                                      delta2=1e-5):
+    def _compare_reduce_and_allreduce(self, use_cuda, delta2=1e-5):
         if use_cuda and not core.is_compiled_with_cuda():
             return
 
         global remove_bn
+        global remove_dropout
         remove_bn = True
+        remove_dropout = True
 
-        img, label = self._init_data(batch_size=batch_size)
         all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
             use_cuda=use_cuda,
             use_reduce=False,
             optimizer=optimizer)
         reduce_first_loss, reduce_last_loss = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
             use_cuda=use_cuda,
             use_reduce=True,
             optimizer=optimizer)
@@ -239,10 +272,9 @@ class TestResnet(TestParallelExecutorBase):
 
         all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
             use_cuda=use_cuda,
             use_reduce=False,
             optimizer=optimizer,
@@ -250,10 +282,9 @@ class TestResnet(TestParallelExecutorBase):
 
         reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence(
             model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
             use_cuda=use_cuda,
             use_reduce=True,
             optimizer=optimizer,
@@ -274,98 +305,91 @@ class TestResnet(TestParallelExecutorBase):
         for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq):
             self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
 
-    def _check_resnet_convergence(self,
-                                  model,
-                                  use_cuda=True,
-                                  use_reduce=False,
-                                  iter=20,
-                                  delta2=1e-5):
+    def _compare_result_with_origin_model(self,
+                                          get_origin_result,
+                                          check_func_2,
+                                          use_cuda,
+                                          delta2=1e-5,
+                                          compare_seperately=True,
+                                          rm_drop_out=False,
+                                          rm_bn=False):
         if use_cuda and not core.is_compiled_with_cuda():
             return
 
-        global remove_dropout
         global remove_bn
-        remove_dropout = True
-        remove_bn = True
-
-        img, label = self._init_data(batch_size=batch_size)
-        single_first_loss, single_last_loss = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
-            use_cuda=use_cuda,
-            use_reduce=use_reduce,
-            optimizer=optimizer,
-            use_parallel_executor=False)
-        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
-            use_cuda=use_cuda,
-            use_reduce=use_reduce,
-            optimizer=optimizer)
-
-        self.assertAlmostEquals(
-            np.mean(parallel_first_loss), single_first_loss[0], delta=1e-5)
-        self.assertAlmostEquals(
-            np.mean(parallel_last_loss), single_last_loss[0], delta=delta2)
-
-    def _compare_with_fused_all_reduce(self,
-                                       model,
-                                       use_cuda,
-                                       iter=20,
-                                       delta2=1e-5):
-        if use_cuda and not core.is_compiled_with_cuda():
-            return
-
-        global remove_bn
-        remove_bn = True
+        global remove_dropout
+        remove_bn = rm_bn or use_cuda
+        remove_dropout = rm_drop_out
 
-        img, label = self._init_data(batch_size=batch_size)
-        all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
+        func_1_first_loss, func_1_last_loss = get_origin_result(use_cuda)
+        func_2_first_loss, func_2_last_loss = check_func_2(
             model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
-            use_cuda=use_cuda,
-            fuse_all_reduce_ops=False,
-            optimizer=optimizer)
-        reduce_first_loss, reduce_last_loss = self.check_network_convergence(
-            model,
-            feed_dict={"image": img,
-                       "label": label},
-            iter=iter,
-            batch_size=batch_size,
-            use_cuda=use_cuda,
-            fuse_all_reduce_ops=True,
-            optimizer=optimizer)
+            feed_dict=_feed_dict(use_cuda),
+            iter=_iter(use_cuda),
+            batch_size=_batch_size(),
+            use_cuda=use_cuda)
+
+        if compare_seperately:
+            for loss in zip(func_1_first_loss, func_2_first_loss):
+                self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
+            for loss in zip(func_1_last_loss, func_2_last_loss):
+                self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+        else:
+            self.assertAlmostEquals(
+                np.mean(func_1_first_loss), func_2_first_loss[0], delta=1e-5)
+            self.assertAlmostEquals(
+                np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2)
 
-        for loss in zip(all_reduce_first_loss, reduce_first_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=1e-5)
-        for loss in zip(all_reduce_last_loss, reduce_last_loss):
-            self.assertAlmostEquals(loss[0], loss[1], delta=delta2)
+    def test_seresnext_with_reduce(self):
+        self._compare_reduce_and_allreduce(use_cuda=False, delta2=1e-3)
+        self._compare_reduce_and_allreduce(use_cuda=True, delta2=1e-2)
 
     def test_seresnext_with_learning_rate_decay(self):
-        self._check_resnet_convergence(model=SE_ResNeXt50Small, use_cuda=True)
-        self._check_resnet_convergence(
-            model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
-
-    def test_seresnext_with_reduce(self):
-        self._compare_reduce_and_allreduce(
-            model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2)
-        self._compare_reduce_and_allreduce(
-            model=SE_ResNeXt50Small, use_cuda=False, iter=5)
+        # NOTE(zcd): This test is compare the result of use parallel_executor and executor,
+        # and the result of drop_out op and batch_norm op in this two executor
+        # have diff, so the two ops should be removed from the model.
+        check_func_1 = _get_origin_result
+        check_func_2 = partial(
+            self.check_network_convergence,
+            optimizer=optimizer,
+            use_parallel_executor=False)
+        self._compare_result_with_origin_model(
+            check_func_1,
+            check_func_2,
+            use_cuda=False,
+            rm_drop_out=True,
+            rm_bn=True,
+            compare_seperately=False,
+            delta2=1e-3)
+        self._compare_result_with_origin_model(
+            check_func_1,
+            check_func_2,
+            use_cuda=True,
+            rm_drop_out=True,
+            rm_bn=True,
+            compare_seperately=False)
 
     def test_seresnext_with_fused_all_reduce(self):
-        self._compare_with_fused_all_reduce(
-            model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-3)
-        self._compare_with_fused_all_reduce(
-            model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3)
+        # NOTE(zcd): In order to make the program faster,
+        # this unit test remove drop_out and batch_norm.
+        check_func_1 = _get_origin_result
+        check_func_2 = partial(
+            self.check_network_convergence,
+            optimizer=optimizer,
+            fuse_all_reduce_ops=True)
+        self._compare_result_with_origin_model(
+            check_func_1,
+            check_func_2,
+            use_cuda=False,
+            rm_drop_out=True,
+            rm_bn=True)
+        self._compare_result_with_origin_model(
+            check_func_1,
+            check_func_2,
+            use_cuda=True,
+            rm_drop_out=True,
+            rm_bn=True,
+            delta2=1e-3)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index d89fd87a38be460c561dbff656cdaa069ffbbd53..eaf9e484df922051ca503c4a8cd679fc243a0fe8 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 from __future__ import print_function
-
+from simple_nets import simple_fc_net
 import paddle.fluid as fluid
 from paddle.fluid import compiler
 import paddle.fluid.core as core
@@ -24,23 +24,6 @@ import sys
 import math
 
 
-def simple_fc_net():
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = img
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
 class ParallelExecutorTestingDuringTraining(unittest.TestCase):
     def check_network_convergence(self, use_cuda, build_strategy=None):
         os.environ['CPU_NUM'] = str(4)
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
index aacc1c3ecda8c25dec9f08827a856d38c37b1b2f..b1851f4c78ddf984b06cf67f628099d5b60c771e 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
@@ -65,7 +65,9 @@ class ModelHyperParams(object):
     # number of head used in multi-head attention.
     n_head = 8
     # number of sub-layers to be stacked in the encoder and decoder.
-    n_layer = 6
+    # NOTE(zcd): the origin number of layer is 6, to make this unit test faster,
+    # we should reduce the layer number to 4.
+    n_layer = 4
     # dropout rate used by all dropout layers.
     dropout = 0.1
 
@@ -175,7 +177,7 @@ class TestTransformer(TestParallelExecutorBase):
             self.check_network_convergence(transformer, use_cuda=True)
             self.check_network_convergence(
                 transformer, use_cuda=True, enable_sequential_execution=True)
-        self.check_network_convergence(transformer, use_cuda=False, iter=5)
+        self.check_network_convergence(transformer, use_cuda=False, iter=2)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_pass_builder.py b/python/paddle/fluid/tests/unittests/test_pass_builder.py
index a96cb624f52303f05e40f572ccda858d1e329941..497bea43567774f356de379acced2544c8302d46 100644
--- a/python/paddle/fluid/tests/unittests/test_pass_builder.py
+++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py
@@ -14,6 +14,7 @@
 
 from __future__ import print_function
 
+from simple_nets import simple_fc_net
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid import compiler
@@ -24,23 +25,6 @@ import sys
 import math
 
 
-def simple_fc_net():
-    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
-    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-    hidden = img
-    for _ in range(4):
-        hidden = fluid.layers.fc(
-            hidden,
-            size=200,
-            act='tanh',
-            bias_attr=fluid.ParamAttr(
-                initializer=fluid.initializer.Constant(value=1.0)))
-    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
-    loss = fluid.layers.cross_entropy(input=prediction, label=label)
-    loss = fluid.layers.mean(loss)
-    return loss
-
-
 class TestPassBuilder(unittest.TestCase):
     def check_network_convergence(self, use_cuda, build_strategy=None):
         os.environ['CPU_NUM'] = str(4)
diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py
index 39d778b82a04f403bea030381ff220a68b1ff0ef..367b60831c5b1d0397b7729acf078513bb074299 100644
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
@@ -26,6 +26,10 @@ import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
 
 
 class TestProfiler(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        os.environ['CPU_NUM'] = str(4)
+
     def net_profiler(self, state, use_parallel_executor=False):
         profile_path = os.path.join(tempfile.gettempdir(), "profile")
         open(profile_path, "w").write("")
diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
index 6dfc85e301a2eda66bade09a8b6dd0004155f385..cf86ebf0a81c5c6cd36a5edb5d61a11cdd98ae11 100644
--- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
@@ -15,7 +15,7 @@
 from __future__ import print_function
 
 import unittest
-
+import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 from paddle.fluid.framework import Program, grad_var_name
 from paddle.fluid.executor import Executor
@@ -115,10 +115,6 @@ class RecurrentOpTest1(unittest.TestCase):
     def setup_program(self):
         self.main_program = Program()
         self.startup_program = Program()
-        self.p_info = {
-            "main_program": self.main_program,
-            "startup_program": self.startup_program
-        }
         self.place = core.CPUPlace()
 
     def setUp(self):
@@ -129,33 +125,29 @@ class RecurrentOpTest1(unittest.TestCase):
         self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
         self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape)
 
-        self.output = layers.mean(self.create_rnn_op(), **self.p_info)
+        with fluid.program_guard(self.main_program, self.startup_program):
+            self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
         x = layers.data(
             shape=[self.sent_len, self.batch_size, self.input_dim],
             dtype='float32',
             name='x',
-            append_batch_size=False,
-            **self.p_info)
+            append_batch_size=False)
         x.stop_gradient = False
         h_boot = layers.data(
-            shape=[self.input_dim],
-            dtype='float32',
-            name='h_boot',
-            **self.p_info)
+            shape=[self.input_dim], dtype='float32', name='h_boot')
         h_boot.stop_gradient = False
 
-        rnn = layers.StaticRNN(main_program=self.main_program)
+        rnn = layers.StaticRNN()
         with rnn.step():
             h_pre = rnn.memory(init=h_boot)
             x_t = rnn.step_input(x)
 
             h = layers.scale(
                 x=layers.elementwise_add(
-                    x=h_pre, y=x_t, **self.p_info),
-                scale=self.py_rnn.scale,
-                **self.p_info)
+                    x=h_pre, y=x_t),
+                scale=self.py_rnn.scale)
 
             rnn.update_memory(h_pre, h)
             rnn.output(h)
@@ -193,7 +185,8 @@ class RecurrentOpTest1(unittest.TestCase):
     def test_backward(self):
         self.check_forward()
 
-        append_backward(self.output)
+        with fluid.program_guard(self.main_program, self.startup_program):
+            append_backward(self.output)
 
         ana_grad = [np.array(x) for x in self.backward()]
 
@@ -205,12 +198,8 @@ class RecurrentOpTest1(unittest.TestCase):
                     num_grad[idx], ana_grad[idx], rtol=0.1).all())
 
     def check_forward(self):
-        print('test recurrent op forward')
         pd_output = self.forward()
         py_output = self.py_rnn.forward()
-        print('pd_output', pd_output)
-        print
-        print('py_output', py_output)
         self.assertEqual(pd_output.shape, py_output.shape)
         self.assertTrue(np.isclose(pd_output, py_output, rtol=0.1).all())
 
@@ -263,24 +252,21 @@ class RecurrentOpTest2(RecurrentOpTest1):
         self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
         self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape)
 
-        self.output = layers.mean(self.create_rnn_op(), **self.p_info)
+        with fluid.program_guard(self.main_program, self.startup_program):
+            self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
         x = layers.data(
             shape=[self.sent_len, self.batch_size, self.input_dim],
             dtype='float32',
             name='x',
-            append_batch_size=False,
-            **self.p_info)
+            append_batch_size=False)
         x.stop_gradient = False
         h_boot = layers.data(
-            shape=[self.input_dim],
-            dtype='float32',
-            name='h_boot',
-            **self.p_info)
+            shape=[self.input_dim], dtype='float32', name='h_boot')
         h_boot.stop_gradient = False
 
-        rnn = layers.StaticRNN(main_program=self.main_program)
+        rnn = layers.StaticRNN()
         with rnn.step():
             h_pre = rnn.memory(init=h_boot)
             x_t = rnn.step_input(x)
@@ -288,18 +274,13 @@ class RecurrentOpTest2(RecurrentOpTest1):
             temp_l = layers.fc(input=x_t,
                                size=self.input_dim,
                                param_attr='W',
-                               bias_attr=False,
-                               **self.p_info)
+                               bias_attr=False)
             temp_r = layers.fc(input=h_pre,
                                size=self.input_dim,
                                param_attr='U',
-                               bias_attr=False,
-                               **self.p_info)
+                               bias_attr=False)
 
-            h = layers.sigmoid(
-                x=layers.elementwise_add(
-                    x=temp_l, y=temp_r, **self.p_info),
-                **self.p_info)
+            h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r))
 
             rnn.update_memory(h_pre, h)
             rnn.output(h)
@@ -362,40 +343,38 @@ class RecurrentOpMultipleMemoryTest(RecurrentOpTest1):
         self.py_rnn = RecurrentOpMultipleMemoryTest.PySimpleRNN3(
             self.input_shape, self.output_shape)
 
-        self.output = layers.mean(self.create_rnn_op(), **self.p_info)
+        with fluid.program_guard(self.main_program, self.startup_program):
+            self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
         x = layers.data(
             shape=[self.sent_len, self.batch_size, self.input_dim],
             dtype='float32',
             name='x',
-            append_batch_size=False,
-            **self.p_info)
+            append_batch_size=False)
         x.stop_gradient = False
         h_boot1 = layers.data(
             shape=[self.batch_size, self.input_dim],
             dtype='float32',
             name='h_boot1',
-            append_batch_size=False,
-            **self.p_info)
+            append_batch_size=False)
         h_boot1.stop_gradient = False
         h_boot2 = layers.data(
             shape=[self.batch_size, self.input_dim],
             dtype='float32',
             name='h_boot2',
-            append_batch_size=False,
-            **self.p_info)
+            append_batch_size=False)
         h_boot2.stop_gradient = False
 
-        rnn = layers.StaticRNN(main_program=self.main_program)
+        rnn = layers.StaticRNN()
         with rnn.step():
             h_pre1 = rnn.memory(init=h_boot1)
             h_pre2 = rnn.memory(init=h_boot2)
             x_t = rnn.step_input(x)
 
-            mem1 = layers.scale(x=h_pre1, scale=1.0, **self.p_info)
-            mem2 = layers.scale(x=h_pre2, scale=1.0, **self.p_info)
-            out = layers.sums(input=[mem1, x_t, mem2], **self.p_info)
+            mem1 = layers.scale(x=h_pre1, scale=1.0)
+            mem2 = layers.scale(x=h_pre2, scale=1.0)
+            out = layers.sums(input=[mem1, x_t, mem2])
 
             rnn.update_memory(h_pre1, mem1)
             rnn.update_memory(h_pre2, mem2)
@@ -446,23 +425,23 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1):
         self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
         self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape,
                                                             self.output_shape)
-        self.output = layers.mean(self.create_rnn_op(), **self.p_info)
-        print(self.main_program)
+
+        with fluid.program_guard(self.main_program, self.startup_program):
+            self.output = layers.mean(self.create_rnn_op())
 
     def create_rnn_op(self):
         x = layers.data(
             shape=[self.sent_len, self.batch_size, self.input_dim],
             dtype='float32',
             name='x',
-            append_batch_size=False,
-            **self.p_info)
+            append_batch_size=False)
         x.stop_gradient = False
 
-        rnn = layers.StaticRNN(main_program=self.main_program)
+        rnn = layers.StaticRNN()
         with rnn.step():
             mem_pre = rnn.memory(shape=[-1, self.input_dim], batch_ref=x)
             x_t = rnn.step_input(x)
-            mem = layers.elementwise_add(x=mem_pre, y=x_t, **self.p_info)
+            mem = layers.elementwise_add(x=mem_pre, y=x_t)
             rnn.update_memory(mem_pre, mem)
             rnn.output(mem)
 
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 8fc8125a773543eea768783155ad152c475535b5..65fc1453d8db13ad9c85746c3bf148f898e8f788 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -91,6 +91,78 @@ class TestProdOp(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestAllOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_all"
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.outputs = {'Out': self.inputs['X'].all()}
+        self.attrs = {'reduce_all': True}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAllOpWithDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_all"
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.attrs = {'dim': [1]}
+        self.outputs = {'Out': self.inputs['X'].all(axis=1)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAllOpWithKeepDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_all"
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.attrs = {'dim': [1], 'keep_dim': True}
+        self.outputs = {
+            'Out': np.expand_dims(
+                self.inputs['X'].all(axis=1), axis=1)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAnyOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_any"
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.outputs = {'Out': self.inputs['X'].any()}
+        self.attrs = {'reduce_all': True}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAnyOpWithDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_any"
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.attrs = {'dim': [1]}
+        self.outputs = {'Out': self.inputs['X'].any(axis=1)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAnyOpWithKeepDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_any"
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.attrs = {'dim': [1], 'keep_dim': True}
+        self.outputs = {
+            'Out': np.expand_dims(
+                self.inputs['X'].any(axis=1), axis=1)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class Test1DReduce(OpTest):
     def setUp(self):
         self.op_type = "reduce_sum"
diff --git a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
index 674ef2ddf44edb4246c9d952cb75b36fe3d6ddc8..0c784d3e49d85f0b5750c5e6d7307be754b43ab2 100644
--- a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
@@ -17,6 +17,7 @@ import numpy as np
 from op_test import OpTest
 
 import paddle.fluid.core as core
+import paddle.fluid as fluid
 from paddle.fluid.op import Operator
 
 
@@ -57,5 +58,26 @@ class TestSamplingIdOp(OpTest):
         pass
 
 
+class TestSamplingIdShape(unittest.TestCase):
+    def test_shape(self):
+        x = fluid.layers.data(name='x', shape=[3], dtype='float32')
+        output = fluid.layers.sampling_id(x)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place=place)
+        exe.run(fluid.default_startup_program())
+
+        feed = {
+            'x': np.array(
+                [[0.2, 0.3, 0.5], [0.2, 0.3, 0.4]], dtype='float32')
+        }
+        output_np = exe.run(feed=feed, fetch_list=[output])[0]
+
+        self.assertEqual(output.shape[0], -1)
+        self.assertEqual(len(output.shape), 1)
+        self.assertEqual(output_np.shape[0], 2)
+        self.assertEqual(len(output_np.shape), 1)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
index ae1883f1f7e44e06e378ff6d16dbc3c5060027e4..ec10b634091fc521062457b780b0c4cafcbacec0 100644
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -149,5 +149,98 @@ class TestSigmoidCrossEntropyWithNorm(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestSigmoidCrossEntropyWithLogitsOp5(OpTest):
+    """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
+    """
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = [10, 10]
+        num_classes = 20
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
+                .astype("float32")),
+            'Label': np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
+            .astype("float32")
+        }
+
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        self.outputs = {'Out': -term1 - term2}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestSigmoidCrossEntropyWithNorm2(OpTest):
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = [10, 10]
+        num_classes = 20
+        ignore_index = -1
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
+                .astype("float32")),
+            'Label': np.random.randint(-1, 2, tuple(batch_size + [num_classes]))
+            .astype("float32")
+        }
+        self.attrs = {'ignore_index': ignore_index, 'normalize': True}
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        out = -term1 - term2
+        out[np.where(self.inputs['Label'] == ignore_index)] = 0
+        if self.attrs['normalize']:
+            out = out / float(
+                np.where(self.inputs['Label'] != ignore_index)[0].size)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestSigmoidCrossEntropyWithLogitsOp6(OpTest):
+    """Test sigmoid_cross_entropy_with_logit_op with binary label
+    """
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = [10, 10]
+        num_classes = 20
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, tuple(batch_size + [num_classes]))
+                .astype("float32")),
+            'Label': np.random.randint(0, 2, tuple(batch_size + [num_classes]))
+            .astype("float32")
+        }
+
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        self.outputs = {'Out': -term1 - term2}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
index b0494f114c5f7f4449e87ec67b97924fe77cd8c9..b06b52f75d21a720e2473feba6ba2e1dccc2db89 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
@@ -195,5 +195,144 @@ class TestSoftmaxWithCrossEntropyOp3NoCudnn(TestSoftmaxWithCrossEntropyOp3):
         self.numeric_stable_mode = True
 
 
+class TestSoftmaxWithCrossEntropyOp5(OpTest):
+    """
+    Test softmax with cross entropy operator with ignore_index.
+    """
+
+    def initParams(self):
+        self.numeric_stable_mode = False
+
+    def setUp(self):
+        self.initParams()
+        self.op_type = "softmax_with_cross_entropy"
+        batch_size = [6, 10]
+        class_num = 47
+
+        logits = np.random.uniform(
+            0.1, 1.0, tuple(batch_size + [class_num])).astype("float64")
+        softmax = np.apply_along_axis(stable_softmax, 2, logits)
+        labels = np.random.randint(
+            0, class_num, tuple(batch_size + [1]), dtype="int64")
+        ignore_index = 7
+
+        softmax_2d = np.reshape(softmax, [-1, class_num])
+        labels_2d = np.reshape(labels, [-1, 1])
+        cross_entropy = np.asmatrix(
+            [[-np.log(softmax_2d[i][labels_2d[i][0]])]
+             if labels_2d[i] != ignore_index else [0]
+             for i in range(softmax_2d.shape[0])],
+            dtype="float64")
+
+        cross_entropy = np.reshape(cross_entropy, batch_size)
+
+        output_shape = tuple(batch_size + [1])
+        output_res = cross_entropy.astype("float64")
+        output_res = np.expand_dims(output_res, axis=2)
+        self.inputs = {"Logits": logits, "Label": labels}
+        self.outputs = {
+            "Softmax": softmax.astype("float64"),
+            "Loss": output_res,
+        }
+        self.attrs = {
+            "ignore_index": ignore_index,
+            "numeric_stable_mode": self.numeric_stable_mode
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss")
+
+
+class TestSoftmaxWithCrossEntropyOp5NoCudnn(TestSoftmaxWithCrossEntropyOp5):
+    def initParams(self):
+        self.numeric_stable_mode = True
+
+
+class TestSoftmaxWithCrossEntropyOp6(OpTest):
+    """
+    Test softmax with cross entropy operator with soft labels.
+    """
+
+    def setUp(self):
+        self.op_type = "softmax_with_cross_entropy"
+        batch_size = [6, 10]
+        class_num = 37
+
+        logits = np.random.uniform(
+            0.1, 1.0, tuple(batch_size + [class_num])).astype("float64")
+        softmax = np.apply_along_axis(stable_softmax, 2, logits)
+        labels = np.random.uniform(
+            0.1, 1.0, tuple(batch_size + [class_num])).astype("float64")
+        labels /= np.sum(labels, axis=2, keepdims=True)
+
+        cross_entropy = (-labels * np.log(softmax)).sum(
+            axis=2, keepdims=True).astype("float64")
+
+        self.inputs = {"Logits": logits, "Label": labels}
+        self.outputs = {
+            "Softmax": softmax.astype("float64"),
+            "Loss": cross_entropy.astype("float64")
+        }
+        self.attrs = {"soft_label": True}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss")
+
+
+class TestSoftmaxWithCrossEntropyOpFp16_2(TestSoftmaxWithCrossEntropyOp):
+    def initParams(self):
+        self.numeric_stable_mode = False
+        self.dtype = np.float16
+
+    def setUp(self):
+        self.initParams()
+        self.op_type = "softmax_with_cross_entropy"
+        batch_size = [64, 10]
+        class_num = 37
+
+        # NOTE: numpy float16 have very low accuracy, use float32 for numpy check.
+        logits = np.random.uniform(
+            0.1, 1.0, tuple(batch_size + [class_num])).astype(np.float32)
+        softmax = np.apply_along_axis(stable_softmax, 2, logits)
+        labels = np.random.randint(
+            0, class_num, tuple(batch_size + [1]), dtype="int64")
+
+        softmax_2d = np.reshape(softmax, [-1, class_num])
+        labels_2d = np.reshape(labels, [-1, 1])
+
+        cross_entropy = np.asmatrix(
+            [[-np.log(softmax_2d[i][labels_2d[i][0]])]
+             for i in range(softmax_2d.shape[0])],
+            dtype=np.float32)
+
+        cross_entropy = np.reshape(cross_entropy, batch_size)
+        output_shape = tuple(batch_size + [1])
+        output_res = cross_entropy.astype(self.dtype)
+        output_res = np.expand_dims(output_res, axis=2)
+        self.inputs = {"Logits": logits, "Label": labels}
+
+        self.inputs = {
+            "Logits": logits.astype(self.dtype).view(np.uint16),
+            "Label": labels
+        }
+        self.outputs = {
+            "Softmax": softmax.astype(self.dtype),
+            "Loss": output_res,
+        }
+        self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
+
+    def test_check_output(self):
+        self.check_output(atol=1e-2)
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py
index 380c404fb2d6a36bf3732ebbff4b6cef22f71362..c742ee002aa6c470c41d46978a4e08fc774c3152 100644
--- a/python/paddle/fluid/trainer_desc.py
+++ b/python/paddle/fluid/trainer_desc.py
@@ -28,10 +28,10 @@ class TrainerDesc(object):
         import multiprocessing as mp
         # set default thread num == cpu count
         self.proto_desc.thread_num = mp.cpu_count()
-        self.fleet_desc_ = None
-        self.device_worker_ = None
-        self.program_ = None
-        self.infer_ = False
+        self._fleet_desc = None
+        self._device_worker = None
+        self._program = None
+        self._infer = False
 
     def _set_fetch_var_and_info(self, fetch_vars, fetch_info, print_period):
         for i, v in enumerate(fetch_vars):
@@ -47,19 +47,19 @@ class TrainerDesc(object):
         self.proto_desc.thread_num = thread_num
 
     def _set_device_worker(self, device_worker):
-        self.device_worker_ = device_worker
+        self._device_worker = device_worker
 
     def _set_infer(self, infer):
-        self.infer_ = infer
+        self._infer = infer
 
     def _set_fleet_desc(self, fleet_desc):
-        self.fleet_desc_ = fleet_desc
+        self._fleet_desc = fleet_desc
 
     def _gen_trainer_desc(self):
         pass
 
     def _set_program(self, program):
-        self.program_ = program
+        self._program = program
 
     def _desc(self):
         from google.protobuf import text_format
@@ -73,13 +73,13 @@ class MultiTrainer(TrainerDesc):
 
     def _set_program(self, program):
         super(MultiTrainer, self)._set_program(program)
-        self.program_ = program
+        self._program = program
 
     def _gen_trainer_desc(self):
         super(MultiTrainer, self)._gen_trainer_desc()
         self.proto_desc.class_name = "MultiTrainer"
-        self.device_worker_._set_infer(self.infer_)
-        self.device_worker_._gen_worker_desc(self.proto_desc)
+        self._device_worker._set_infer(self._infer)
+        self._device_worker._gen_worker_desc(self.proto_desc)
 
 
 class DistMultiTrainer(TrainerDesc):
@@ -89,13 +89,13 @@ class DistMultiTrainer(TrainerDesc):
 
     def _set_program(self, program):
         super(DistMultiTrainer, self)._set_program(program)
-        self.program_ = program
+        self._program = program
 
     def _gen_trainer_desc(self):
         super(DistMultiTrainer, self)._gen_trainer_desc()
         self.proto_desc.class_name = "DistMultiTrainer"
-        if self.program_ == None:
+        if self._program == None:
             raise RuntimeError("None Program")
-        self.device_worker_._set_infer(self.infer_)
-        self.device_worker_._set_program(self.program_)
-        self.device_worker_._gen_worker_desc(self.proto_desc)
+        self._device_worker._set_infer(self._infer)
+        self._device_worker._set_program(self._program)
+        self._device_worker._gen_worker_desc(self.proto_desc)
diff --git a/python/paddle/fluid/trainer_factory.py b/python/paddle/fluid/trainer_factory.py
index 4e957880f77a41d3dad9582bc7cc09af1d1a253b..871b663663e87a08ef3edaf58a4480b85caf4c4a 100644
--- a/python/paddle/fluid/trainer_factory.py
+++ b/python/paddle/fluid/trainer_factory.py
@@ -12,6 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from .trainer_desc import MultiTrainer, DistMultiTrainer
+from .device_worker import Hogwild, DownpourSGD
+
 __all__ = ["TrainerFactory"]
 
 
@@ -20,8 +23,6 @@ class TrainerFactory(object):
         pass
 
     def _create_trainer(self, opt_info=None):
-        from .trainer_desc import MultiTrainer, DistMultiTrainer
-        from .device_worker import Hogwild, DownpourSGD
         trainer = None
         device_worker = None
         if opt_info == None:
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 41e5f47976c566306ad141f655a0f6516831d690..19a1f8bf74060905ecb4b81b44f7080db79c45e4 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -658,6 +658,7 @@ class DistributeTranspiler(object):
                 outputs={"Out": splited_var},
                 attrs={
                     "epmap": eps,
+                    "trainer_id": self.trainer_id,
                     RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
                 })
 
@@ -669,6 +670,7 @@ class DistributeTranspiler(object):
             outputs={"Out": fetch_barrier_out},
             attrs={
                 "endpoints": self.pserver_endpoints,
+                "trainer_id": self.trainer_id,
                 RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE
             })
 
@@ -791,11 +793,15 @@ class DistributeTranspiler(object):
 
         global_ops = []
 
+        # sparse grad name to param name
+        sparse_grad_to_param = []
+
         def __append_optimize_op__(op, block, grad_to_block_id, merged_var,
                                    lr_ops):
             if self._is_optimizer_op(op):
                 self._append_pserver_ops(block, op, endpoint, grad_to_block_id,
-                                         self.origin_program, merged_var)
+                                         self.origin_program, merged_var,
+                                         sparse_grad_to_param)
             elif op not in lr_ops:
                 self._append_pserver_non_opt_ops(block, op)
 
@@ -911,6 +917,7 @@ class DistributeTranspiler(object):
             "Fanin": self.trainer_num,
             "sync_mode": self.sync_mode,
             "grad_to_block_id": grad_to_block_id,
+            "sparse_grad_to_param": sparse_grad_to_param,
         }
 
         if self.has_distributed_lookup_table:
@@ -1779,7 +1786,8 @@ class DistributeTranspiler(object):
         return o4
 
     def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
-                            grad_to_block_id, origin_program, merged_var):
+                            grad_to_block_id, origin_program, merged_var,
+                            sparse_grad_to_param):
         program = optimize_block.program
         pserver_block = program.global_block()
         new_inputs = collections.OrderedDict()
@@ -1863,6 +1871,12 @@ class DistributeTranspiler(object):
             outputs=outputs,
             attrs=opt_op.all_attrs())
 
+        # record sparse grad to param name
+        if new_inputs["Grad"].type == core.VarDesc.VarType.SELECTED_ROWS:
+            sparse_grad_to_param.append(
+                str(new_inputs["Grad"].name) + ":" + str(new_inputs["Param"]
+                                                         .name))
+
     def _get_pserver_grad_param_var(self, var, var_dict):
         """
         Return pserver side grad/param variable, return None