diff --git a/cmake/configure.cmake b/cmake/configure.cmake index ce1857582bd3e8ab3077158384beaae36a83a4b2..e9852f00b1835adec31373f58ac538f9685251e0 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -62,8 +62,26 @@ if(NOT CMAKE_CROSSCOMPILING) endif() if(WIN32) - # windows stupid compile option for all targets. + # windows header option for all targets. add_definitions(-D_XKEYCHECK_H) + # Use response files instead of absolute paths to reduce the cmake link command length. + SET(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1) + SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1) + SET(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) + SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1) + SET(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1) + SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1) + SET(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@") + SET(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@") + + # Specify the program to use when building static libraries + SET(CMAKE_C_CREATE_STATIC_LIBRARY " lib ") + SET(CMAKE_CXX_CREATE_STATIC_LIBRARY " lib ") + + # set definition for the dll export + if (NOT MSVC) + message(FATAL_ERROR "The Windows build only supports MSVC, which is the host compiler required by NVIDIA's nvcc.") + endif(NOT MSVC) endif(WIN32) if(NOT WITH_GOLANG) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index a67512578147fc7223714dbc4cd124b831fb4775..5bf82b4ddf10a646ca540ac4ee2cfd3d3bc6cf58 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -110,6 +110,20 @@ function(find_fluid_modules TARGET_NAME) endif() endfunction(find_fluid_modules) +# find all third_party modules used by the paddle static library, +# to reduce the dependencies when building the inference libs. +set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY) +function(find_fluid_thirdparties TARGET_NAME) + get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) + string(FIND "${__target_path}" "third_party" pos) + if(pos GREATER 1) + get_property(fluid_third_partys GLOBAL PROPERTY FLUID_THIRD_PARTY) + set(fluid_third_partys ${fluid_third_partys} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY "${fluid_third_partys}") + endif() +endfunction(find_fluid_thirdparties) + function(merge_static_libs TARGET_NAME) set(libs ${ARGN}) list(REMOVE_DUPLICATES libs) @@ -204,18 +218,13 @@ function(merge_static_libs TARGET_NAME) foreach(lib ${libs}) # Get the file names of the libraries to be merged - #if(NOT $<TARGET_FILE:${lib}> MATCHES "lib.*\\.lib") - # message("library" ${lib}) - # set(libfiles ${libfiles} lib$<TARGET_FILE:${lib}>) - #else() set(libfiles ${libfiles} $<TARGET_FILE:${lib}>) - #endif() endforeach() - - # windows cmd return error in clean env. - # COMMAND del "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" + # msvc will put the library under "/Release/xxx.lib" by default + # COMMAND cmake -E remove "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/${TARGET_NAME}.lib" add_custom_command(TARGET ${TARGET_NAME} POST_BUILD - COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.lib ${libfiles} + COMMAND cmake -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}" + COMMAND lib /OUT:${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}/lib${TARGET_NAME}.lib ${libfiles} ) endif(WIN32) endfunction(merge_static_libs) diff --git a/doc/README.md b/doc/README.md index 77aa2a5322057d582435e83f38476833d1f73c48..998a39f10699af6d1a391f177a5cf03c9ae170fd 100644 --- a/doc/README.md +++ b/doc/README.md @@ -2,6 +2,6 @@ Thanks for reading PaddlePaddle documentation.
-Since **September 17th, 2018**, the **0.15.0 and develop** documentation source has been moved to [Fluiddoc Repo](https://github.com/PaddlePaddle/Paddle) and updated in Fluiddoc Repo. +Since **September 17th, 2018**, the **0.15.0 and develop** documentation source has been moved to [FluidDoc Repo](https://github.com/PaddlePaddle/FluidDoc) and updated there. -Please turn to Fluiddoc Repo for the latest documentation. +Please turn to FluidDoc Repo for the latest documentation. diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c4a65bb13e25c550b3cdd08a0f1c8d11bee5abe8..d3583cf894991624f537a4073f14aacc470aadd0 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -22,9 +22,6 @@ paddle.fluid.Operator.rename_input ArgSpec(args=['self', 'old_name', 'new_name'] paddle.fluid.Operator.rename_output ArgSpec(args=['self', 'old_name', 'new_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.set_attr ArgSpec(args=['self', 'name', 'val'], varargs=None, keywords=None, defaults=None) paddle.fluid.Operator.to_string ArgSpec(args=['self', 'throw_on_error'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Parameter.__init__ ArgSpec(args=['self', 'block', 'shape', 'dtype'], varargs=None, keywords='kwargs', defaults=None) -paddle.fluid.Parameter.astype ArgSpec(args=['self', 'dtype'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Parameter.to_string ArgSpec(args=['self', 'throw_on_error', 'with_details'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.default_startup_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.default_main_program ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.program_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) @@ -35,29 +32,16 @@ paddle.fluid.Executor.close ArgSpec(args=['self'], varargs=None, keywords=None, paddle.fluid.Executor.run ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)) paddle.fluid.global_scope ArgSpec(args=[], varargs=None, keywords=None, defaults=None) paddle.fluid.scope_guard ArgSpec(args=[], varargs='args', keywords='kwds', defaults=None) -paddle.fluid.Trainer.__init__ ArgSpec(args=['self', 'train_func', 'optimizer_func', 'param_path', 'place', 'parallel', 'checkpoint_config'], varargs=None, keywords=None, defaults=(None, None, False, None)) -paddle.fluid.Trainer.save_inference_model ArgSpec(args=['self', 'param_path', 'feeded_var_names', 'target_var_indexes'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Trainer.save_params ArgSpec(args=['self', 'param_path'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Trainer.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Trainer.test ArgSpec(args=['self', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=None) -paddle.fluid.Trainer.train ArgSpec(args=['self', 'num_epochs', 'event_handler', 'reader', 'feed_order'], varargs=None, keywords=None, defaults=(None, None)) -paddle.fluid.BeginEpochEvent.__init__ ArgSpec(args=['self', 'epoch_id'], varargs=None, keywords=None, defaults=None) -paddle.fluid.EndEpochEvent.__init__ ArgSpec(args=['self', 'epoch_id'], varargs=None, keywords=None, defaults=None) -paddle.fluid.BeginStepEvent.__init__ ArgSpec(args=['self', 'epoch_id', 'step_id'], varargs=None, keywords=None, 
defaults=None) -paddle.fluid.EndStepEvent.__init__ ArgSpec(args=['self', 'epoch_id', 'step_id', 'metrics'], varargs=None, keywords=None, defaults=None) -paddle.fluid.CheckpointConfig.__init__ ArgSpec(args=['self', 'checkpoint_dir', 'max_num_checkpoints', 'epoch_interval', 'step_interval'], varargs=None, keywords=None, defaults=(None, 3, 1, 10)) -paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path', 'place', 'parallel'], varargs=None, keywords=None, defaults=(None, False)) -paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,)) paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) -paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) +paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) paddle.fluid.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DistributeTranspilerConfig.__init__ -paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords='kwargs', defaults=(None, None, None, None, None, 1, 0, None)) +paddle.fluid.ParallelExecutor.__init__ ArgSpec(args=['self', 'use_cuda', 'loss_name', 'main_program', 'share_vars_from', 'exec_strategy', 'build_strategy', 'num_trainers', 'trainer_id', 'scope'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 1, 0, None)) paddle.fluid.ParallelExecutor.run ArgSpec(args=['self', 'fetch_list', 'feed', 'feed_dict', 'return_numpy'], varargs=None, keywords=None, defaults=(None, None, True)) paddle.fluid.ExecutionStrategy.__init__ __init__(self: paddle.fluid.core.ExecutionStrategy) -> None paddle.fluid.BuildStrategy.GradientScaleStrategy.__init__ __init__(self: paddle.fluid.core.GradientScaleStrategy, arg0: int) -> None @@ -178,14 +162,14 @@ paddle.fluid.layers.unstack ArgSpec(args=['x', 'axis', 'num'], varargs=None, key paddle.fluid.layers.sequence_enumerate ArgSpec(args=['input', 'win_size', 'pad_value', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.expand ArgSpec(args=['x', 'expand_times', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_concat ArgSpec(args=['input', 'name'], 
varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None)) -paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) -paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) -paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) -paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) -paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) -paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) -paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(-1, False, None, None)) +paddle.fluid.layers.scale ArgSpec(args=['x', 'scale', 'bias', 'bias_after_scale', 'out', 'act', 'name'], varargs=None, keywords=None, defaults=(1.0, 0.0, True, None, None, None)) +paddle.fluid.layers.elementwise_add ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.elementwise_div ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.elementwise_sub ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.elementwise_mul ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.elementwise_max ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.elementwise_min ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) +paddle.fluid.layers.elementwise_pow ArgSpec(args=['x', 'y', 'out', 'axis', 'use_mkldnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, -1, False, None, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -265,12 +249,12 @@ paddle.fluid.layers.sum ArgSpec(args=[], varargs='args', keywords='kwargs', defa paddle.fluid.layers.slice ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.shape ArgSpec(args=[], varargs='args', keywords='kwargs', 
defaults=None) paddle.fluid.layers.maxout ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) -paddle.fluid.layers.softshrink ArgSpec(args=[], varargs='args', keywords='kwargs', defaults=None) paddle.fluid.layers.sigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.logsigmoid ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.exp ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.tanh ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.tanh_shrink ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.softshrink ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sqrt ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.abs ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.ceil ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) @@ -336,7 +320,7 @@ paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=[' paddle.fluid.transpiler.DistributeTranspiler.get_pserver_programs ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self', 'wait_port'], varargs=None, keywords=None, defaults=(True,)) -paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None)) +paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode', 'startup_program', 'current_endpoint'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True, None, '127.0.0.1:6174')) paddle.fluid.transpiler.memory_optimize ArgSpec(args=['input_program', 'skip_opt_set', 'print_log', 'level'], varargs=None, keywords=None, defaults=(None, False, 0)) paddle.fluid.transpiler.release_memory ArgSpec(args=['input_program', 'skip_opt_set'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.transpiler.HashName.__init__ ArgSpec(args=['self', 'pserver_endpoints'], varargs=None, keywords=None, defaults=None) @@ -350,6 +334,7 @@ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'fi paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) +paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn', 'use_mkldnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', 
True, False)) paddle.fluid.optimizer.SGDOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'regularization', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.optimizer.SGDOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'momentum', 'use_nesterov', 'regularization', 'name'], varargs=None, keywords=None, defaults=(False, None, None)) @@ -389,7 +374,7 @@ paddle.fluid.CPUPlace.__init__ __init__(self: paddle.fluid.core.CPUPlace) -> Non paddle.fluid.CUDAPlace.__init__ __init__(self: paddle.fluid.core.CUDAPlace, arg0: int) -> None paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinnedPlace) -> None paddle.fluid.ParamAttr.__init__ ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)) -paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim'], varargs=None, keywords='kwargs', defaults=(None,)) +paddle.fluid.WeightNormParamAttr.__init__ ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)) paddle.fluid.DataFeeder.__init__ ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.DataFeeder.decorate_reader ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)) paddle.fluid.DataFeeder.feed ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index 4ea1df655df005ba7585fb67fb0a3c3411a76418..2b265a773fe967f5b2ab38ce795b0f599d859c2a 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -13,6 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +// logging.h and windows.h conflict +#define GLOG_NO_ABBREVIATED_SEVERITIES +// solve static linking error in windows +// https://github.com/google/glog/issues/301 +#define GOOGLE_GLOG_DLL_DECL #include "paddle/fluid/framework/tensor.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -46,11 +51,13 @@ struct EigenTensor { using ConstType = Eigen::TensorMap>; - static Type From(Tensor& tensor, DDim dims) { + static Type From(Tensor& tensor, DDim dims) { // NOLINT return Type(tensor.data(), EigenDim::From(dims)); } - static Type From(Tensor& tensor) { return From(tensor, tensor.dims_); } + static Type From(Tensor& tensor) { // NOLINT + return From(tensor, tensor.dims_); + } // NOLINT static ConstType From(const Tensor& tensor, DDim dims) { return ConstType(tensor.data(), EigenDim::From(dims)); @@ -64,7 +71,8 @@ struct EigenTensor { template struct EigenMatrix : public EigenTensor { - static typename EigenMatrix::Type Reshape(Tensor& tensor, int num_col_dims) { + static typename EigenMatrix::Type Reshape(Tensor& tensor, // NOLINT + int num_col_dims) { int rank = tensor.dims_.size(); PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank, "`num_col_dims` must be between (0, rank_of_tensor)."); @@ -86,11 +94,12 @@ template struct EigenVector : public EigenTensor { // Flatten reshapes a Tensor into an EigenVector. - static typename EigenVector::Type Flatten(Tensor& tensor) { + static typename EigenVector::Type Flatten(Tensor& tensor) { // NOLINT return EigenVector::From(tensor, {product(tensor.dims_)}); } - static typename EigenVector::ConstType Flatten(const Tensor& tensor) { + static typename EigenVector::ConstType Flatten( + const Tensor& tensor) { // NOLINT return EigenVector::From(tensor, {product(tensor.dims_)}); } }; @@ -104,7 +113,7 @@ struct EigenScalar { using ConstType = Eigen::TensorMap< Eigen::TensorFixedSize, MajorType, IndexType>>; - static Type From(Tensor& tensor) { return Type(tensor.data()); } + static Type From(Tensor& tensor) { return Type(tensor.data()); } // NOLINT static ConstType From(const Tensor& tensor) { return ConstType(tensor.data()); diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc index 09c5ec59d66445bdbd5349447b125be89cb2efdf..d7df6389cfd595324e284e0da10f65213ccee80f 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc @@ -26,8 +26,6 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( PADDLE_ENFORCE(graph.get()); FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get()); - std::unordered_set nodes2delete; - GraphPatternDetector gpd; auto* conv_input = gpd.mutable_pattern() ->NewNode("conv_relu_mkldnn_fuse/conv_input") @@ -42,36 +40,20 @@ std::unique_ptr ConvReLUFusePass::ApplyImpl( Graph* g) { VLOG(4) << "handle ConvReLU fuse"; GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, - conv_relu_pattern); // Filter - GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern); // Bias - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp + conv_relu_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern); // tmp GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern); // CONV op GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern); // Out GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern); // ReLU op - // Create an ConvReLU Node. 
- OpDesc desc; - std::string conv_relu_i_in = subgraph.at(conv_input)->Name(); - std::string conv_relu_w_in = conv_weight->Name(); - std::string conv_relu_b_in = conv_bias->Name(); - std::string conv_relu_out = relu_out->Name(); - desc.SetInput("Input", std::vector({conv_relu_i_in})); - desc.SetInput("Filter", std::vector({conv_relu_w_in})); - desc.SetInput("Bias", std::vector({conv_relu_b_in})); - desc.SetOutput("Output", std::vector({conv_relu_out})); - desc.SetType("conv2d"); - for (auto& attr : conv->Op()->GetAttrMap()) { - desc.SetAttr(attr.first, attr.second); - } - desc.SetAttr("fuse_relu", true); - auto conv_relu_node = g->CreateOpNode(&desc); // OpDesc will be copied. - GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out}); + // Transform Conv node into ConvReLU node. + OpDesc* desc = conv->Op(); + desc->SetOutput("Output", std::vector({relu_out->Name()})); + desc->SetAttr("fuse_relu", true); + GraphSafeRemoveNodes(graph.get(), {relu, conv_out}); PADDLE_ENFORCE(subgraph.count(conv_input)); - IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node); - IR_NODE_LINK_TO(conv_weight, conv_relu_node); - IR_NODE_LINK_TO(conv_bias, conv_relu_node); - IR_NODE_LINK_TO(conv_relu_node, relu_out); + IR_NODE_LINK_TO(conv, relu_out); found_conv_relu_count++; }; diff --git a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc index 82b5fa1886098ca3b19c147c307d3f2fc3ba03d6..9dd780ec89ab991d6d99cb66fa2a9b683be2b9ca 100644 --- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc @@ -85,16 +85,13 @@ TEST(ConvReLUFusePass, basic) { for (auto* node : graph->Nodes()) { if (node->IsOp() && node->Op()->Type() == "conv2d") { - if (node->Op()->HasAttr("use_mkldnn")) { - bool use_mkldnn = boost::get(node->Op()->GetAttr("use_mkldnn")); - if (use_mkldnn) { - if (node->Op()->HasAttr("fuse_relu")) { - bool fuse_relu = boost::get(node->Op()->GetAttr("fuse_relu")); - if (fuse_relu) { - ++conv_relu_count; - } - } - } + auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + EXPECT_TRUE(boost::get(op->GetAttr("use_mkldnn"))); + ASSERT_TRUE(op->HasAttr("fuse_relu")); + bool fuse_relu = boost::get(op->GetAttr("fuse_relu")); + if (fuse_relu) { + ++conv_relu_count; } } } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index ef5113819696238d4e06b826bf43064b0f368dea..6d2c51b0e9bed8461f6491b84a36a3bf6663a138 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -638,11 +638,6 @@ PDNode *patterns::ConvReLU::operator()( ->AsInput() ->assert_is_persistable_var() ->assert_is_op_input("conv2d", "Filter"); - // Bias - auto *conv_bias_var = pattern->NewNode(conv_bias_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input("conv2d", "Bias"); // intermediate variable, will be removed in the IR after fuse. 
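// (After ConvReLUFusePass runs, the matched conv op is reused as the fused op:
// its Output is rewired to relu_out, fuse_relu is set to true, and the relu op
// together with this intermediate variable is removed from the graph.)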
auto *conv_out_var = pattern->NewNode(conv_out_repr()) ->AsIntermediate() @@ -653,8 +648,7 @@ PDNode *patterns::ConvReLU::operator()( ->AsOutput() ->assert_is_op_output("relu"); - conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var}) - .LinksTo({conv_out_var}); + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var}); return relu_out_var; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 46950ed877cda7cd83ead4e9aa9a3aaae5d5ecfa..69b486c29d8bd1102a8372f5041051c25ce19359 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -379,7 +379,7 @@ struct PatternBase { // op: conv + relu // named nodes: // conv_input, conv_weight, -// conv_bias, conv_out, conv, +// conv_out, conv, // relu_out, relu struct ConvReLU : public PatternBase { ConvReLU(PDPattern* pattern, const std::string& name_scope) @@ -392,7 +392,6 @@ struct ConvReLU : public PatternBase { PATTERN_DECL_NODE(relu); // declare variable node's name PATTERN_DECL_NODE(conv_weight); - PATTERN_DECL_NODE(conv_bias); PATTERN_DECL_NODE(conv_out); PATTERN_DECL_NODE(relu_out); }; diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc index df2a7a27ca4a6011b214202ac9bf4f30dc482ece..2663c9be41a834523fb896b490e7e75df256de05 100644 --- a/paddle/fluid/framework/op_proto_maker.cc +++ b/paddle/fluid/framework/op_proto_maker.cc @@ -132,7 +132,9 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto, AddAttr<std::string>(OpNamescopeAttrName(), "Operator name with namesope.") .SetDefault(""); - + AddAttr<std::vector<std::string>>(OpCreationCallstackAttrName(), + "Callstack for Op Creation.") + .SetDefault({}); Validate(); } diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h index 4ed3cc45d66849267ef4945a03da1db76b53e4ea..f13196959705bad473a6f7b3ef88f8faa8abe2b8 100644 --- a/paddle/fluid/framework/op_proto_maker.h +++ b/paddle/fluid/framework/op_proto_maker.h @@ -46,6 +46,7 @@ class OpProtoAndCheckerMaker { static const char *OpRoleAttrName() { return "op_role"; } static const char *OpRoleVarAttrName() { return "op_role_var"; } static const char *OpNamescopeAttrName() { return "op_namescope"; } + static const char *OpCreationCallstackAttrName() { return "op_callstack"; } void operator()(proto::OpProto *proto, OpAttrChecker *attr_checker); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index e7dfa608b48f89a2155e43c7e63e31154675cd38..ef2eb334a4e7f3f482ba6d62d3f325f109c69302 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,6 +23,11 @@ limitations under the License.
*/ #include #include +#if defined(_WIN32) +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc: logging.h conflicts with windows.h +#define GOOGLE_GLOG_DLL_DECL +#endif + #include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" @@ -241,22 +246,20 @@ struct OpKernelRegistrarFunctorEx diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc #include - #include - +#include +#include +#include #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/platform/profiler.h" @@ -137,19 +142,48 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } void OperatorBase::Run(const Scope& scope, const platform::Place& place) { - VLOG(4) << place << " " << DebugStringEx(&scope); - if (platform::is_gpu_place(place)) { + try { + if (VLOG_IS_ON(4)) { + VLOG(4) << place << " " << DebugStringEx(&scope); + } + if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("Cannot run operator on place %s", place); + PADDLE_THROW("Cannot run operator on place %s", place); #else - auto dev_id = boost::get<platform::CUDAPlace>(place).device; - platform::SetDeviceId(dev_id); + auto dev_id = boost::get<platform::CUDAPlace>(place).device; + platform::SetDeviceId(dev_id); #endif + } + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + platform::RecordEvent record_event(Type(), pool.Get(place)); + RunImpl(scope, place); + if (VLOG_IS_ON(3)) { + VLOG(3) << place << " " << DebugStringEx(&scope); + } + } catch (platform::EnforceNotMet exception) { + if (Attrs().count("sub_block") != 0) { + throw exception; + } + + auto& callstack = Attr<std::vector<std::string>>( + OpProtoAndCheckerMaker::OpCreationCallstackAttrName()); + + if (callstack.empty()) { + throw exception; + } + std::ostringstream sout; + sout << "Invoke operator " << Type() << " error.\n"; + sout << "Python Callstacks: \n"; + for (auto& line : callstack) { + sout << line; + } + sout << "C++ Callstacks: \n"; + sout << exception.err_str_; + exception.err_str_ = sout.str(); + throw exception; + } catch (...) { + std::rethrow_exception(std::current_exception()); } - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - platform::RecordEvent record_event(Type(), pool.Get(place)); - RunImpl(scope, place); - VLOG(3) << place << " " << DebugStringEx(&scope); } bool OperatorBase::HasInputs(const std::string& name) const { @@ -177,7 +211,7 @@ const std::vector<std::string>& OperatorBase::Inputs( } bool OperatorBase::HasOutputs(const std::string& name) const { - if (outputs_.find(name) != outputs_.end()) { + if (outputs_.end() != outputs_.find(name)) { return true; } else { return false; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 1040eb882baea624e972faf4af3094119df72308..626b50edfd39424473be33e9f8baec5970471477 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -20,6 +20,8 @@ limitations under the License.
*/ #include #include #include +#define GLOG_NO_ABBREVIATED_SEVERITIES +#define GOOGLE_GLOG_DLL_DECL #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/attribute.h" diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 684e0ce0e292d852d4601ebd1ccd920382e42c8b..1032aadcbda4f1b05841e08e1abe7c737c3aeb9c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -71,15 +71,15 @@ bool AnalysisPredictor::Init( inference_program_ = paddle::inference::Load( executor_.get(), scope_.get(), config_.prog_file, config_.param_file); } else { - LOG(ERROR) << "fail to load inference model."; + LOG(ERROR) << "fail to load inference model from " << config_.model_dir; return false; } OptimizeInferenceProgram(); - ctx_ = executor_->Prepare(*inference_program_, 0); if (config_._use_mkldnn) { executor_->EnableMKLDNN(*inference_program_); } + ctx_ = executor_->Prepare(*inference_program_, 0); VLOG(5) << "to create variables"; PADDLE_ENFORCE(scope_.get()); @@ -109,8 +109,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } argument_.origin_program_desc.reset( new ProgramDesc(*inference_program_->Proto())); - PADDLE_ENFORCE(config_.ir_mode == AnalysisConfig::IrPassMode::kExclude, - "Only kExclude is supported yet."); + PADDLE_ENFORCE( + config_.ir_mode == contrib::AnalysisConfig::IrPassMode::kExclude, + "Only kExclude is supported yet."); Analyzer().DisableIrPasses(config_.ir_passes).Run(&argument_); CHECK(argument_.transformed_program_desc); @@ -126,8 +127,9 @@ void AnalysisPredictor::OptimizeInferenceProgram() { } template <> -std::unique_ptr CreatePaddlePredictor< - AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig& config) { +std::unique_ptr +CreatePaddlePredictor( + const contrib::AnalysisConfig& config) { VLOG(3) << "create AnalysisConfig"; if (config.use_gpu) { // 1. 
GPU memeroy @@ -154,4 +156,11 @@ std::unique_ptr CreatePaddlePredictor< return predictor; } +template <> +std::unique_ptr CreatePaddlePredictor( + const contrib::AnalysisConfig& config) { + return CreatePaddlePredictor(config); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index e53925366e9214cd60422efe56884751297c15e5..aa00e8be5c28c2e3bfe74fa0bff2c72210bd106e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -30,7 +30,7 @@ using framework::proto::ProgramDesc; */ class AnalysisPredictor : public NativePaddlePredictor { public: - explicit AnalysisPredictor(const AnalysisConfig& config) + explicit AnalysisPredictor(const contrib::AnalysisConfig& config) : NativePaddlePredictor(config), config_(config) {} bool Init(const std::shared_ptr& parent_scope); @@ -46,7 +46,7 @@ class AnalysisPredictor : public NativePaddlePredictor { Argument& analysis_argument() { return argument_; } private: - AnalysisConfig config_; + contrib::AnalysisConfig config_; Argument argument_; }; diff --git a/paddle/fluid/inference/api/api_anakin_engine.cc b/paddle/fluid/inference/api/api_anakin_engine.cc index 43b31269d2bd82c06e284e3599a3763da693a2af..2c4894fd887f2f509dc7ab88c367cea5c1aed99a 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.cc +++ b/paddle/fluid/inference/api/api_anakin_engine.cc @@ -31,21 +31,24 @@ namespace paddle { +using paddle::contrib::AnakinConfig; + template PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( - const AnakinConfig &config) { + const contrib::AnakinConfig &config) { CHECK(Init(config)); } template <> PaddleInferenceAnakinPredictor::PaddleInferenceAnakinPredictor( - const AnakinConfig &config) { + const contrib::AnakinConfig &config) { omp_set_dynamic(0); omp_set_num_threads(1); mkl_set_num_threads(1); CHECK(Init(config)); } template -bool PaddleInferenceAnakinPredictor::Init(const AnakinConfig &config) { +bool PaddleInferenceAnakinPredictor::Init( + const contrib::AnakinConfig &config) { if (!(graph_.load(config.model_file))) { VLOG(3) << "fail to load graph from " << config.model_file; return false; @@ -200,10 +203,11 @@ template class PaddleInferenceAnakinPredictor; // A factory to help create difference predictor. 
template <> -std::unique_ptr CreatePaddlePredictor< - AnakinConfig, PaddleEngineKind::kAnakin>(const AnakinConfig &config) { +std::unique_ptr +CreatePaddlePredictor( + const contrib::AnakinConfig &config) { VLOG(3) << "Anakin Predictor create."; - if (config.target_type == AnakinConfig::NVGPU) { + if (config.target_type == contrib::AnakinConfig::NVGPU) { #ifdef PADDLE_WITH_CUDA VLOG(3) << "Anakin Predictor create on [ NVIDIA GPU ]."; std::unique_ptr x( @@ -213,7 +217,7 @@ std::unique_ptr CreatePaddlePredictor< LOG(ERROR) << "AnakinConfig::NVGPU could not used in ONLY-CPU environment"; return nullptr; #endif - } else if (config.target_type == AnakinConfig::X86) { + } else if (config.target_type == contrib::AnakinConfig::X86) { VLOG(3) << "Anakin Predictor create on [ Intel X86 ]."; std::unique_ptr x( new PaddleInferenceAnakinPredictor(config)); diff --git a/paddle/fluid/inference/api/api_anakin_engine.h b/paddle/fluid/inference/api/api_anakin_engine.h index dd08661880d8cc3a9f4401e9af91a3d10e6579b6..04536ea3a53bbbc9293d92e69a23567e4bfd84c0 100644 --- a/paddle/fluid/inference/api/api_anakin_engine.h +++ b/paddle/fluid/inference/api/api_anakin_engine.h @@ -29,6 +29,8 @@ limitations under the License. */ namespace paddle { +using contrib::AnakinConfig; + template class PaddleInferenceAnakinPredictor : public PaddlePredictor { public: diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 2e9e10139fa7008a46c3782960dfd44d3228cc26..dca4386b21b4a064c21b52218682321258f368c4 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/inference/api/api_impl.h" +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/timer.h" #include "paddle/fluid/platform/profiler.h" @@ -101,14 +102,11 @@ bool NativePaddlePredictor::Init( inference_program_ = paddle::inference::Load( executor_.get(), scope_.get(), config_.prog_file, config_.param_file); } else { - LOG(ERROR) << "fail to load inference model."; + LOG(ERROR) << "fail to load inference model from " << config_.model_dir; return false; } ctx_ = executor_->Prepare(*inference_program_, 0); - if (config_._use_mkldnn) { - executor_->EnableMKLDNN(*inference_program_); - } executor_->CreateVariables(*inference_program_, sub_scope_ ? sub_scope_ : scope_.get(), 0); @@ -218,57 +216,20 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, template void NativePaddlePredictor::GetFetchOne(const framework::LoDTensor &fetch, PaddleTensor *output) { - std::vector shape; - auto dims_i = fetch.dims(); - auto lod = fetch.lod(); - const T *output_ptr = fetch.data(); - auto num = fetch.numel(); - std::vector data; - if (0 == lod.size()) { - std::copy(output_ptr, output_ptr + num, std::back_inserter(data)); - for (int j = 0; j < dims_i.size(); ++j) { - shape.push_back(dims_i[j]); - } - } else { - // for batch detection - // image[0] -> output[0] shape {145, 6} - // image[1] -> output[1] shape {176, 6} - // then, - // the batch output shape {321, 6} - // the lod {{0, 145, 321}} - // so we should append output[0] to {176, 6} - size_t max_dim = 0; - for (size_t j = 1; j < lod[0].size(); j++) { - max_dim = std::max(max_dim, lod[0][j] - lod[0][j - 1]); - } - size_t common_dim = lod[0].back() == 0 ? 
0 : num / lod[0].back(); - if (max_dim > 0) { - data.resize((lod[0].size() - 1) * max_dim * common_dim, 0); - } - for (size_t j = 1; j < lod[0].size(); j++) { - size_t start = lod[0][j - 1] * common_dim; - size_t end = lod[0][j] * common_dim; - if (end > start) { - std::copy(output_ptr + start, output_ptr + end, - data.begin() + (j - 1) * max_dim * common_dim); - } - } - shape.push_back(lod[0].size() - 1); - shape.push_back(max_dim); - for (int j = 1; j < dims_i.size(); ++j) { - shape.push_back(dims_i[j]); - } - } - - output->shape = shape; - auto &buffer = output->data; - if (buffer.empty() || buffer.length() < sizeof(T) * data.size()) { - buffer.Resize(sizeof(T) * data.size()); - } - std::memcpy(buffer.data(), data.data(), sizeof(T) * data.size()); - // copy LoD - for (const auto &level : fetch.lod()) { - output->lod.emplace_back(level); + // set shape. + auto shape = framework::vectorize(fetch.dims()); + output->shape.assign(shape.begin(), shape.end()); + // set data. + const T *data = fetch.data<T>(); + int num_elems = inference::VecReduceToInt(shape); + output->data.Resize(num_elems * sizeof(T)); + // The tensor produced by the fetch op should always be in CPU memory, so just + // copy. + memcpy(output->data.data(), data, num_elems * sizeof(T)); + // set lod + output->lod.clear(); + for (auto &level : fetch.lod()) { + output->lod.emplace_back(level.begin(), level.end()); } } @@ -330,4 +291,10 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< #endif } +template <> +std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<NativeConfig>( + const NativeConfig &config) { + return CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 6ecc32a700506214d44244e6a5612a574b46cc6b..6386d601262b3dac0e957fae991d23768b52f2c0 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -14,15 +14,22 @@ #pragma once +// logging.h and windows.h conflict +#define GLOG_NO_ABBREVIATED_SEVERITIES +// solve static linking error in windows +// https://github.com/google/glog/issues/301 +#define GOOGLE_GLOG_DLL_DECL + #include #include #include #include #include +#include "paddle/fluid/inference/api/paddle_inference_api.h" + #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc index 2b9be77e9f8856c1c4c581847b70dbc285a576d6..5ee6a5a93168f58770067f76ca7f6bb6f67b2965 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc @@ -25,10 +25,11 @@ using inference::analysis::Argument; using inference::Singleton; using inference::analysis::Analyzer; using framework::proto::ProgramDesc; +using paddle::contrib::MixedRTConfig; class TensorRTSubgraphPredictor : public NativePaddlePredictor { public: - explicit TensorRTSubgraphPredictor(const TensorRTConfig& config) + explicit TensorRTSubgraphPredictor(const MixedRTConfig& config) : NativePaddlePredictor(config), config_(config) {} bool Init(const std::shared_ptr<framework::Scope>& parent_scope) { @@ -121,13 +122,13 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor { } private: - TensorRTConfig config_; + MixedRTConfig config_; }; template <> std::unique_ptr<PaddlePredictor>
-CreatePaddlePredictor( - const TensorRTConfig& config) { +CreatePaddlePredictor( + const MixedRTConfig& config) { VLOG(3) << "create TensorRTSubgraphPredictor"; if (config.use_gpu) { // 1. GPU memeroy @@ -156,6 +157,13 @@ CreatePaddlePredictor( return std::move(predictor); } +template <> +std::unique_ptr CreatePaddlePredictor( + const MixedRTConfig& config) { + return CreatePaddlePredictor(config); +} + } // namespace paddle USE_TRT_CONVERTER(elementwise_add_weight); diff --git a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc index 9e7425eddd2df07ffe897f908aad360abe42117a..fc6310e90b0257bc84742fb617a00f5778bb1866 100644 --- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc +++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine_tester.cc @@ -20,6 +20,8 @@ namespace paddle { +using contrib::MixedRTConfig; + DEFINE_string(dirname, "", "Directory of the inference model."); void CompareTensorRTWithFluid(bool enable_tensorrt) { @@ -32,7 +34,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { config0.fraction_of_gpu_memory = 0.3; config0.device = 0; - TensorRTConfig config1; + MixedRTConfig config1; config1.model_dir = FLAGS_dirname + "word2vec.inference.model"; config1.use_gpu = true; config1.fraction_of_gpu_memory = 0.3; @@ -42,7 +44,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) { auto predictor0 = CreatePaddlePredictor(config0); auto predictor1 = - CreatePaddlePredictor(config1); for (int batch_id = 0; batch_id < 1; batch_id++) { diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index afb46a7139f6ab8e6b3697fdc56fe1c78a05cd64..d4e6bb3e4a4ceb361ccd35121d0ecf84a764243e 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -1,13 +1,32 @@ cmake_minimum_required(VERSION 3.0) - project(cpp_inference_demo CXX C) +option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) +option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." OFF) +option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) + +macro(safe_set_static_flag) + foreach(flag_var + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif(${flag_var} MATCHES "/MD") + endforeach(flag_var) +endmacro() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if (WIN32) -set(CMAKE_STATIC_LIBRARY_PREFIX "lib") + if (WITH_STATIC_LIB) + safe_set_static_flag() + add_definitions(-DSTATIC_LIB) + set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "/w") + set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} "/w") + endif() + set(CMAKE_STATIC_LIBRARY_PREFIX "lib") else() -set(CMAKE_STATIC_LIBRARY_PREFIX "") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + set(CMAKE_STATIC_LIBRARY_PREFIX "") endif() +message("flags" ${CMAKE_CXX_FLAGS}) if(NOT DEFINED PADDLE_LIB) message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") @@ -16,14 +35,18 @@ if(NOT DEFINED DEMO_NAME) message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") endif() -option(WITH_MKL "Compile demo with MKL/OpenBlas support, default use MKL." ON) -option(WITH_GPU "Compile demo with GPU/CPU, default use CPU." 
OFF) -option(WITH_STATIC_LIB "Compile demo with static/shared library, default use static." ON) if(WITH_GPU) - set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") + if(NOT WIN32) + set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") + else() + if(CUDA_LIB STREQUAL "") + set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") + endif() + endif(NOT WIN32) endif() +include_directories("D:/Paddle/") include_directories("${PADDLE_LIB}") include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") include_directories("${PADDLE_LIB}/third_party/install/glog/include") @@ -83,10 +106,18 @@ set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf ${EXTERNAL_LIB}) +# NOTE(dzhwinter) shlwapi is deprecated. +set(DEPS ${DEPS} libcmt shlwapi) endif(NOT WIN32) if(WITH_GPU) - set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + if(NOT WIN32) + set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) + else() + set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) + endif() endif() target_link_libraries(${DEMO_NAME} ${DEPS}) diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc index 03ac79e9edf0d7ce6e167c3d34af5ba84bbc0e72..360f924810a570422db5a00b13939813fa73e2fa 100644 --- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc +++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc @@ -18,6 +18,8 @@ limitations under the License. */ #include #include + +#include #include #include //NOLINT #include "paddle/fluid/inference/paddle_inference_api.h" @@ -67,7 +69,8 @@ void Main(bool use_gpu) { 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. - for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements); + i++) { PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i], result[i]); } @@ -113,7 +116,8 @@ void MainThreads(int num_threads, bool use_gpu) { const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. - for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements); + i++) { PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i], result[i]); } diff --git a/paddle/fluid/inference/api/demo_ci/windows_inference.md b/paddle/fluid/inference/api/demo_ci/windows_inference.md new file mode 100644 index 0000000000000000000000000000000000000000..44b2586ad6d33ce7cbd2bb3080acc96b5e27f660 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/windows_inference.md @@ -0,0 +1,19 @@ +# Windows inference +This document introduces Windows inference. Currently only static compilation is provided; it builds paddle_fluid.lib, which bundles all third-party dependencies except openblas.dll. + +1. Download the latest paddle_fluid.lib and openblas.dll, and put them in the same directory. + +2. Prepare a pre-trained model, e.g. one of the models in models; a model can be saved with the save_inference_model interface. Put the model files into that directory. + +3. Enter the Paddle/paddle/fluid/inference/api/demo_ci directory, create a build directory in it, and then use cmake to generate the VS2015 solution file, +where PADDLE_LIB is the folder containing the paddle_fluid.lib mentioned above and CUDA_LIB is the directory of the x64 CUDA system libraries. +```shell + cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_GPU=ON -DWITH_MKL=OFF -DWITH_STATIC_LIB=ON -DCMAKE_BUILD_TYPE=Release -DDEMO_NAME=inference_icnet -DPADDLE_LIB=D:\to_the_paddle_fluid.lib -DCUDA_LIB=D:\tools\v8.0\lib\x64 +``` +Then open the generated project with VS2015, make sure static linking ("/MT") is used, and build the exe. Put openblas.dll into the directory of the exe. + +4. That exe is the build artifact of the project and is ready to run. + +## FAQ +1. cmake has to be downloaded manually and added to the system PATH. +2. Paths must not contain spaces; for example, a CUDA_LIB path under Program Files (x86) may cause errors. You can copy CUDA to a new location. diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 8e359a67738c0df180933421b45f15b39fd0e78c..1fec2f96da0f9d978a3537b2d78e4ce5ef628c81 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -74,13 +74,17 @@ template <> std::string to_string<std::vector<std::vector<float>>>( const std::vector<std::vector<std::vector<float>>> &vec); +template <typename T> +int VecReduceToInt(const std::vector<T> &v) { + return std::accumulate(v.begin(), v.end(), 1, [](T a, T b) { return a * b; }); +} + template <typename T> static void TensorAssignData(PaddleTensor *tensor, const std::vector<std::vector<T>> &data) { // Assign buffer - int dim = std::accumulate(tensor->shape.begin(), tensor->shape.end(), 1, - [](int a, int b) { return a * b; }); - tensor->data.Resize(sizeof(T) * dim); + int num_elems = VecReduceToInt(tensor->shape); + tensor->data.Resize(sizeof(T) * num_elems); int c = 0; for (const auto &f : data) { for (T v : f) { @@ -89,7 +93,7 @@ static void TensorAssignData(PaddleTensor *tensor, } } -std::string DescribeTensor(const PaddleTensor &tensor) { +static std::string DescribeTensor(const PaddleTensor &tensor) { std::stringstream os; os << "Tensor [" << tensor.name << "]\n"; os << " - type: "; @@ -113,8 +117,7 @@ std::string DescribeTensor(const PaddleTensor &tensor) { os << "\n"; os << " - data: "; - int dim = std::accumulate(tensor.shape.begin(), tensor.shape.end(), 1, - [](int a, int b) { return a * b; }); + int dim = VecReduceToInt(tensor.shape); for (int i = 0; i < dim; i++) { os << static_cast<float *>(tensor.data.data())[i] << " "; } @@ -122,8 +125,8 @@ return os.str(); } -void PrintTime(int batch_size, int repeat, int num_threads, int tid, - double latency, int epoch = 1) { +static void PrintTime(int batch_size, int repeat, int num_threads, int tid, - double latency, int epoch = 1) { + double latency, int epoch = 1) { LOG(INFO) << "====== batch_size: " << batch_size << ", repeat: " << repeat << ", threads: " << num_threads << ", thread id: " << tid << ", latency: " << latency << "ms ======"; diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index d0527d714acdcda047ad9d47ccd0e3a8083c771f..01ea0d9c3ad37b3bcebe6853de77373810333776 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -28,34 +28,61 @@ limitations under the License. */ namespace paddle { +// Data type. enum PaddleDType { FLOAT32, INT64, + // TODO(Superjomn) support more data types if needed. }; +/* + * Memory management for PaddleTensor. + * The PaddleBuf holds a buffer for data input or output. The memory can be + * allocated by the user or by the PaddleBuf itself, but in any case, the + * PaddleBuf should be reused for better performance. + * + * For user-allocated memory, the following API can be used: + * - PaddleBuf(void* data, size_t length) to set an external memory by + * specifying the memory address and length. + * - Reset(void* data, size_t length) to reset the PaddleBuf with an external + * memory.
+ * ATTENTION: for user-allocated memory, deallocation must be done by the user + * externally after the program finishes. The PaddleBuf won't do any allocation + * or deallocation. + * + * To have the PaddleBuf allocate and manage the memory: + * - PaddleBuf(size_t length) will allocate a memory of size `length`. + * - Resize(size_t length) resizes the memory to no less than `length`. ATTENTION: + * if the already-allocated memory is larger than `length`, nothing will be done. + */ class PaddleBuf { public: - PaddleBuf() = default; - PaddleBuf(PaddleBuf&& other); - // Copy only available when memory is managed externally. - explicit PaddleBuf(const PaddleBuf&); - PaddleBuf& operator=(const PaddleBuf&); - PaddleBuf& operator=(PaddleBuf&&); - // Do not own the memory. - PaddleBuf(void* data, size_t length) - : data_(data), length_(length), memory_owned_{false} {} - // Own memory. + // PaddleBuf allocates memory internally and manages it. explicit PaddleBuf(size_t length) : data_(new char[length]), length_(length), memory_owned_(true) {} - // Resize to `length` bytes. + // Set external memory; the PaddleBuf won't manage it. + PaddleBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_{false} {} + // Copy only available when memory is managed externally. + explicit PaddleBuf(const PaddleBuf&); + + // Resize the memory. void Resize(size_t length); - // Reset to external memory. + // Reset to external memory, with address and length set. void Reset(void* data, size_t length); + // Tell whether the buffer is empty. bool empty() const { return length_ == 0; } + // Get the memory address. void* data() const { return data_; } + // Get the memory length. size_t length() const { return length_; } ~PaddleBuf() { Free(); } + PaddleBuf& operator=(const PaddleBuf&); + PaddleBuf& operator=(PaddleBuf&&); + PaddleBuf() = default; + PaddleBuf(PaddleBuf&& other); private: void Free(); @@ -64,6 +91,7 @@ class PaddleBuf { bool memory_owned_{true}; }; +// Basic input and output data structure for PaddlePredictor. struct PaddleTensor { PaddleTensor() = default; std::string name; // variable name. @@ -73,19 +101,8 @@ struct PaddleTensor { std::vector<std::vector<size_t>> lod; // Tensor+LoD equals LoDTensor }; -enum class PaddleEngineKind { - kNative = 0, // Use the native Fluid facility. - kAnakin, // Use Anakin for inference. - kAutoMixedTensorRT, // Automatically mix Fluid with TensorRT. - kAnalysis - // TODO(Superjomn) support following engines latter. - // kTensorRT, // Use TensorRT for inference. - // kAutoMixedAnakin, // Automatically mix Fluid with Anakin. -}; /* - * A simple Inference API for Paddle. Currently this API can be used by - * non-sequence scenerios. + * A simple Inference API for Paddle. */ class PaddlePredictor { public: @@ -120,26 +137,53 @@ struct NativeConfig : public PaddlePredictor::Config { // GPU related fields. bool use_gpu{false}; int device{0}; - float fraction_of_gpu_memory{-1.f}; // Negative to notify initialization. - // NOTE: NOT use it, just for the internal test, will discard later - bool _use_mkldnn{false}; - // Specify the variable's name of each input. - bool specify_input_name{false}; + float fraction_of_gpu_memory{-1.f}; // Change to a float in (0,1] if needed. + // Specify the exact path of program and parameter files. std::string prog_file; std::string param_file; + + // Specify the variable's name of each input if input tensors don't follow the + // `feeds` and `fetches` of the `save_inference_model` phase.
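+ // (If set, each input PaddleTensor::name must match the corresponding feed
+ // variable name.)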
+  bool specify_input_name{false};
 };

-// Configurations for Anakin engine.
-struct AnakinConfig : public PaddlePredictor::Config {
-  enum TargetType { NVGPU = 0, X86 };
-  int device;
-  std::string model_file;
-  int max_batch_size{-1};
-  TargetType target_type;
+// A factory to help create different predictors.
+//
+// Usage:
+//
+//   NativeConfig config;
+//   ... // change the configs.
+//   auto native_predictor = CreatePaddlePredictor(config);
+//
+// FOR EXTENSION DEVELOPER:
+// Different predictors are designated by config type. Similar configs can be
+// merged, but there shouldn't be a huge config containing different fields for
+// more than one kind of predictor.
+template <typename ConfigT>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
+
+// NOTE The following APIs are too trivial, we will discard them in future
+// versions.
+enum class PaddleEngineKind {
+  kNative = 0,         // Use the native Fluid facility.
+  kAutoMixedTensorRT,  // Automatically mix Fluid with TensorRT.
+  kAnalysis,           // More optimization.
+  kAnakin              // Use Anakin for inference, not mature yet.
 };

-struct TensorRTConfig : public NativeConfig {
+template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
+std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
+
+// ==
+//
+// -----------------------------------------------------------------------------------
+// NOTE: The following APIs are not mature yet, we are still working on them.
+
+namespace contrib {
+
+// Accelerate GPU computation with TensorRT engine.
+struct MixedRTConfig : public NativeConfig {
   // Determine whether a subgraph will be executed by TRT.
   int min_subgraph_size{1};
   // While TensorRT allows an engine optimized for a given max batch size
@@ -162,7 +206,6 @@ struct TensorRTConfig : public NativeConfig {

 // NOTE WIP, not stable yet.
 struct AnalysisConfig : public NativeConfig {
-  //
   enum class IrPassMode {
     kSystem,   // Use system default passes, not customize.
     kInclude,  // Specify the passes in `ir_passes`.
@@ -173,18 +216,21 @@ struct AnalysisConfig : public NativeConfig {
   IrPassMode ir_mode{IrPassMode::kExclude};
   // attention lstm fuse works only on some specific models, disable as default.
   std::vector<std::string> ir_passes{"attention_lstm_fuse_pass"};
+
+  // NOTE this is just for internal development; please do not use it.
+  bool _use_mkldnn{false};
 };

-// A factory to help create different predictors.
-//
-// FOR EXTENSION DEVELOPER:
-// Different predictors are designated by config type and engine kind. Similar
-// configs can be merged, but there shouldn't be a huge config containing
-// different fields for more than one kind of predictors.
-//
-// Similarly, each engine kind should map to a unique predictor implementation.
-template <typename ConfigT, PaddleEngineKind engine = PaddleEngineKind::kNative>
-std::unique_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT& config);
+// Configurations for Anakin engine.
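+// A usage sketch (assuming the factory specialization below) is
+// anakin_mobilenet_tester.cc, which creates the predictor via
+// CreatePaddlePredictor<contrib::AnakinConfig, PaddleEngineKind::kAnakin>(config).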
+struct AnakinConfig : public PaddlePredictor::Config { + enum TargetType { NVGPU = 0, X86 }; + int device; + std::string model_file; + int max_batch_size{-1}; + TargetType target_type; +}; + +} // namespace contrib int PaddleDtypeSize(PaddleDType dtype); diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 9c057affca9c4b6b3fcd4574a587af1b78145b5c..925c3e6b879b56ab158e983032efbbc673357eba 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -58,6 +58,11 @@ set(TEXT_CLASSIFICATION_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/text_classifi download_model_and_data(${TEXT_CLASSIFICATION_INSTALL_DIR} "text-classification-Senta.tar.gz" "text_classification_data.txt.tar.gz") inference_analysis_api_test(test_analyzer_text_classification ${TEXT_CLASSIFICATION_INSTALL_DIR} analyzer_text_classification_tester.cc) +# seq_conv1 +set(SEQ_CONV1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/seq_conv1") +download_model_and_data(${SEQ_CONV1_INSTALL_DIR} "seq_conv1_model.tar.gz" "seq_conv1_data.txt.tar.gz") +inference_analysis_api_test(test_analyzer_seq_conv1 ${SEQ_CONV1_INSTALL_DIR} analyzer_seq_conv1_tester.cc) + # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) diff --git a/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc b/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc index 62e820b68c79a47d963bb174663bfc8c4ac22de3..cf97f064beddb6ede1d4716f323b4c5b46cb266d 100644 --- a/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_mobilenet_tester.cc @@ -22,10 +22,10 @@ DEFINE_string(model, "", "Directory of the inference model(mobile_v2)."); namespace paddle { -AnakinConfig GetConfig() { - AnakinConfig config; +contrib::AnakinConfig GetConfig() { + contrib::AnakinConfig config; // using AnakinConfig::X86 if you need to use cpu to do inference - config.target_type = AnakinConfig::NVGPU; + config.target_type = contrib::AnakinConfig::NVGPU; config.model_file = FLAGS_model; config.device = 0; config.max_batch_size = 1; @@ -33,9 +33,10 @@ AnakinConfig GetConfig() { } TEST(inference, anakin) { - AnakinConfig config = GetConfig(); + auto config = GetConfig(); auto predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor( + config); float data[1 * 3 * 224 * 224] = {1.0f}; PaddleTensor tensor; diff --git a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc index 98c74aaa562dce6618ccde8f11f4344eefd59ef2..82bc83988de688e46613e160b66943c89c4a0391 100644 --- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc @@ -97,10 +97,10 @@ void Data::get_batch_data( namespace paddle { -AnakinConfig GetConfig() { - AnakinConfig config; +contrib::AnakinConfig GetConfig() { + contrib::AnakinConfig config; // using AnakinConfig::X86 if you need to use cpu to do inference - config.target_type = AnakinConfig::X86; + config.target_type = contrib::AnakinConfig::X86; config.model_file = FLAGS_model; config.device = 0; config.max_batch_size = 1000; // the max number of token @@ -121,9 +121,10 @@ void set_tensor(std::string name, std::vector shape, } void single_test() { - AnakinConfig config = GetConfig(); + auto config = GetConfig(); auto predictor = - CreatePaddlePredictor(config); + CreatePaddlePredictor( + config); int max_batch_size = 1000; std::string 
feature_file = FLAGS_datapath; diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 8cf230a51d05c3a141f7cfd4e30bf30f064f0989..59020545cd609961487cafc4a08c20951a02c8ce 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -95,7 +95,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(AnalysisConfig *cfg) { +void SetConfig(contrib::AnalysisConfig *cfg) { cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->param_file = FLAGS_infer_model + "/param"; cfg->use_gpu = false; @@ -117,7 +117,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_Chinese_ner, profile) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; @@ -141,7 +141,7 @@ TEST(Analyzer_Chinese_ner, profile) { // Check the fuse status TEST(Analyzer_Chinese_ner, fuse_statis) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -155,7 +155,7 @@ TEST(Analyzer_Chinese_ner, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig TEST(Analyzer_Chinese_ner, compare) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); std::vector> input_slots_all; diff --git a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc index 14bdf76efc71b326bd130858ea246be81c9bd45c..3bf5383d8f35347c767d6caee83e0dcc5fb0a446 100644 --- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc @@ -149,7 +149,7 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(AnalysisConfig *cfg) { +void SetConfig(contrib::AnalysisConfig *cfg) { cfg->prog_file = FLAGS_infer_model + "/__model__"; cfg->param_file = FLAGS_infer_model + "/param"; cfg->use_gpu = false; @@ -172,7 +172,7 @@ void SetInput(std::vector> *inputs) { // Easy for profiling independently. TEST(Analyzer_rnn1, profile) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; @@ -183,7 +183,7 @@ TEST(Analyzer_rnn1, profile) { // Check the fuse status TEST(Analyzer_rnn1, fuse_statis) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); int num_ops; @@ -198,7 +198,7 @@ TEST(Analyzer_rnn1, fuse_statis) { // Compare result of NativeConfig and AnalysisConfig TEST(Analyzer_rnn1, compare) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); std::vector> input_slots_all; @@ -208,7 +208,7 @@ TEST(Analyzer_rnn1, compare) { // Test Multi-Thread. TEST(Analyzer_rnn1, multi_thread) { - AnalysisConfig cfg; + contrib::AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc new file mode 100644 index 0000000000000000000000000000000000000000..2f71ed46ffc9fd5f853f5b5b46de1446d28b9e69 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -0,0 +1,199 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tests/api/tester_helper.h"
+
+namespace paddle {
+namespace inference {
+
+struct DataRecord {
+  std::vector<std::vector<int64_t>> title1_all, title2_all, title3_all, l1_all;
+  std::vector<std::vector<int64_t>> title1, title2, title3, l1;
+  std::vector<size_t> title1_lod, title2_lod, title3_lod, l1_lod;
+  size_t batch_iter{0};
+  size_t batch_size{1};
+  size_t num_samples;  // total number of samples
+  DataRecord() = default;
+  explicit DataRecord(const std::string &path, int batch_size = 1)
+      : batch_size(batch_size) {
+    Load(path);
+  }
+  DataRecord NextBatch() {
+    DataRecord data;
+    size_t batch_end = batch_iter + batch_size;
+    // NOTE skip the final batch if not enough data is provided.
+    if (batch_end <= title1_all.size()) {
+      data.title1_all.assign(title1_all.begin() + batch_iter,
+                             title1_all.begin() + batch_end);
+      data.title2_all.assign(title2_all.begin() + batch_iter,
+                             title2_all.begin() + batch_end);
+      data.title3_all.assign(title3_all.begin() + batch_iter,
+                             title3_all.begin() + batch_end);
+      data.l1_all.assign(l1_all.begin() + batch_iter,
+                         l1_all.begin() + batch_end);
+      // Prepare LoDs
+      data.title1_lod.push_back(0);
+      data.title2_lod.push_back(0);
+      data.title3_lod.push_back(0);
+      data.l1_lod.push_back(0);
+      CHECK(!data.title1_all.empty());
+      CHECK(!data.title2_all.empty());
+      CHECK(!data.title3_all.empty());
+      CHECK(!data.l1_all.empty());
+      CHECK_EQ(data.title1_all.size(), data.title2_all.size());
+      CHECK_EQ(data.title1_all.size(), data.title3_all.size());
+      CHECK_EQ(data.title1_all.size(), data.l1_all.size());
+      for (size_t j = 0; j < data.title1_all.size(); j++) {
+        data.title1.push_back(data.title1_all[j]);
+        data.title2.push_back(data.title2_all[j]);
+        data.title3.push_back(data.title3_all[j]);
+        data.l1.push_back(data.l1_all[j]);
+        // calculate lod
+        data.title1_lod.push_back(data.title1_lod.back() +
+                                  data.title1_all[j].size());
+        data.title2_lod.push_back(data.title2_lod.back() +
+                                  data.title2_all[j].size());
+        data.title3_lod.push_back(data.title3_lod.back() +
+                                  data.title3_all[j].size());
+        data.l1_lod.push_back(data.l1_lod.back() + data.l1_all[j].size());
+      }
+    }
+    batch_iter += batch_size;
+    return data;
+  }
+  void Load(const std::string &path) {
+    std::ifstream file(path);
+    std::string line;
+    int num_lines = 0;
+    while (std::getline(file, line)) {
+      num_lines++;
+      std::vector<std::string> data;
+      split(line, '\t', &data);
+      // load title1 data
+      std::vector<int64_t> title1_data;
+      split_to_int64(data[0], ' ', &title1_data);
+      // load title2 data
+      std::vector<int64_t> title2_data;
+      split_to_int64(data[1], ' ', &title2_data);
+      // load title3 data
+      std::vector<int64_t> title3_data;
+      split_to_int64(data[2], ' ', &title3_data);
+      // load l1 data
+      std::vector<int64_t> l1_data;
+      split_to_int64(data[3], ' ', &l1_data);
+      title1_all.push_back(std::move(title1_data));
+      title2_all.push_back(std::move(title2_data));
+      title3_all.push_back(std::move(title3_data));
+      l1_all.push_back(std::move(l1_data));
+    }
+    num_samples = num_lines;
+  }
+};
+
+void PrepareInputs(std::vector<PaddleTensor> *input_slots, DataRecord *data,
+                   int batch_size) {
+  PaddleTensor title1_tensor, title2_tensor, title3_tensor, l1_tensor;
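+  // Each slot below is a LoDTensor of shape {total_token_count, 1}; the lod
+  // vectors computed in DataRecord::NextBatch mark the sequence boundaries.
+ 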
title1_tensor.name = "title1"; + title2_tensor.name = "title2"; + title3_tensor.name = "title3"; + l1_tensor.name = "l1"; + auto one_batch = data->NextBatch(); + int title1_size = one_batch.title1_lod[one_batch.title1_lod.size() - 1]; + title1_tensor.shape.assign({title1_size, 1}); + title1_tensor.lod.assign({one_batch.title1_lod}); + int title2_size = one_batch.title2_lod[one_batch.title2_lod.size() - 1]; + title2_tensor.shape.assign({title2_size, 1}); + title2_tensor.lod.assign({one_batch.title2_lod}); + int title3_size = one_batch.title3_lod[one_batch.title3_lod.size() - 1]; + title3_tensor.shape.assign({title3_size, 1}); + title3_tensor.lod.assign({one_batch.title3_lod}); + int l1_size = one_batch.l1_lod[one_batch.l1_lod.size() - 1]; + l1_tensor.shape.assign({l1_size, 1}); + l1_tensor.lod.assign({one_batch.l1_lod}); + + // assign data + TensorAssignData(&title1_tensor, one_batch.title1); + TensorAssignData(&title2_tensor, one_batch.title2); + TensorAssignData(&title3_tensor, one_batch.title3); + TensorAssignData(&l1_tensor, one_batch.l1); + // Set inputs. + input_slots->assign({title1_tensor, title2_tensor, title3_tensor, l1_tensor}); + for (auto &tensor : *input_slots) { + tensor.dtype = PaddleDType::INT64; + } +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->model_dir = FLAGS_infer_model; + cfg->use_gpu = false; + cfg->device = 0; + cfg->specify_input_name = true; + cfg->enable_ir_optim = true; +} + +void SetInput(std::vector> *inputs) { + DataRecord data(FLAGS_infer_data, FLAGS_batch_size); + std::vector input_slots; + int epoch = FLAGS_test_all_data ? data.num_samples / FLAGS_batch_size : 1; + LOG(INFO) << "number of samples: " << epoch * FLAGS_batch_size; + for (int bid = 0; bid < epoch; ++bid) { + PrepareInputs(&input_slots, &data, FLAGS_batch_size); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +TEST(Analyzer_seq_conv1, profile) { + AnalysisConfig cfg; + SetConfig(&cfg); + std::vector outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(cfg, input_slots_all, &outputs, FLAGS_num_threads); + + if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { + // the first inference result + PADDLE_ENFORCE_EQ(outputs.size(), 1UL); + size_t size = GetSize(outputs[0]); + PADDLE_ENFORCE_GT(size, 0); + float *result = static_cast(outputs[0].data.data()); + // output is probability, which is in (0, 1). 
+ for (size_t i = 0; i < size; i++) { + EXPECT_GT(result[i], 0); + EXPECT_LT(result[i], 1); + } + } +} + +// Check the fuse status +TEST(Analyzer_seq_conv1, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto fuse_statis = GetFuseStatis(cfg, &num_ops); +} + +// Compare result of NativeConfig and AnalysisConfig +TEST(Analyzer_seq_conv1, compare) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis(cfg, input_slots_all); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 05cd343433beb6d8fd80915f65b917bb13d345f6..9fcb5129d268a7730c11e5910077ad233050484e 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -38,6 +38,8 @@ DEFINE_bool(use_analysis, true, namespace paddle { namespace inference { +using contrib::AnalysisConfig; + void CompareResult(const std::vector &outputs, const std::vector &ref_outputs) { EXPECT_GT(outputs.size(), 0UL); @@ -45,11 +47,8 @@ void CompareResult(const std::vector &outputs, for (size_t i = 0; i < outputs.size(); i++) { auto &out = outputs[i]; auto &ref_out = ref_outputs[i]; - size_t size = std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); - size_t ref_size = - std::accumulate(ref_out.shape.begin(), ref_out.shape.end(), 1, - [](int a, int b) { return a * b; }); + size_t size = VecReduceToInt(out.shape); + size_t ref_size = VecReduceToInt(ref_out.shape); EXPECT_GT(size, 0); EXPECT_EQ(size, ref_size); EXPECT_EQ(out.dtype, ref_out.dtype); @@ -77,18 +76,15 @@ void CompareResult(const std::vector &outputs, std::unique_ptr CreateTestPredictor( const AnalysisConfig &config, bool use_analysis = true) { if (use_analysis) { - return CreatePaddlePredictor( - config); + return CreatePaddlePredictor(config); } else { return CreatePaddlePredictor( config); } } -size_t GetSize(const PaddleTensor &out) { - return std::accumulate(out.shape.begin(), out.shape.end(), 1, - [](int a, int b) { return a * b; }); -} +size_t GetSize(const PaddleTensor &out) { return VecReduceToInt(out.shape); } std::unordered_map GetFuseStatis(AnalysisConfig config, int *num_ops) { diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 286b03d7b7d11a50f33f0190c1a5b9097ed0f4a2..c091476d6d132db17a656d5c8dee65e3a88d9ac2 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" #include #include "paddle/fluid/operators/mkldnn_activation_op.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { @@ -105,105 +106,105 @@ class ActivationOpGrad : public framework::OperatorWithKernel { } }; -__attribute__((unused)) constexpr char SigmoidDoc[] = R"DOC( +UNUSED constexpr char SigmoidDoc[] = R"DOC( Sigmoid Activation Operator $$out = \frac{1}{1 + e^{-x}}$$ )DOC"; -__attribute__((unused)) constexpr char LogSigmoidDoc[] = R"DOC( +UNUSED constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator $$out = \\log \\frac{1}{1 + e^{-x}}$$ )DOC"; -__attribute__((unused)) constexpr char ExpDoc[] = R"DOC( +UNUSED constexpr char ExpDoc[] = R"DOC( Exp Activation Operator. 
$out = e^x$ )DOC"; -__attribute__((unused)) constexpr char ReluDoc[] = R"DOC( +UNUSED constexpr char ReluDoc[] = R"DOC( Relu Activation Operator. $out = \max(x, 0)$ )DOC"; -__attribute__((unused)) constexpr char TanhDoc[] = R"DOC( +UNUSED constexpr char TanhDoc[] = R"DOC( Tanh Activation Operator. $$out = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"; -__attribute__((unused)) constexpr char TanhShrinkDoc[] = R"DOC( +UNUSED constexpr char TanhShrinkDoc[] = R"DOC( TanhShrink Activation Operator. $$out = x - \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$ )DOC"; -__attribute__((unused)) constexpr char SqrtDoc[] = R"DOC( +UNUSED constexpr char SqrtDoc[] = R"DOC( Sqrt Activation Operator. $out = \sqrt{x}$ )DOC"; -__attribute__((unused)) constexpr char AbsDoc[] = R"DOC( +UNUSED constexpr char AbsDoc[] = R"DOC( Abs Activation Operator. $out = |x|$ )DOC"; -__attribute__((unused)) constexpr char CeilDoc[] = R"DOC( +UNUSED constexpr char CeilDoc[] = R"DOC( Ceil Activation Operator. $out = ceil(x)$ )DOC"; -__attribute__((unused)) constexpr char FloorDoc[] = R"DOC( +UNUSED constexpr char FloorDoc[] = R"DOC( Floor Activation Operator. $out = floor(x)$ )DOC"; -__attribute__((unused)) constexpr char CosDoc[] = R"DOC( +UNUSED constexpr char CosDoc[] = R"DOC( Cosine Activation Operator. $out = cos(x)$ )DOC"; -__attribute__((unused)) constexpr char SinDoc[] = R"DOC( +UNUSED constexpr char SinDoc[] = R"DOC( Sine Activation Operator. $out = sin(x)$ )DOC"; -__attribute__((unused)) constexpr char RoundDoc[] = R"DOC( +UNUSED constexpr char RoundDoc[] = R"DOC( Round Activation Operator. $out = [x]$ )DOC"; -__attribute__((unused)) constexpr char ReciprocalDoc[] = R"DOC( +UNUSED constexpr char ReciprocalDoc[] = R"DOC( Reciprocal Activation Operator. $$out = \\frac{1}{x}$$ )DOC"; -__attribute__((unused)) constexpr char LogDoc[] = R"DOC( +UNUSED constexpr char LogDoc[] = R"DOC( Log Activation Operator. $out = \ln(x)$ @@ -212,21 +213,21 @@ Natural logarithm of x. )DOC"; -__attribute__((unused)) constexpr char SquareDoc[] = R"DOC( +UNUSED constexpr char SquareDoc[] = R"DOC( Square Activation Operator. $out = x^2$ )DOC"; -__attribute__((unused)) constexpr char SoftplusDoc[] = R"DOC( +UNUSED constexpr char SoftplusDoc[] = R"DOC( Softplus Activation Operator. $out = \ln(1 + e^{x})$ )DOC"; -__attribute__((unused)) constexpr char SoftsignDoc[] = R"DOC( +UNUSED constexpr char SoftsignDoc[] = R"DOC( Softsign Activation Operator. 
$$out = \frac{x}{1 + |x|}$$ diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index b98190d40a2afa684cfd29cc52fc29fac851cca7..4cc980b41b34894f9d915d4b325887548091c0eb 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -23,8 +23,6 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -static constexpr int kROISize = 4; - template bool GT_E(T a, T b) { return (a > b) || fabs(a - b) < 1e-4; diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc index 5341187d1ce9400ac34750ab691608e76158ae0d..56cef91e29cc7da27384c27a7ec63e90cfadfc3b 100644 --- a/paddle/fluid/operators/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -46,6 +46,25 @@ static std::string gethash(const memory::dims& input_dims, dims2str(paddings) + pooling_type + suffix; } +static inline int ComputeCeiledOutput(int input_size, int kernel_size, + int padding, int stride) { + return (input_size - kernel_size + 2 * padding) / stride + 1; +} + +static inline void CorrectOutputSize( + const std::vector& src_tz, const std::vector& dst_tz, + const std::vector& kernel_size, const std::vector& paddings, + const std::vector& strides, + std::vector& right_bot_padding) { // NOLINT + for (size_t i = 0; i < right_bot_padding.size(); i++) { + int desired_size = ComputeCeiledOutput(src_tz[i + 2], kernel_size[i], + paddings[i], strides[i]); + if (desired_size != dst_tz[i + 2]) { + right_bot_padding[i] += strides[i]; + } + } +} + template class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -103,6 +122,13 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto pool_p = std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); if (pool_p == nullptr) { + const std::vector& padding_left_top(paddings); + std::vector padding_right_bottom(paddings); + bool ceil_mode = ctx.Attr("ceil_mode"); + if (ceil_mode) { + CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, + padding_right_bottom); + } auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), input_format); @@ -114,8 +140,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { mkldnn::memory::format::any); std::shared_ptr pool_pd = - CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, - pooling_type, mkldnn_engine); + CreatePrimitiveDesc(src_md, dst_md, strides, padding_left_top, + padding_right_bottom, ksize, pooling_type, + mkldnn_engine, ceil_mode); // save pool_pd into global device context to be referred in backward path dev_ctx.SetBlob(key_pool_pd, pool_pd); @@ -171,14 +198,16 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { private: std::unique_ptr CreatePrimitiveDesc( const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst, - const std::vector& stride, const std::vector& padding, - const std::vector& kernel, const std::string& pooling_type, - const mkldnn::engine& engine) const { + const std::vector& stride, const std::vector& padding_left_top, + const std::vector& padding_right_bot, const std::vector& kernel, + const std::string& pooling_type, const mkldnn::engine& engine, + bool ceil_mode) const { auto pool_desc = mkldnn::pooling_forward::desc( mkldnn::prop_kind::forward, pooling_type == "max" ? 
mkldnn::algorithm::pooling_max : mkldnn::algorithm::pooling_avg, - src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero); + src, dst, stride, kernel, padding_left_top, padding_right_bot, + mkldnn::padding_kind::zero); auto p_pool_pd = new mkldnn::pooling_forward::primitive_desc(pool_desc, engine); diff --git a/paddle/fluid/operators/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt_engine_op.h index d4ba0f9c33c91811647f9d19a332f139c16b0eb2..3c78c29c1a30d74947be84cd2b52ad308e732a2d 100644 --- a/paddle/fluid/operators/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt_engine_op.h @@ -34,7 +34,7 @@ namespace operators { using FluidDT = framework::proto::VarType_Type; using TRT_DT = nvinfer1::DataType; -namespace { +namespace { // NOLINT TRT_DT FluidDataType2TRT(FluidDT type) { switch (type) { diff --git a/paddle/fluid/operators/top_k_op.cc b/paddle/fluid/operators/top_k_op.cc index 4a8ac441cfaf642fde58ee30865a22e83c065498..92a0697e27ba0da66fa3b0f5380e7bd52575640d 100644 --- a/paddle/fluid/operators/top_k_op.cc +++ b/paddle/fluid/operators/top_k_op.cc @@ -30,6 +30,8 @@ class TopkOp : public framework::OperatorWithKernel { "Output(Indices) of TopkOp should not be null."); auto input_dims = ctx->GetInputDim("X"); + PADDLE_ENFORCE_EQ(input_dims.size(), 2, + "Rank of TopK op's input must be 2."); const int k = static_cast(ctx->Attrs().Get("k")); PADDLE_ENFORCE_GE(k, 1, "k must >= 1"); diff --git a/paddle/fluid/platform/cudnn_helper_test.cc b/paddle/fluid/platform/cudnn_helper_test.cc index 517df6863499f20d7b66d15ef114a689700be5b2..28edfd2e50237c887dbeb7ac73e1f990ce239a9c 100644 --- a/paddle/fluid/platform/cudnn_helper_test.cc +++ b/paddle/fluid/platform/cudnn_helper_test.cc @@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#define GLOG_NO_ABBREVIATED_SEVERITIES +#define GOOGLE_GLOG_DLL_DECL + #include "paddle/fluid/platform/cudnn_helper.h" #include diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 61a653d9313daff96d39c08e80f17d7e33acceb1..f04395a8ac00f33501008aa12f22773ddda9b138 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -21,6 +21,7 @@ limitations under the License. */ #if defined(_WIN32) #define NOMINMAX // msvc max/min macro conflict with std::min/max #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#define GOOGLE_GLOG_DLL_DECL #endif #ifdef PADDLE_WITH_CUDA @@ -47,7 +48,7 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/cublas.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/curand.h" -#if !defined(__APPLE__) and !defined(_WIN32) +#if !defined(__APPLE__) && !defined(_WIN32) #include "paddle/fluid/platform/dynload/nccl.h" #endif // __APPLE__ #endif // PADDLE_WITH_CUDA @@ -216,7 +217,7 @@ inline typename std::enable_if::type throw_on_error( #endif } -#if !defined(__APPLE__) and !defined(_WIN32) +#if !defined(__APPLE__) && !defined(_WIN32) template inline typename std::enable_if::type throw_on_error( ncclResult_t stat, const Args&... args) { @@ -260,14 +261,8 @@ inline void throw_on_error(T e) { } \ } while (false) -#define PADDLE_THROW_EOF() \ - do { \ - throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ - __LINE__); \ - } while (false) - #else -#define PADDLE_ENFORCE(...) 
::paddle::platform::throw_on_error(__VA_ARGS__) +#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG #else // !_WIN32 @@ -281,6 +276,12 @@ inline void throw_on_error(T e) { #define PADDLE_ENFORCE(x, ...) x #endif // !_WIN32 +#define PADDLE_THROW_EOF() \ + do { \ + throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ + __LINE__); \ + } while (false) + /* * Some enforce helpers here, usage: * int a = 1; @@ -294,7 +295,7 @@ inline void throw_on_error(T e) { * extra messages is also supported, for example: * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) */ - +#if !defined(_WIN32) #define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) #define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ @@ -307,6 +308,7 @@ inline void throw_on_error(T e) { __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) + #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ do { \ if (UNLIKELY(nullptr == (__VAL))) { \ @@ -326,6 +328,27 @@ inline void throw_on_error(T e) { paddle::string::Sprintf("" __VA_ARGS__)); \ } \ } while (0) +#else +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1)) +#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1)) +#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1)) +#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1)) +#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1)) +#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1)) + +#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ + do { \ + if (!((__VAL0)__CMP(__VAL1))) { \ + PADDLE_THROW("Windows disable the enforce. Enforce failed."); \ + } \ + } while (0) +#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...) \ + do { \ + if (nullptr == (__VAL1)) { \ + PADDLE_THROW("Windows disable the enforce. Enforce failed"); \ + } \ + } while (0) +#endif // !_WIN32 } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h index 0e30594672927253cc8083dcb88bb867d63ec729..992ca5e6f6a966a331616a698e3bebd2eee129d5 100644 --- a/paddle/fluid/platform/init.h +++ b/paddle/fluid/platform/init.h @@ -16,6 +16,9 @@ limitations under the License. 
*/ #include #include +#define GLOG_NO_ABBREVIATED_SEVERITIES +#define GOOGLE_GLOG_DLL_DECL + #include "gflags/gflags.h" #include "glog/logging.h" diff --git a/paddle/fluid/pybind/const_value.cc b/paddle/fluid/pybind/const_value.cc index 1f61a0e289f32196ead04d71d07b513cbe4655b1..882e6332e8174b59eb6e19e788c8cced808d552c 100644 --- a/paddle/fluid/pybind/const_value.cc +++ b/paddle/fluid/pybind/const_value.cc @@ -48,6 +48,9 @@ void BindConstValue(pybind11::module* m) { op_proto_and_checker_maker.def( "kOpNameScopeAttrName", framework::OpProtoAndCheckerMaker::OpNamescopeAttrName); + op_proto_and_checker_maker.def( + "kOpCreationCallstackAttrName", + framework::OpProtoAndCheckerMaker::OpCreationCallstackAttrName); } } // namespace pybind diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index f50a68c54114d5cce15418ad22f38c83163ba866..e6a9524382be219e550017ed4f1a6070dca22fbf 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -147,6 +147,7 @@ function cmake_gen() { -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DPY_VERSION=${PY_VERSION:-2.7} + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -178,7 +179,8 @@ EOF -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ - -DPY_VERSION=${PY_VERSION:-2.7} + -DPY_VERSION=${PY_VERSION:-2.7} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX:-/paddle/build} } @@ -361,7 +363,7 @@ EOF ctest --output-on-failure # make install should also be test when unittest make install -j `nproc` - pip install /usr/local/opt/paddle/share/wheels/*.whl + pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 9e4a5ae8baaf7f2975c8060856f9eecab55f241c..7bbdf7de89cc932e0023952e3c8e102f92b06855 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -19,17 +19,8 @@ from .framework import * # import all class inside executor into fluid module from . import executor from .executor import * - from . import trainer -from .trainer import Trainer -from .trainer import BeginEpochEvent -from .trainer import EndEpochEvent -from .trainer import BeginStepEvent -from .trainer import EndStepEvent -from .trainer import CheckpointConfig - from . import inferencer -from .inferencer import Inferencer from . import io from . import evaluator diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py new file mode 100644 index 0000000000000000000000000000000000000000..b8d5f4ffeadca0a7b103682f175d50dc46fa258a --- /dev/null +++ b/python/paddle/fluid/contrib/inferencer.py @@ -0,0 +1,112 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import contextlib
+
+from .. import core
+
+from .. import executor
+from .. import framework
+from .. import io
+from .. import parallel_executor
+from .. import unique_name
+from .trainer import check_and_get_place
+
+__all__ = ['Inferencer', ]
+
+
+class Inferencer(object):
+    """
+    Inferencer High Level API.
+
+    Args:
+        infer_func (Python func): Infer function that returns the predict Variable
+        param_path (str): The path where the inference model is saved by fluid.io.save_params
+        place (Place): place to do the inference
+        parallel (bool): use parallel_executor to run the inference; it will use multiple CPUs/GPUs.
+
+    Examples:
+        .. code-block:: python
+
+            def inference_program():
+                x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+                y_predict = fluid.layers.fc(input=x, size=1, act=None)
+                return y_predict
+
+            place = fluid.CPUPlace()
+            inferencer = fluid.Inferencer(
+                infer_func=inference_program, param_path="/tmp/model", place=place)
+
+    """
+
+    def __init__(self, infer_func, param_path, place=None, parallel=False):
+        self.param_path = param_path
+        self.scope = core.Scope()
+        self.parallel = parallel
+        self.place = check_and_get_place(place)
+
+        self.inference_program = framework.Program()
+        with framework.program_guard(self.inference_program):
+            with unique_name.guard():
+                self.predict_var = infer_func()
+
+        with self._prog_and_scope_guard():
+            # load params from param_path into scope
+            io.load_params(executor.Executor(self.place), param_path)
+
+        if parallel:
+            with self._prog_and_scope_guard():
+                self.exe = parallel_executor.ParallelExecutor(
+                    use_cuda=isinstance(self.place, core.CUDAPlace),
+                    loss_name=self.predict_var.name)
+        else:
+            self.exe = executor.Executor(self.place)
+
+        self.inference_program = self.inference_program.clone(for_test=True)
+
+    def infer(self, inputs, return_numpy=True):
+        """
+        Do Inference for Inputs
+
+        Args:
+            inputs (map): a map of {"input_name": input_var} that will be fed into the inference program
+            return_numpy (bool): whether to transform the return value into numpy arrays
+
+        Returns:
+            Tensor or Numpy: the predict value of the inference model for the inputs
+
+        Examples:
+            .. code-block:: python
+
+                tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
+                results = inferencer.infer({'x': tensor_x})
+        """
+        if not isinstance(inputs, dict):
+            raise ValueError(
+                "inputs should be a map of {'input_name': input_var}")
+
+        with self._prog_and_scope_guard():
+            results = self.exe.run(feed=inputs,
+                                   fetch_list=[self.predict_var.name],
+                                   return_numpy=return_numpy)
+
+        return results
+
+    @contextlib.contextmanager
+    def _prog_and_scope_guard(self):
+        with framework.program_guard(main_program=self.inference_program):
+            with executor.scope_guard(self.scope):
+                yield
diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8569e486f91786b5562e84dcdccf6d91da0612cc
--- /dev/null
+++ b/python/paddle/fluid/contrib/trainer.py
@@ -0,0 +1,1258 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import contextlib
+import os
+import errno
+import shutil
+import six
+import time
+
+from .. import core
+from .. import data_feeder
+from .. import executor
+from .. import framework
+from .. import io
+# `optimizer` has the same name as the parameter of Trainer.__init__, so rename it to opt_module
+from .. import optimizer as opt_module
+from .. import parallel_executor
+from ..transpiler import distribute_transpiler
+
+__all__ = [
+    'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent',
+    'EndStepEvent', 'CheckpointConfig'
+]
+
+
+class BeginEpochEvent(object):
+    """
+    The beginning of a training epoch.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+    """
+
+    def __init__(self, epoch_id):
+        self.epoch = epoch_id
+
+
+class EndEpochEvent(object):
+    """
+    The end of a training epoch.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+    """
+
+    def __init__(self, epoch_id):
+        self.epoch = epoch_id
+
+
+class BeginStepEvent(object):
+    """
+    The beginning of a training step.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+        step_id(int): The current step ID.
+    """
+
+    def __init__(self, epoch_id, step_id):
+        self.epoch = epoch_id
+        self.step = step_id
+        self.fetch_metrics = True
+        """
+        If fetch_metrics is true, the metrics will be fetched at the
+        EndStepEvent. Default is True.
+        """
+
+
+class EndStepEvent(object):
+    """
+    The end of a training step.
+
+    Args:
+        epoch_id(int): The current epoch ID.
+        step_id(int): The current step ID.
+        metrics(list): A list of fetched tensor. The order of this list is same
+            as the :code:`train_func` returns.
+    """
+
+    def __init__(self, epoch_id, step_id, metrics):
+        self.epoch = epoch_id
+        self.step = step_id
+        self.metrics = metrics
+
+
+class CheckpointConfig(object):
+    """
+    Parameter object for :code:`save_checkpoint` and
+    :code:`fluid.Trainer`. Used to configure how to save checkpoints.
+
+    Args:
+        checkpoint_dir(str): Directory path to save checkpoints. Default is the
+            current directory.
+
+        max_num_checkpoints(int): The max number of local checkpoints to keep.
+        epoch_interval(int): Save a checkpoint every this number of epochs.
+        step_interval(int): Save a checkpoint every this number of steps.
+
+    Examples:
+        >>> config = fluid.CheckpointConfig("./checkpoints")
+        >>> trainer = fluid.Trainer(train_func=train_program,
+        >>>                         place=place,
+        >>>                         optimizer_func=optimizer_func,
+        >>>                         checkpoint_config=config)
+        >>> trainer.train(...)
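+        >>> # with the defaults below, a checkpoint is written every epoch
+        >>> # (epoch_interval=1) and every 10 steps (step_interval=10)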
+ """ + + def __init__(self, + checkpoint_dir=None, + max_num_checkpoints=3, + epoch_interval=1, + step_interval=10): + + assert epoch_interval >= 1 + assert step_interval >= 1 + + self.checkpoint_dir = checkpoint_dir \ + if checkpoint_dir is not None else os.getcwd() + self.max_num_checkpoints = max_num_checkpoints + self.epoch_interval = epoch_interval + self.step_interval = step_interval + self.epoch_id = 0 + self.step_id = 0 + self.load_serial = None + self.pserver_id = None + self.lookup_table_name = None + + +def check_and_get_place(place): + """ + Check the type of place or get the default place + Args: + place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on. + + Raises: + TypeError if the type mismatched. + + Returns: + the original place if it is not None. + if fluid is compiled with CUDA, returns CUDAPlace(0) by default. + Otherwise returns CPUPlace by default. + """ + if place is None: + if core.is_compiled_with_cuda(): + return core.CUDAPlace(0) + else: + return core.CPUPlace() + else: + if not isinstance(place, core.CUDAPlace) and not isinstance( + place, core.CPUPlace): + raise TypeError("Place should be either CUDAPlace or CPUPlace") + return place + + +class Trainer(object): + """ + A trainer wraps MultiGPU/MultiNode training loops and can be used to train a + simple neural network easily. + + This API takes a :code:`train_func`. A :code:`train_func` is a function that + return loss as it first return value. The reset value can be fetched by + EndStepEvent.metrics + + This API also takes a :code:`optimizer_func` that will return an optimizer + instance. + + For example, to train a MLP for MNIST dataset, the sample program is + + >>> import paddle.fluid as fluid + >>> + >>> def mlp(image, layer_sizes=[200, 100], activation="relu", num_classes=10): + >>> hidden = image + >>> for layer_size in layer_sizes: + >>> hidden = fluid.layers.fc(input=hidden, size=layer_size, act=activation) + >>> return fluid.layers.fc(input=hidden, size=num_classes, act="softmax") + >>> + >>> def train_mnist_mlp(): + >>> img = fluid.layers.data(name='image', shape=[784]) + >>> label = fluid.layers.data(name='label', shape=[1], dtype='int64') + >>> prediction = mlp(img) + >>> return fluid.layers.mean(fluid.layers.cross_entropy(prediction, label)) + >>> + >>> def optimizer(): + >>> return fluid.optimizer.Adam() + >>> + >>> trainer = Trainer(train_func=train_mnist_mlp, + >>> optimizer_func=optimizer, + >>> place=fluid.CUDAPlace(0), + >>> parallel=True) + >>> + >>> def train_callback(event): + >>> if isinstance(event, fluid.EndStepEvent): + >>> print "Epoch ID", event.epoch, "Step ID",\ + >>> event.step, "AvgLoss", event.metrics[0] + >>> elif isinstance(event, fluid.EndEpochEvent): + >>> trainer.save_params("./model_{0}".format(event.epoch)) + >>> + >>> trainer.train(num_epochs=100, event_handler=train_callback) + + For more example, please see :ref:`api_guide_high_level_api`. + + + Args: + train_func(callable): A function which will return loss. The loss must be + a scalar tensor. + optimizer_func(callable): A function that returns an Optimizer object. + place(CUDAPlace|CPUPlace): The device place of this trainer. If + :code:`parallel=True,` all CUDA Places will be used if :code:`place` + is a :code:`CUDAPlace`. + parallel(bool): True if use multiple devices. + checkpoint_config(CheckpointConfig): Configuration about how to save + checkpoints. 
+ """ + + def __init__(self, + train_func, + optimizer_func, + param_path=None, + place=None, + parallel=False, + checkpoint_config=None): + self.__stop = False + self.parallel = parallel + + # config for checkpoint + # only chief worker will save variables + self.trainer_id = 0 + self.checkpoint_cfg = checkpoint_config + if self.checkpoint_cfg: + assert isinstance(self.checkpoint_cfg, CheckpointConfig) + serial = _get_latest_checkpoint_serial( + self.checkpoint_cfg.checkpoint_dir) + self.checkpoint_cfg.load_serial = serial if serial >= 0 else None + + self.scope = core.Scope() + + # 1. we need to generate a framework.Program by calling + # program_func. Reference: fluid.program_guard in + # test_word2vec.py + + self.startup_program = framework.Program() + self.train_program = framework.Program() + + with framework.program_guard(self.train_program, self.startup_program): + program_func_outs = train_func() + self.train_func_outputs = program_func_outs if isinstance( + program_func_outs, list) else [program_func_outs] + self.test_program = self.train_program.clone(for_test=True) + + # The first element of program_func_outs is loss. + loss = self.train_func_outputs[0] + + optimizer = optimizer_func() + if not isinstance(optimizer, opt_module.Optimizer): + raise TypeError( + "The optimizer should be an instance of Optimizer") + optimize_ops, params_grads = optimizer.minimize(loss) + + self.place = check_and_get_place(place) + + self._dist_transpile_if_necessary(optimize_ops, params_grads) + + # 2. move the default_main_program to self.program and run the + # default_startup program on an empty core.Scope() + # Run startup program + with self._prog_and_scope_guard(): + exe = executor.Executor(place) + exe.run(self.startup_program) + + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial is not None: + self._load_checkpoint() + + if param_path and os.path.isdir(param_path): + with self._prog_and_scope_guard(): + # load params from param_path into scope + io.load_persistables( + executor=exe, + dirname=param_path, + main_program=self.startup_program) + + def _transpile_nccl2_dist(self): + # PADDLE_TRAINER_IPS + if "PADDLE_TRAINER_IPS" not in os.environ: + self.nccl_id_var = None + else: + self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + port = os.getenv("PADDLE_PSERVER_PORT") + worker_ips = os.getenv("PADDLE_TRAINER_IPS") + worker_endpoints = [] + for ip in worker_ips.split(","): + worker_endpoints.append(':'.join([ip, port])) + self.num_trainers = len(worker_endpoints) + current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port + worker_endpoints.remove(current_endpoint) + # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id + # in ParallelExecutor to start + # distributed training using NCCL2 + self.nccl_id_var = self.startup_program.global_block().create_var( + name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) + self.startup_program.global_block().append_op( + type="gen_nccl_id", + inputs={}, + outputs={"NCCLID": self.nccl_id_var}, + attrs={ + "endpoint": current_endpoint, + "endpoint_list": worker_endpoints, + "trainer_id": self.trainer_id + }) + + def _dist_transpile_if_necessary(self, optimize_ops, params_grads): + self._transpile_nccl2_dist() + if self.nccl_id_var != None: + return + + if "PADDLE_TRAINING_ROLE" not in os.environ: + return + + # the port of all pservers, needed by both trainer and pserver + port = os.getenv("PADDLE_PSERVER_PORT", "6174") + # comma separated ips of all pservers, needed by trainer and + # pserver + 
pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
+        eplist = []
+        for ip in pserver_ips.split(","):
+            eplist.append(':'.join([ip, port]))
+        pserver_endpoints = ",".join(eplist)
+        # total number of workers/trainers in the job, needed by
+        # trainer and pserver
+        trainers = int(os.getenv("PADDLE_TRAINERS"))
+        # the IP of the local machine, needed by pserver only
+        current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
+        # the unique trainer id, starting from 0, needed by trainer
+        # only
+        self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+
+        # the role, should be either PSERVER or TRAINER
+        training_role = os.getenv("PADDLE_TRAINING_ROLE")
+        with self._prog_and_scope_guard():
+            t = distribute_transpiler.DistributeTranspiler()
+            t.transpile(
+                self.trainer_id, pservers=pserver_endpoints, trainers=trainers)
+            if training_role == "PSERVER":
+                if self.checkpoint_cfg:
+                    pserver_id = eplist.index(current_endpoint)
+                    self.checkpoint_cfg.pserver_id = pserver_id
+                    if t.has_distributed_lookup_table:
+                        self.checkpoint_cfg.lookup_table_name = t.table_name
+
+                self.train_program = t.get_pserver_program(current_endpoint)
+                self.startup_program = t.get_startup_program(current_endpoint,
+                                                             self.train_program)
+            elif training_role == "TRAINER":
+                self.train_program = t.get_trainer_program()
+            else:
+                raise ValueError(
+                    'PADDLE_TRAINING_ROLE environment variable must be either TRAINER or PSERVER'
+                )
+
+    def stop(self):
+        """
+        Stop training.
+        """
+        self.__stop = True
+
+    def train(self, num_epochs, event_handler, reader=None, feed_order=None):
+        """
+        Start the train loop to train the model.
+
+        Args:
+            num_epochs(int): The number of epochs. An epoch will process all data in the reader
+            event_handler(callable): The event handler. A function with type (ev:Event)->void
+            reader(callable): A reader creator object. See also
+                :ref:`api_guide_python_reader` .
+            feed_order(list): Feeding order of reader. None will follow the
+                defining order in the program
+
+        Returns:
+            None
+        """
+        training_role = os.getenv("PADDLE_TRAINING_ROLE", "")
+        if training_role == "PSERVER":
+            with self._prog_and_scope_guard():
+                exe = executor.Executor(self.place)
+                exe.run()
+                return
+        if self.parallel:
+            self._train_by_parallel_executor(num_epochs, event_handler, reader,
+                                             feed_order)
+        else:
+            self._train_by_executor(num_epochs, event_handler, reader,
+                                    feed_order)
+
+    def test(self, reader, feed_order):
+        """
+        Test the model on given test data
+
+        Args:
+            reader(callable): The reader that yields test data.
+            feed_order(list): Feeding order of reader. None will follow the
+                defining order in the program
+        """
+
+        return self._test_by_executor(reader, feed_order,
+                                      self.train_func_outputs)
+
+    def save_params(self, param_path):
+        """
+        Save all parameters into :code:`param_path`.
+
+        Args:
+            param_path(str): The path to save parameters.
+
+        Returns:
+            None
+        """
+        with self._prog_and_scope_guard():
+            exe = executor.Executor(self.place)
+            io.save_persistables(exe, dirname=param_path)
+
+    def save_inference_model(self, param_path, feeded_var_names,
+                             target_var_indexes):
+        """
+        Save model for cpp inference into :code:`param_path`.
+
+        Args:
+            param_path(str): The path to save parameters.
+            feeded_var_names(list(str)): The names of the vars that you
+                need to feed in before running the program.
+            target_var_indexes(list(int)): the indexes of the target vars
+                that you return in trainer.train_func.
+ Returns: + None + """ + with self._prog_and_scope_guard(): + exe = executor.Executor(self.place) + target_vars = [ + self.train_func_outputs[index] for index in target_var_indexes + ] + io.save_inference_model(param_path, feeded_var_names, target_vars, + exe) + + @contextlib.contextmanager + def _prog_and_scope_guard(self): + with framework.program_guard( + main_program=self.train_program, + startup_program=self.startup_program): + with executor.scope_guard(self.scope): + yield + + def _train_by_executor(self, num_epochs, event_handler, reader, feed_order): + """ + Train by Executor and single device. + + Args: + num_epochs: + event_handler: + reader: + feed_order: + + Returns: + + """ + with self._prog_and_scope_guard(): + feed_var_list = build_feed_var_list(self.train_program, feed_order) + feeder = data_feeder.DataFeeder( + feed_list=feed_var_list, place=self.place) + exe = executor.Executor(self.place) + reader = feeder.decorate_reader(reader, multi_devices=False) + self._train_by_any_executor(event_handler, exe, num_epochs, reader) + + def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): + if self.checkpoint_cfg: + epochs = [ + epoch_id for epoch_id in range(num_epochs) + if epoch_id >= self.checkpoint_cfg.epoch_id + ] + else: + epochs = [epoch_id for epoch_id in range(num_epochs)] + + for epoch_id in epochs: + event_handler(BeginEpochEvent(epoch_id)) + for step_id, data in enumerate(reader()): + if self.__stop: + if self.checkpoint_cfg: + self._clean_checkpoint() + return + + if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \ + and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id: + continue + + begin_event = BeginStepEvent(epoch_id, step_id) + event_handler(begin_event) + if begin_event.fetch_metrics: + metrics = exe.run(feed=data, + fetch_list=[ + var.name + for var in self.train_func_outputs + ]) + else: + metrics = exe.run(feed=data, fetch_list=[]) + + if self.checkpoint_cfg: + self._save_checkpoint(epoch_id, step_id) + event_handler(EndStepEvent(epoch_id, step_id, metrics)) + event_handler(EndEpochEvent(epoch_id)) + if self.checkpoint_cfg: + self._clean_checkpoint() + + def _test_by_executor(self, reader, feed_order, fetch_list): + with executor.scope_guard(self.scope): + feed_var_list = build_feed_var_list(self.test_program, feed_order) + feeder = data_feeder.DataFeeder( + feed_list=feed_var_list, place=self.place) + exe = executor.Executor(self.place) + accumulated = len(fetch_list) * [0] + count = 0 + for data in reader(): + outs = exe.run(program=self.test_program, + feed=feeder.feed(data), + fetch_list=fetch_list) + accumulated = [x[0] + x[1][0] for x in zip(accumulated, outs)] + count += 1 + + return [x / count for x in accumulated] + + def _train_by_parallel_executor(self, num_epochs, event_handler, reader, + feed_order): + with self._prog_and_scope_guard(): + pe = self._get_or_create_parallel_executor() + feed_var_list = build_feed_var_list(self.train_program, feed_order) + feeder = data_feeder.DataFeeder( + feed_list=feed_var_list, place=self.place) + reader = feeder.decorate_reader(reader, multi_devices=True) + self._train_by_any_executor(event_handler, pe, num_epochs, reader) + + def _get_parallel_executor(self): + return getattr(self, 'parallel_executor', None) + + def _get_or_create_parallel_executor(self): + if self._get_parallel_executor() is None: + self.parallel_executor = parallel_executor.ParallelExecutor( + use_cuda=isinstance(self.place, core.CUDAPlace), + 
loss_name=self.train_func_outputs[0].name)
+        return self._get_parallel_executor()
+
+    def _clean_checkpoint(self):
+        assert self.checkpoint_cfg
+        clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir)
+
+    def _get_checkpoint_load_args(self):
+        """
+        epoch_id and step_id are runtime arguments; they are not variables,
+        so load them independently.
+        """
+        return ["epoch_id", "step_id"]
+
+    def _get_checkpoint_save_args(self, epoch_id, step_id):
+        """
+        epoch_id and step_id are runtime arguments; they are not variables,
+        so save them independently.
+        """
+        trainer_args = {}
+        trainer_args["epoch_id"] = epoch_id
+        trainer_args["step_id"] = step_id
+        return trainer_args
+
+    def _save_checkpoint(self, epoch_id, step_id):
+        assert self.checkpoint_cfg
+
+        if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \
+                and step_id % self.checkpoint_cfg.step_interval == 0:
+            exe = executor.Executor(self.place)
+            save_checkpoint(
+                executor=exe,
+                checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
+                trainer_id=self.trainer_id,
+                trainer_args=self._get_checkpoint_save_args(epoch_id, step_id),
+                main_program=self.train_program,
+                max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints)
+
+    def _load_checkpoint(self):
+        with self._prog_and_scope_guard():
+            exe = executor.Executor(self.place)
+            load_checkpoint(
+                executor=exe,
+                checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
+                main_program=self.startup_program)
+
+            if not self.checkpoint_cfg.pserver_id:
+                load_trainer_args = self._get_checkpoint_load_args()
+                trainer_args = load_checkpoint(
+                    executor=exe,
+                    checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
+                    main_program=self.startup_program,
+                    role_id=self.trainer_id,
+                    is_trainer=True,
+                    load_trainer_args=load_trainer_args)
+
+                if len(trainer_args) != 2:
+                    raise ValueError(
+                        "the returned trainer_args length does not match _get_checkpoint_load_args"
+                    )
+                self.checkpoint_cfg.epoch_id = int(trainer_args[0])
+                self.checkpoint_cfg.step_id = int(trainer_args[1])
+            else:
+                if self.checkpoint_cfg.lookup_table_name:
+                    load_checkpoint(
+                        executor=exe,
+                        checkpoint_dir=self.checkpoint_cfg.checkpoint_dir,
+                        main_program=self.startup_program,
+                        role_id=self.checkpoint_cfg.pserver_id,
+                        is_trainer=False,
+                        load_trainer_args=None,
+                        load_lookup_table=self.checkpoint_cfg.lookup_table_name)
+
+
+def build_feed_var_list(program, feed_order):
+    if not isinstance(program, framework.Program):
+        raise TypeError("The 'program' should be an object of Program")
+
+    if isinstance(feed_order, list):
+        feed_var_list = [
+            program.global_block().var(var_name) for var_name in feed_order
+        ]
+    else:
+        if not isinstance(feed_order, dict):
+            raise TypeError(
+                "The 'feed_order' should be either None, list or dict.")
+        if not sorted(feed_order.values()) == list(range(len(feed_order))):
+            raise ValueError(
+                "The values of 'feed_order' should be a permutation of [0, len(feed_order))"
+            )
+        sorted_pair_list = sorted(
+            six.iteritems(feed_order), key=lambda item: item[1])
+        feed_var_list = [
+            program.global_block().var(pair[0]) for pair in sorted_pair_list
+        ]
+    return feed_var_list
+
+
+# Checkpoint APIs were moved here from io.py; all of them are private now.
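+# With the constants below, a checkpoint directory is laid out roughly as
+# <checkpoint_dir>/checkpoint_<serial>/{__model__, __lookup_table__, trainer_<id>}
+# plus a _SUCCESS mark file (an illustrative sketch of the on-disk layout).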
+SUCCESS_MARK_FILENAME = "_SUCCESS"
+CHECKPOINT_PREFIX = "checkpoint"
+MODEL_DIR = "__model__"
+LOOKUP_TABLE_DIR = "__lookup_table__"
+TRAINER_PREFIX = "trainer"
+CHECKPOINT_SEPARATOR = "_"
+
+
+def save_checkpoint(executor,
+                    checkpoint_dir,
+                    trainer_id,
+                    main_program,
+                    trainer_args=None,
+                    max_num_checkpoints=3,
+                    lookup_table=None,
+                    pserver_endpoints=None):
+    """
+    This function selects all checkpoint variables from the given
+    main_program and then saves these variables to the `checkpoint_dir`
+    directory.
+
+    In the training process, we generally save a checkpoint in each
+    iteration. So there might be a lot of checkpoints in the
+    `checkpoint_dir`. To avoid them taking too much disk space, the
+    `max_num_checkpoints` argument is introduced to limit the total number
+    of checkpoints. If the number of existing checkpoints is greater than
+    `max_num_checkpoints`, the oldest ones are deleted.
+
+    A variable is a checkpoint variable and will be saved if it meets
+    all of the following conditions:
+    1. It is persistable.
+    2. Its type is not FEED_MINIBATCH, FETCH_LIST or RAW.
+    3. Its name contains no "@GRAD", ".trainer_" or ".block".
+
+    Args:
+        executor(Executor): The executor to run for saving the checkpoint.
+        checkpoint_dir(str): The folder to save checkpoints into.
+        trainer_id(int): The current trainer id; the trainer with id 0 is
+            the chief.
+        main_program(Program): The program whose checkpoint variables will
+            be saved.
+        trainer_args(dict|None): Current training arguments, such as
+            'epoch_id' and 'step_id'.
+            Default: None
+        max_num_checkpoints(int): The maximum number of checkpoints to keep.
+            Default: 3
+        lookup_table(string|None): The lookup table name. When using a
+            distributed lookup table, the name can be obtained from
+            DistributeTranspiler.table_name.
+        pserver_endpoints(list|None): The parameter server ip:port list.
+            When using a distributed lookup table, it can be obtained from
+            the distribute arguments.
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `checkpoint_dir` is None.
+        AssertionError: If `trainer_args` is not a dict.
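+
+    Note:
+        A sketch of the retention policy (the serial numbers below are only
+        an example): with `max_num_checkpoints=3` and existing directories
+        checkpoint_0 .. checkpoint_3, a new save creates checkpoint_4 and
+        then deletes checkpoint_0 and checkpoint_1, keeping the three
+        newest serials (see `_scroll_delete` below).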
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./checkpoints"
+            prog = fluid.default_main_program()
+            trainer_args = {"epoch_id": 200,
+                            "step_id": 20} # just an example
+            table_name = "share_w"
+            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
+
+            save_checkpoint(executor=exe,
+                            checkpoint_dir=path,
+                            trainer_id=0,
+                            trainer_args=trainer_args,
+                            main_program=prog,
+                            max_num_checkpoints=3,
+                            lookup_table=table_name,
+                            pserver_endpoints=ps_endpoints)
+    """
+    if checkpoint_dir is None:
+        raise ValueError("'checkpoint_dir' should not be None")
+
+    if main_program is None:
+        raise ValueError('main_program should not be None.')
+
+    if trainer_args:
+        assert isinstance(trainer_args, dict)
+
+    is_chief = trainer_id == 0
+
+    _make_chekcpoint_dirs(checkpoint_dir)
+    serial = _get_latest_checkpoint_serial(checkpoint_dir) + 1
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
+
+    _save_trainer_args(cur_dir, trainer_id, trainer_args)
+
+    if is_chief:
+        _save_persist_vars_without_grad(executor, cur_dir, main_program)
+
+    if is_chief and lookup_table and pserver_endpoints:
+        _save_pserver_vars_by_notify(executor, cur_dir, lookup_table,
+                                     pserver_endpoints)
+
+    _scroll_delete(checkpoint_dir, max_num_checkpoints)
+
+
+def load_checkpoint(executor,
+                    checkpoint_dir,
+                    main_program,
+                    role_id=0,
+                    is_trainer=True,
+                    load_trainer_args=None,
+                    load_lookup_table=None):
+    """
+    This function selects all checkpoint variables from the given
+    main_program and then tries to load these variables from the
+    `checkpoint_dir` directory.
+
+    In the training process, we generally save a checkpoint in each
+    iteration. So there may be more than one checkpoint in
+    `checkpoint_dir` (each checkpoint has its own sub folder); the
+    checkpoint with the latest serial number is loaded.
+
+    A variable is a checkpoint variable and will be loaded if it meets
+    all of the following conditions:
+    1. It is persistable.
+    2. Its type is not FEED_MINIBATCH, FETCH_LIST or RAW.
+    3. Its name contains no "@GRAD", ".trainer_" or ".block".
+
+    Args:
+        executor(Executor): The executor to run for loading the checkpoint.
+        checkpoint_dir(str): The folder where all checkpoints are.
+        main_program(Program): The program whose checkpoint variables will
+            be loaded.
+        role_id(int): The trainer id or the parameter server id.
+        is_trainer(bool): True for a trainer, False for a parameter server.
+        load_trainer_args(list|None): The names of the trainer arguments to
+            load, e.g. ["epoch_id", "step_id"].
+        load_lookup_table(str|None): The lookup table name.
+
+    Returns:
+        None
+
+    Raises:
+        ValueError: If `checkpoint_dir` is None.
+        ValueError: If `main_program` is None.
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            path = "./checkpoints"
+            prog = fluid.default_main_program()
+            load_checkpoint(executor=exe, checkpoint_dir=path,
+                            main_program=prog)
+
+            # In this example, `load_checkpoint` first selects all
+            # checkpoint variables in the default main program, and then
+            # tries to load these variables from the latest checkpoint
+            # folder, e.g. "./checkpoints/checkpoint_9/__model__".
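+            # For a parameter server restoring a distributed lookup table,
+            # the call would instead look like this (a sketch; role_id and
+            # the table name are illustrative):
+            # load_checkpoint(executor=exe, checkpoint_dir=path,
+            #                 main_program=prog, role_id=1,
+            #                 is_trainer=False, load_trainer_args=None,
+            #                 load_lookup_table="share_w")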
+ """ + + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + + serial = _get_latest_checkpoint_serial(checkpoint_dir) + + # there are nothing need to be loaded + if serial is None or serial < 0: + return + + if main_program is None: + raise ValueError('main_program should not be None.') + + if is_trainer and load_trainer_args is None: + cur_dir = _get_serial_dir(checkpoint_dir, serial) + _load_persist_vars_without_grad(executor, cur_dir, main_program, True) + return + + if is_trainer and load_trainer_args: + return _load_trainer_args(checkpoint_dir, serial, role_id, + load_trainer_args) + + if not is_trainer and load_lookup_table: + _load_lookup_table_vars(executor, checkpoint_dir, main_program, role_id, + load_lookup_table) + + +def clean_checkpoint(checkpoint_dir, delete_dir=False): + """ + clean the checkpoint dir, when the train exits normally, + the trainer will call clean_checkpoint to delete checkpoint directory saved before. + delete_dir only works when the directory is empty, otherwise, OSError is raised. + + : param checkpoint_dir + : param delete_dir + """ + + if checkpoint_dir is None: + raise ValueError("'checkpoint_dir' should not be None") + _scroll_delete(checkpoint_dir, max_num_checkpoints=0) + + if delete_dir and not os.listdir(checkpoint_dir): + os.rmdir(checkpoint_dir) + + +def _load_persist_vars_without_grad(executor, + dirname, + program, + has_model_dir=False): + """ + This function filters out all checkpoint variables from the give + program and then trys to load these variables from the given directory. + + A variable is a checkpoint variable if it meets all following + conditions: + 1. It's persistable. + 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. + 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". + + Args: + executor(Executor): The executor to run for loading variables. + dirname(str): The directory path. + program(Program): The program whose checkpoint variables will + be loaded. + has_model_dir(bool): if True, the function loads variables + from a sub directory named '__model__'. + Default: False + + Returns: + None + + Examples: + .. code-block:: python + + exe = fluid.Executor(fluid.CPUPlace()) + param_path = "./my_paddle_model" + prog = fluid.default_main_program() + _load_persist_vars_without_grad(executor=exe, + dirname=param_path, program=prog, has_model_dir=True) + + # In this example, `_load_persist_vars_without_grad` function + # will first filters out all checkpoint variables in the default + # main program, and then trys to load these variables form the + # folder "./my_paddle_model/__model__". + """ + + if has_model_dir: + dirname = _get_model_dir(dirname) + + io.load_vars( + executor, + dirname=dirname, + main_program=program, + predicate=_is_checkpoint_var, + filename=None) + + +def _load_lookup_table_vars(executor, dirname, program, pserver_id, table_name): + """ + The parameter server will load lookup table's local file in + selectedrows variable. + + Args: + executor(Executor): The executor to run for loading persistable variables + dirname(str): The directory path + main_program(Program): Find the variable named table_name in main_program + pserver_id(int): the serial number in pserver_endpoints list + table_name(str): lookup table name + + Returns: + None + + Examples: + .. 
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            dirname = "./checkpoints/checkpoint_9/"
+            prog = fluid.default_main_program()
+            pserver_id = 1
+            table_name = "share_w"
+            _load_lookup_table_vars(executor=exe,
+                    dirname=dirname, program=prog, pserver_id=pserver_id,
+                    table_name=table_name)
+    """
+
+    for var in program.list_vars():
+        if var.name == table_name:
+            lookup_table_var = var
+            break
+
+    assert lookup_table_var is not None
+
+    lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR)
+    table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id)
+
+    load_prog = framework.Program()
+    load_block = load_prog.global_block()
+
+    load_block.append_op(
+        type='load',
+        inputs={},
+        outputs={'Out': [lookup_table_var]},
+        attrs={'file_path': os.path.join(lookup_table_dir, table_file)})
+
+    executor.run(load_prog)
+
+
+def _save_persist_vars_without_grad(executor, dirname, program):
+    """
+    This function selects all checkpoint variables from the given
+    program and then saves these variables to a sub-folder '__model__' of
+    the given directory.
+
+    A variable is a checkpoint variable if it meets all of the following
+    conditions:
+    1. It is persistable.
+    2. Its type is not FEED_MINIBATCH, FETCH_LIST or RAW.
+    3. Its name contains no "@GRAD", ".trainer_" or ".block".
+
+    Args:
+        executor(Executor): The executor to run for saving variables.
+        dirname(str): The directory path.
+        program(Program): The program whose checkpoint variables will
+            be saved.
+
+    Returns:
+        None
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            _save_persist_vars_without_grad(executor=exe,
+                    dirname=param_path, program=prog)
+
+            # In this example, `_save_persist_vars_without_grad` first
+            # selects all checkpoint variables in the default main program,
+            # and then saves these variables to the folder
+            # "./my_paddle_model/__model__".
+    """
+    cur_dir = _get_model_dir(dirname)
+    io.save_vars(
+        executor,
+        dirname=cur_dir,
+        main_program=program,
+        vars=None,
+        predicate=_is_checkpoint_var,
+        filename=None)
+    _write_success(cur_dir)
+
+
+def _save_pserver_vars_by_notify(executor, dirname, lookup_table,
+                                 ps_endpoint_list):
+    """
+    This function sends a checkpoint notify message from trainer 0 to all
+    the pservers. The notify message contains the lookup table name and
+    the absolute path on the pserver where the lookup_table should be
+    saved.
+
+    Args:
+        executor(Executor): The executor to run for sending the checkpoint
+            notify.
+        dirname(str): The folder where to save checkpoints.
+        lookup_table(string): The lookup table name. When using a
+            distributed lookup table, the name can be obtained from
+            DistributeTranspiler.table_name.
+        ps_endpoint_list(list): The parameter server ip:port list. When
+            using a distributed lookup table, it can be obtained from the
+            distribute arguments.
+
+    Returns:
+        None
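+
+    Note:
+        Internally this builds a one-op program containing a single
+        `checkpoint_notify` op whose `epmap` attribute lists every pserver
+        endpoint, and runs it once (see the body below).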
+
+    Examples:
+        .. code-block:: python
+
+            exe = fluid.Executor(fluid.CPUPlace())
+            param_path = "./my_paddle_model"
+            prog = fluid.default_main_program()
+            table_name = "share_w"
+            ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"]
+
+            _save_pserver_vars_by_notify(executor=exe,
+                    dirname=param_path, lookup_table=table_name,
+                    ps_endpoint_list=ps_endpoints)
+    """
+    cur_dir = _get_lookuptable_dir(dirname)
+
+    checkpoint_notify_program = framework.Program()
+    checkpoint_notify_block = checkpoint_notify_program.global_block()
+
+    attrs = {}
+    attrs['epmap'] = ps_endpoint_list
+    attrs['dir'] = cur_dir
+    attrs['lookup_table'] = lookup_table
+
+    checkpoint_notify_block.append_op(
+        type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs)
+    executor.run(checkpoint_notify_program)
+
+
+def _save_trainer_args(dirname, trainer_id, trainer_args):
+    assert isinstance(trainer_args, dict)
+
+    cur_dir = _get_trainer_dir(dirname, trainer_id)
+
+    for name, value in six.iteritems(trainer_args):
+        args_file = os.path.join(cur_dir, name)
+        with open(args_file, 'w') as f:
+            f.write(str(value))
+    _write_success(cur_dir)
+
+
+def _load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args):
+    """
+    The trainer loads some runtime arguments, such as epoch_id and step_id,
+    from its own trainer directory.
+
+    Args:
+        checkpoint_dir(str): The folder where all checkpoints are.
+        serial(int): The serial of the checkpoint to load from.
+        trainer_id(int): The current trainer id.
+        trainer_args(list): The names of the trainer arguments to load.
+
+    Returns:
+        list: the loaded argument values, in the same order as
+        trainer_args.
+
+    Examples:
+        .. code-block:: python
+
+            param_path = "./checkpoint/"
+            serial = 7
+            trainer_id = 2
+            trainer_args = ["epoch_id", "step_id"]
+
+            _load_trainer_args(checkpoint_dir=param_path, serial=serial,
+                    trainer_id=trainer_id, trainer_args=trainer_args)
+    """
+    assert isinstance(trainer_args, list)
+
+    cur_dir = _get_serial_dir(checkpoint_dir, serial)
+    cur_dir = _get_trainer_dir(cur_dir, trainer_id)
+
+    ret_values = []
+
+    for arg in trainer_args:
+        cur_file = os.path.join(cur_dir, arg)
+        with open(cur_file, 'r') as f:
+            contents = f.read()
+            ret_values.append(contents.strip())
+    return ret_values
+
+
+def _is_checkpoint_var(var):
+    """
+    Checkpoints do not save or load every variable: a variable whose type
+    is FEED_MINIBATCH, FETCH_LIST or RAW, or whose name contains "@GRAD",
+    ".trainer_" or ".block", is discarded.
+
+    :param var: the Variable to test
+    """
+    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+            var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \
+            var.desc.type() == core.VarDesc.VarType.RAW:
+        return False
+    # Names containing "@GRAD" belong to gradient variables; checkpoints do
+    # not save them.
+    if "@GRAD" in var.name:
+        return False
+    # Names containing ".trainer_" belong to distributed training
+    # variables; checkpoints do not save them.
+    if ".trainer_" in var.name:
+        return False
+
+    # Names containing ".block" belong to distributed training variables;
+    # checkpoints do not save them.
+    if ".block" in var.name:
+        return False
+
+    return var.persistable
+
+
+def _make_chekcpoint_dirs(dirs):
+    """
+    _make_chekcpoint_dirs creates the given local directory (including any
+    missing parents); if the directory already exists, it is ignored.
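+
+    For example (an illustrative path):
+
+    .. code-block:: python
+
+        _make_chekcpoint_dirs("./checkpoints/checkpoint_0/__model__")
+        # creates the nested path if it is absent;
+        # calling it again is a no-op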
+ """ + assert dirs is not None + + if os.path.isfile(dirs): + raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) + + if not os.path.isdir(dirs): + try: + os.makedirs(dirs) + except OSError as err: + if err.errno != errno.EEXIST: + raise err + + +def _get_dir_serial(dirname): + _, serial = dirname.split(CHECKPOINT_SEPARATOR) + + try: + serial_num = int(serial) + except ValueError: + serial_num = -1 + return serial_num + + +def _get_serial_dir(dirname, serial): + serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) + serial_dir = os.path.join(dirname, serial_folder) + _make_chekcpoint_dirs(serial_dir) + + return serial_dir + + +def _get_model_dir(dirname): + model_dir = os.path.join(dirname, MODEL_DIR) + _make_chekcpoint_dirs(model_dir) + return model_dir + + +def _get_lookuptable_dir(dirname): + lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) + _make_chekcpoint_dirs(lookuptable_dir) + return lookuptable_dir + + +def _get_trainer_dir(dirname, trainer_id): + trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) + trainer_dir = os.path.join(dirname, trainer_folder) + _make_chekcpoint_dirs(trainer_dir) + return trainer_dir + + +def _scroll_delete(dirname, max_num_checkpoints=3): + dirs = os.listdir(dirname) + serial_map = {} + for serial in dirs: + serial_num = _get_dir_serial(serial) + serial_map[serial_num] = serial + + if len(list(serial_map.keys())) <= max_num_checkpoints: + return + + serials = list(serial_map.keys()) + serials.sort(reverse=True) + serials = serials[max_num_checkpoints:] + for serial in serials: + cur_dir = _get_serial_dir(dirname, serial) + try: + shutil.rmtree(cur_dir) + except OSError as err: + if err.errno != errno.ENOENT: + raise err + + +def _write_success(dirname): + """ + write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. + + : param dirname + """ + success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) + with open(success_file, 'a') as f: + now = time.ctime() + f.write(now) + + +def _get_latest_checkpoint_serial(checkpoint_dir): + """ + get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory + + : param checkpoint_dir + """ + if not checkpoint_dir: + return -1 + + def has_success(checkpoint_dir, cur_dir): + """ + is _SUCCESS in this dir + """ + + serial = _get_dir_serial(cur_dir) + if serial == -1 or not os.path.isdir( + os.path.join(checkpoint_dir, cur_dir)): + return -1 + + success_path = os.path.join( + _get_serial_dir(checkpoint_dir, serial), MODEL_DIR, + SUCCESS_MARK_FILENAME) + if os.path.isfile(success_path): + return serial + + if not os.path.isdir(checkpoint_dir): + return -1 + + current_dir = -1 + dirs = os.listdir(checkpoint_dir) + for cur_dir in dirs: + success_num = has_success(checkpoint_dir, cur_dir) + if success_num > current_dir: + current_dir = success_num + return current_dir diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 1d3c94229048ef568dfa651cc20731190beee3b8..bced5fd1d9c617ab614212c811e86422d65a2e56 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -18,6 +18,7 @@ import collections import contextlib import re import six +import traceback import numpy as np @@ -34,11 +35,12 @@ except ImportError as e: except Exception as e: raise e from . 
import unique_name +import os +PADDLE_ON_MODEL_CE = os.environ.get('PADDLE_ON_MODEL_CE', None) is not None __all__ = [ 'Program', 'Operator', - 'Parameter', 'default_startup_program', 'default_main_program', 'program_guard', @@ -490,7 +492,8 @@ class OpProtoHolder(object): return { core.op_proto_and_checker_maker.kOpRoleAttrName(), core.op_proto_and_checker_maker.kOpRoleVarAttrName(), - core.op_proto_and_checker_maker.kOpNameScopeAttrName() + core.op_proto_and_checker_maker.kOpNameScopeAttrName(), + core.op_proto_and_checker_maker.kOpCreationCallstackAttrName() } @@ -573,6 +576,11 @@ class Operator(object): if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: del op_attrs[role_var_name] + if not PADDLE_ON_MODEL_CE: + callstack_var_name = op_maker.kOpCreationCallstackAttrName() + op_attrs[callstack_var_name] = list( + reversed(traceback.format_stack()))[1:] + if len(self.desc.type()) != 0: return if type is None: diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py index a9b94a20720615dbfca97749463f27dbc88ac64f..7bdd430f985bd0b3818f6ef305ce2d7d8976106b 100644 --- a/python/paddle/fluid/inferencer.py +++ b/python/paddle/fluid/inferencer.py @@ -12,101 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - -import contextlib - -from . import core - -from . import executor -from . import framework -from . import io -from . import parallel_executor -from . import unique_name -from .trainer import check_and_get_place - -__all__ = ['Inferencer', ] - - -class Inferencer(object): - """ - Inferencer High Level API. - - Args: - infer_func (Python func): Infer function that will return predict Variable - param_path (str): The path where the inference model is saved by fluid.io.save_params - place (Place): place to do the inference - parallel (bool): use parallel_executor to run the inference, it will use multi CPU/GPU. - - Examples: - .. code-block:: python - - def inference_program(): - x = fluid.layers.data(name='x', shape=[13], dtype='float32') - y_predict = fluid.layers.fc(input=x, size=1, act=None) - return y_predict - - place = fluid.CPUPlace() - inferencer = fluid.Inferencer( - infer_func=inference_program, param_path="/tmp/model", place=place) - - """ - - def __init__(self, infer_func, param_path, place=None, parallel=False): - self.param_path = param_path - self.scope = core.Scope() - self.parallel = parallel - self.place = check_and_get_place(place) - - self.inference_program = framework.Program() - with framework.program_guard(self.inference_program): - with unique_name.guard(): - self.predict_var = infer_func() - - with self._prog_and_scope_guard(): - # load params from param_path into scope - io.load_params(executor.Executor(self.place), param_path) - - if parallel: - with self._prog_and_scope_guard(): - self.exe = parallel_executor.ParallelExecutor( - use_cuda=isinstance(self.place, core.CUDAPlace), - loss_name=self.predict_var.name) - else: - self.exe = executor.Executor(self.place) - - self.inference_program = self.inference_program.clone(for_test=True) - - def infer(self, inputs, return_numpy=True): - """ - Do Inference for Inputs - - Args: - inputs (map): a map of {"input_name": input_var} that will be feed into the inference program - return_numpy (bool): transform return value into numpy or not - - Returns: - Tensor or Numpy: the predict value of the inference model for the inputs - - Examples: - .. 
code-block:: python - - tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32") - results = inferencer.infer({'x': tensor_x}) - """ - if not isinstance(inputs, dict): - raise ValueError( - "inputs should be a map of {'input_name': input_var}") - - with self._prog_and_scope_guard(): - results = self.exe.run(feed=inputs, - fetch_list=[self.predict_var.name], - return_numpy=return_numpy) - - return results - - @contextlib.contextmanager - def _prog_and_scope_guard(self): - with framework.program_guard(main_program=self.inference_program): - with executor.scope_guard(self.scope): - yield +# NOTE: inferencer is moved into fluid.contrib.inferencer. +__all__ = [] diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 6e0f3de4141aff3efb98b6d1162823a13f83a7bb..2cb61a9cd25c744710ab7ac9ea591902740f78da 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6471,12 +6471,14 @@ def _elementwise_op(helper): assert y is not None, 'y cannot be None in {}'.format(op_type) axis = helper.kwargs.get('axis', -1) use_mkldnn = helper.kwargs.get('use_mkldnn', False) - name = helper.kwargs.get('name', None) - if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) - else: - out = helper.create_variable( - name=name, dtype=x.dtype, persistable=False) + out = helper.kwargs.get('out', None) + if out is None: + name = helper.kwargs.get('name', None) + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) helper.append_op( type=op_type, @@ -6489,7 +6491,13 @@ def _elementwise_op(helper): @templatedoc() -def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): +def scale(x, + scale=1.0, + bias=0.0, + bias_after_scale=True, + out=None, + act=None, + name=None): """ ${comment} @@ -6498,6 +6506,7 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): scale(${scale_type}): ${scale_comment} bias(${bias_type}): ${bias_comment} bias_after_scale(${bias_after_scale_type}): ${bias_after_scale_comment} + out(Tensor): Output tensor. act(basestring|None): Activation applied to the output. name(basestring|None): Name of the output. 
@@ -6506,11 +6515,12 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): """ helper = LayerHelper('scale', **locals()) - if name is None: - out = helper.create_tmp_variable(dtype=x.dtype) - else: - out = helper.create_variable( - name=name, dtype=x.dtype, persistable=False) + if out is None: + if name is None: + out = helper.create_tmp_variable(dtype=x.dtype) + else: + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) helper.append_op( type='scale', @@ -6524,31 +6534,73 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): return helper.append_activation(out) -def elementwise_add(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_add(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): return _elementwise_op(LayerHelper('elementwise_add', **locals())) -def elementwise_div(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_div(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): return _elementwise_op(LayerHelper('elementwise_div', **locals())) -def elementwise_sub(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_sub(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): return _elementwise_op(LayerHelper('elementwise_sub', **locals())) -def elementwise_mul(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_mul(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): return _elementwise_op(LayerHelper('elementwise_mul', **locals())) -def elementwise_max(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_max(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): return _elementwise_op(LayerHelper('elementwise_max', **locals())) -def elementwise_min(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_min(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): return _elementwise_op(LayerHelper('elementwise_min', **locals())) -def elementwise_pow(x, y, axis=-1, use_mkldnn=False, act=None, name=None): +def elementwise_pow(x, + y, + out=None, + axis=-1, + use_mkldnn=False, + act=None, + name=None): return _elementwise_op(LayerHelper('elementwise_pow', **locals())) @@ -6560,6 +6612,7 @@ for func in [ func.__doc__ = _generate_doc_string_( op_proto, additional_args_lines=[ + "out (Tensor): The output tensor of elementwise op.", "act (basestring|None): Activation applied to the output.", "name (basestring|None): Name of the output." 
]) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 48d92c342d630582aebfae60b59d868f5893c422..7867bfe00e25711643eab1ab8d0141dbbad3da52 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -21,6 +21,7 @@ __activations_noattr__ = [ 'exp', 'tanh', 'tanh_shrink', + 'softshrink', 'sqrt', 'abs', 'ceil', @@ -52,7 +53,6 @@ __all__ = [ 'slice', 'shape', 'maxout', - 'softshrink', ] for _OP in set(__all__): diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 051fe84364639ca6028326c0cb02b204a02531af..06513801dd8b34d366f9632f6943c8046872c31b 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -21,6 +21,7 @@ __all__ = [ "sequence_conv_pool", "glu", "scaled_dot_product_attention", + "img_conv_group", ] diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 44af29d3390e35129d0ee65b31eacad6b28a9d60..57d272cbfb948840679e80e8db40379c57603113 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -74,28 +74,7 @@ class ParallelExecutor(object): build_strategy=None, num_trainers=1, trainer_id=0, - scope=None, - **kwargs): - if len(kwargs) != 0: - err_msg = "" - for key in kwargs: - if key in dir(ExecutionStrategy): - err_msg += \ - "Setting {0} by constructor is deprecated. Use " \ - "strategy=ExecutionStrategy(); strategy.{0}=xxx; " \ - "pe=ParallelExecutor(exec_strategy=strategy) " \ - "instead.\n ".format(key) - elif key in dir(BuildStrategy): - err_msg += \ - "Setting {0} by constructor is deprecated. Use " \ - "strategy=BuildStrategy(); See help(" \ - "paddle.fluid.ParallelExecutor.BuildStrategy) \n".format( - key) - else: - err_msg += "Setting {0} by constructor is deprecated. Use strategy.\n".format( - key) - raise ValueError(err_msg) - + scope=None): self._places = [] self._act_places = [] if use_cuda: diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index f0be794327f51cbbc4202b8b7b401b712b6d66a3..a51607bfdb1dde3d25f490770cc2ba368ceb27ff 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -185,7 +185,17 @@ class WeightNormParamAttr(ParamAttr): Args: dim(list): The parameter's name. Default None. - kwargs: Any field in ParamAttr. Default None. + name(str): The parameter's name. Default None. + initializer(Initializer): The method to initial this parameter. Default None. + learning_rate(float): The parameter's learning rate. The learning rate when + optimize is :math:`global\_lr * parameter\_lr * scheduler\_factor`. + Default 1.0. + regularizer(WeightDecayRegularizer): Regularization factor. Default None. + trainable(bool): Whether this parameter is trainable. Default True. + gradient_clip(BaseGradientClipAttr): The method to clip this parameter's + gradient. Default None. + do_model_average(bool): Whether this parameter should do model average. + Default False. Examples: .. code-block:: python @@ -204,6 +214,21 @@ class WeightNormParamAttr(ParamAttr): # these paramters for inference. 
params_with_weight_norm = [] - def __init__(self, dim=None, **kwargs): - super(WeightNormParamAttr, self).__init__(**kwargs) + def __init__(self, + dim=None, + name=None, + initializer=None, + learning_rate=1.0, + regularizer=None, + trainable=True, + gradient_clip=None, + do_model_average=False): + super(WeightNormParamAttr, self).__init__( + name=name, + initializer=initializer, + learning_rate=learning_rate, + regularizer=regularizer, + trainable=trainable, + gradient_clip=gradient_clip, + do_model_average=do_model_average) self.dim = dim diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py index e1368a3392a9cab3e82eff0a73eb225a52aa03bf..87f3b7502e26d3e6a437985f99d7897b060e101e 100644 --- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py @@ -16,6 +16,16 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import contextlib import numpy import unittest @@ -57,11 +67,11 @@ def optimizer_func(): def train(use_cuda, train_program, params_dirname, inference_model_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, place=place, optimizer_func=optimizer_func) def event_handler(event): - if isinstance(event, fluid.EndStepEvent): + if isinstance(event, EndStepEvent): if event.step == 10: test_metrics = trainer.test( reader=test_reader, feed_order=['x', 'y']) @@ -91,7 +101,7 @@ def infer(use_cuda, inference_program, params_dirname=None): return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=inference_program, param_path=params_dirname, place=place) batch_size = 10 diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py index de276755bb1eb2746cc780575a40357255223809..d744a00242422defb360590b193e07c6f811dcb9 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py @@ -14,11 +14,22 @@ from __future__ import print_function +import sys + import paddle import paddle.fluid as fluid + +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import paddle.fluid.core as core import numpy -import six import os import cifar10_small_test_set @@ -106,7 +117,7 @@ def train(use_cuda, train_program, parallel, params_dirname): paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) def event_handler(event): - if isinstance(event, fluid.EndStepEvent): + if isinstance(event, 
EndStepEvent): avg_cost, accuracy = trainer.test( reader=test_reader, feed_order=['pixel', 'label']) @@ -118,7 +129,7 @@ def train(use_cuda, train_program, parallel, params_dirname): return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, optimizer_func=optimizer_func, place=place, @@ -133,7 +144,7 @@ def train(use_cuda, train_program, parallel, params_dirname): def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=inference_program, param_path=params_dirname, place=place, diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py index dd547f3448ae55c07b6c09f9de4ac08d8ec5ee88..82294d4b26fe64e6cddc81f9ba3480caf5b51620 100644 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py +++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py @@ -14,11 +14,22 @@ from __future__ import print_function +import sys + import paddle import paddle.fluid as fluid + +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import paddle.fluid.core as core import numpy -import six import os import cifar10_small_test_set @@ -83,7 +94,7 @@ def train(use_cuda, train_program, parallel, params_dirname): paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE, drop_last=False) def event_handler(event): - if isinstance(event, fluid.EndStepEvent): + if isinstance(event, EndStepEvent): avg_cost, accuracy = trainer.test( reader=test_reader, feed_order=['pixel', 'label']) @@ -95,7 +106,7 @@ def train(use_cuda, train_program, parallel, params_dirname): return place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, place=place, optimizer_func=optimizer_func, @@ -110,7 +121,7 @@ def train(use_cuda, train_program, parallel, params_dirname): def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=inference_program, param_path=params_dirname, place=place, diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py index ec4e1c768c7f2a2421ac409a2eecc0100c086a6a..9e155a59145db88dab27576a4a67a5d450bcfc9d 100755 --- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py @@ -16,6 +16,16 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to 
paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import numpy as np WORD_DICT, VERB_DICT, LABEL_DICT = paddle.dataset.conll05.get_dict() @@ -149,7 +159,7 @@ def optimize_func(): def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, place=place, optimizer_func=optimize_func) feed_order = [ @@ -164,7 +174,7 @@ def train(use_cuda, train_program, params_dirname): # place) def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): + if isinstance(event, EndEpochEvent): test_reader = paddle.batch( paddle.dataset.conll05.test(), batch_size=BATCH_SIZE) avg_cost_set = trainer.test( @@ -184,7 +194,7 @@ def train(use_cuda, train_program, params_dirname): if math.isnan(float(avg_cost)): sys.exit("got NaN loss, training failed.") - elif isinstance(event, fluid.EndStepEvent): + elif isinstance(event, EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI @@ -204,7 +214,7 @@ def train(use_cuda, train_program, params_dirname): def infer(use_cuda, inference_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( inference_program, param_path=params_dirname, place=place) # Setup input by creating LoDTensor to represent sequence of words. diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py index 560f1189581f631dc6a3470cf8f22f902ca26f26..b597dcf801dc5ad4b5957875634018cfdcd0b83b 100644 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py @@ -13,17 +13,28 @@ # limitations under the License. 
from __future__ import print_function + import contextlib +import sys import numpy as np import paddle import paddle.fluid as fluid + +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import paddle.fluid.framework as framework import paddle.fluid.layers as pd from paddle.fluid.executor import Executor from functools import partial import unittest -import os dict_size = 30000 source_dict_dim = target_dict_dim = dict_size @@ -198,12 +209,12 @@ def train(use_cuda, is_sparse, is_local=True): ] def event_handler(event): - if isinstance(event, fluid.EndStepEvent): + if isinstance(event, EndStepEvent): print('pass_id=' + str(event.epoch) + ' batch=' + str(event.step)) if event.step == 10: trainer.stop() - trainer = fluid.Trainer( + trainer = Trainer( train_func=partial(train_program, is_sparse), place=place, optimizer_func=optimizer_func) diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py index 973308498bec3cddde2ef651751ad5d0c9f84503..ce183883e3bddd8633dd9c393ee358ba6210ea61 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py @@ -14,14 +14,22 @@ from __future__ import print_function -import argparse +import sys + import paddle.fluid as fluid + +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import paddle.fluid.core as core import paddle -import six -import sys import numpy -import unittest import math import sys import os @@ -68,14 +76,14 @@ def optimizer_func(): def train(use_cuda, train_program, parallel, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, place=place, optimizer_func=optimizer_func, parallel=parallel) def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): + if isinstance(event, EndEpochEvent): test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) avg_cost, acc = trainer.test( @@ -91,7 +99,7 @@ def train(use_cuda, train_program, parallel, params_dirname): event.epoch + 1, avg_cost, acc)) if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") - elif isinstance(event, fluid.EndStepEvent): + elif isinstance(event, EndStepEvent): print( ("Step {0}, Epoch {1} Metrics {2}".format( event.step, event.epoch, @@ -112,7 +120,7 @@ def train(use_cuda, train_program, parallel, params_dirname): def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=inference_program, param_path=params_dirname, place=place, diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py 
b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py index cb4aeb430e1a9662a183084c0cdacc41c5a8ec11..45a5ff34af00f2dbe69bd4f08a50626d6ca814f8 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py +++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py @@ -14,14 +14,22 @@ from __future__ import print_function -import argparse +import sys + import paddle.fluid as fluid + +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import paddle.fluid.core as core import paddle -import six -import sys import numpy -import unittest import math import sys import os @@ -55,14 +63,14 @@ def optimizer_func(): def train(use_cuda, train_program, params_dirname, parallel): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, place=place, optimizer_func=optimizer_func, parallel=parallel) def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): + if isinstance(event, EndEpochEvent): test_reader = paddle.batch( paddle.dataset.mnist.test(), batch_size=BATCH_SIZE) avg_cost, acc = trainer.test( @@ -94,7 +102,7 @@ def train(use_cuda, train_program, params_dirname, parallel): def infer(use_cuda, inference_program, parallel, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=inference_program, param_path=params_dirname, place=place, diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py index 9e2767783bb6748cfc8f95567627068d7532a8c8..82193737967b2bebdd17cef8752eeb9cec2e85ce 100644 --- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py +++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py @@ -19,6 +19,16 @@ import sys import numpy as np import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import paddle.fluid.layers as layers import paddle.fluid.nets as nets @@ -164,7 +174,7 @@ def optimizer_func(): def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, place=place, optimizer_func=optimizer_func) feed_order = [ @@ -173,7 +183,7 @@ def train(use_cuda, train_program, params_dirname): ] def event_handler(event): - if isinstance(event, fluid.EndStepEvent): + if isinstance(event, EndStepEvent): test_reader = paddle.batch( paddle.dataset.movielens.test(), batch_size=BATCH_SIZE) avg_cost_set = trainer.test( @@ -208,7 +218,7 @@ def train(use_cuda, train_program, params_dirname): def infer(use_cuda, inference_program, params_dirname): place = 
fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( inference_program, param_path=params_dirname, place=place) # Use the first data from paddle.dataset.movielens.test() as input. diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py index 097c2a468fca558106aba2f24c332256189d9076..14719774b9d90c2e96d8f6134469502241a5f1f2 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py @@ -16,6 +16,16 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * from functools import partial import numpy as np @@ -72,13 +82,13 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() word_dict = paddle.dataset.imdb.word_dict() - trainer = fluid.Trainer( + trainer = Trainer( train_func=partial(train_program, word_dict), place=place, optimizer_func=optimizer_func) def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): + if isinstance(event, EndEpochEvent): test_reader = paddle.batch( paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) avg_cost, acc = trainer.test( @@ -96,7 +106,7 @@ def train(use_cuda, train_program, params_dirname): event.epoch + 1, avg_cost, acc)) if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") - elif isinstance(event, fluid.EndStepEvent): + elif isinstance(event, EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI @@ -119,7 +129,7 @@ def infer(use_cuda, inference_program, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() word_dict = paddle.dataset.imdb.word_dict() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=partial(inference_program, word_dict), param_path=params_dirname, place=place) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py index 5f74cd142590abb93f8846bc831a9f5e3dd2f311..62fbba6fe1a62da6a93d50abc074bf5d794cf458 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py @@ -16,6 +16,16 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * from functools import partial import 
numpy as np @@ -87,13 +97,13 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() word_dict = paddle.dataset.imdb.word_dict() - trainer = fluid.Trainer( + trainer = Trainer( train_func=partial(train_program, word_dict), place=place, optimizer_func=optimizer_func) def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): + if isinstance(event, EndEpochEvent): test_reader = paddle.batch( paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE) avg_cost, acc = trainer.test( @@ -111,7 +121,7 @@ def train(use_cuda, train_program, params_dirname): event.epoch + 1, avg_cost, acc)) if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") - elif isinstance(event, fluid.EndStepEvent): + elif isinstance(event, EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI @@ -134,7 +144,7 @@ def infer(use_cuda, inference_program, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() word_dict = paddle.dataset.imdb.word_dict() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=partial(inference_program, word_dict), param_path=params_dirname, place=place) diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py index 284a6ca168636377699c287236c491352566909b..7523ad3fef17f61b1bde1fc687761cc6b86c3d9e 100644 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py +++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py @@ -16,6 +16,16 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * from functools import partial import numpy as np @@ -79,13 +89,13 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() word_dict = paddle.dataset.imdb.word_dict() - trainer = fluid.Trainer( + trainer = Trainer( train_func=partial(train_program, word_dict), place=place, optimizer_func=optimizer_func) def event_handler(event): - if isinstance(event, fluid.EndEpochEvent): + if isinstance(event, EndEpochEvent): test_reader = paddle.batch( paddle.dataset.imdb.test(word_dict), batch_size=BATCH_SIZE, @@ -105,7 +115,7 @@ def train(use_cuda, train_program, params_dirname): event.epoch + 1, avg_cost, acc)) if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") - elif isinstance(event, fluid.EndStepEvent): + elif isinstance(event, EndStepEvent): print("Step {0}, Epoch {1} Metrics {2}".format( event.step, event.epoch, list(map(np.array, event.metrics)))) if event.step == 1: # Run 2 iterations to speed CI @@ -129,7 +139,7 @@ def infer(use_cuda, inference_program, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() word_dict = paddle.dataset.imdb.word_dict() - inferencer = fluid.Inferencer( + inferencer = Inferencer( 
infer_func=partial(inference_program, word_dict), param_path=params_dirname, place=place) diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py index 1c7cf3199a07c3f65d967eda70a481b1bd1b1638..e4c0cc5429d3fe891034161d90fadfa9dd078b0b 100644 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py +++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py @@ -16,6 +16,16 @@ from __future__ import print_function import paddle import paddle.fluid as fluid +import sys +try: + from paddle.fluid.contrib.trainer import * + from paddle.fluid.contrib.inferencer import * +except ImportError: + print( + "In the fluid 1.0, the trainer and inferencer are moving to paddle.fluid.contrib", + file=sys.stderr) + from paddle.fluid.trainer import * + from paddle.fluid.inferencer import * import numpy as np import math import sys @@ -95,7 +105,7 @@ def train(use_cuda, train_program, params_dirname): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() def event_handler(event): - if isinstance(event, fluid.EndStepEvent): + if isinstance(event, EndStepEvent): outs = trainer.test( reader=test_reader, feed_order=['firstw', 'secondw', 'thirdw', 'forthw', 'nextw']) @@ -109,7 +119,7 @@ def train(use_cuda, train_program, params_dirname): if math.isnan(avg_cost): sys.exit("got NaN loss, training failed.") - trainer = fluid.Trainer( + trainer = Trainer( train_func=train_program, optimizer_func=optimizer_func, place=place) trainer.train( @@ -121,7 +131,7 @@ def train(use_cuda, train_program, params_dirname): def infer(use_cuda, inference_program, params_dirname=None): place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() - inferencer = fluid.Inferencer( + inferencer = Inferencer( infer_func=inference_program, param_path=params_dirname, place=place) # Setup inputs by creating 4 LoDTensors representing 4 words. 
Here each word diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index a198b25520f97ce23b9c1ebb9cd82fc458222d73..ecde407e6d85ea1bfc0181b4b60e095ea496fb1a 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -659,5 +659,28 @@ class TestLoadSliceVar(TranspilerTest): pserver2._slice_vars_and_attrs[idx][2].shape)) +class TestNCCL2Transpile(TranspilerTest): + def test_nccl2_transpile(self): + if fluid.core.is_compiled_with_cuda(): #test nccl2 only with cuda + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + self.net_conf() + + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile( + 0, + trainers="127.0.0.1:6174,127.0.0.1:6175", + current_endpoint="127.0.0.1:6174", + startup_program=startup) + print([op.type for op in startup.global_block().ops]) + self.assertEqual(startup.global_block().ops[-1].type, "gen_nccl_id") + self.assertIsNotNone(startup.global_block().vars.get("NCCLID")) + else: + pass + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 1fe7016924696b6e47d9cc35c137004f15a9b507..f474cdae2054531d44724e0e3e0e58a35fb8ddcd 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -758,6 +758,14 @@ class TestBook(unittest.TestCase): out = layers.expand(x, [1, 2]) print(str(program)) + def test_softshrink(self): + program = Program() + with program_guard(program): + input = layers.data(name="input", shape=[16], dtype="float32") + out = layers.softshrink(input, name='softshrink') + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 4153394c1da776d0a41e1415a09fa7d6f4b14d6d..37b9a9188ab44df81029ae6d9925ae21c1929cff 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -69,7 +69,7 @@ class TestOperator(unittest.TestCase): set(mul_op.attr_names), set([ "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", - "op_namescope" + "op_namescope", "op_callstack" ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py index 30cdfe4ad2c9892184862b70ff49417ce5a08516..b495b6699b5d02ca8c466c984820be5c497d626e 100644 --- a/python/paddle/fluid/trainer.py +++ b/python/paddle/fluid/trainer.py @@ -12,1247 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - -import contextlib -import os -import errno -import shutil -import six -import time - -from . import core -from . import data_feeder -from . import executor -from . import framework -from . import io -# optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module -from . import optimizer as opt_module -from . 
import parallel_executor -from .transpiler import distribute_transpiler - -__all__ = [ - 'Trainer', 'BeginEpochEvent', 'EndEpochEvent', 'BeginStepEvent', - 'EndStepEvent', 'CheckpointConfig' -] - - -class BeginEpochEvent(object): - """ - The begin of a training epoch. - - Args: - epoch_id(int): The current epoch ID. - """ - - def __init__(self, epoch_id): - self.epoch = epoch_id - - -class EndEpochEvent(object): - """ - The end of a training epoch. - - Args: - epoch_id(int): The current epoch ID. - """ - - def __init__(self, epoch_id): - self.epoch = epoch_id - - -class BeginStepEvent(object): - """ - The begin of a training epoch. - - Args: - epoch_id(int): The current epoch ID. - step_id(int): The current step ID. - """ - - def __init__(self, epoch_id, step_id): - self.epoch = epoch_id - self.step = step_id - self.fetch_metrics = True - """ - If fetch_metrics is true, the metrics will be fetched at the - EndStepEvent. Default is True. - """ - - -class EndStepEvent(object): - """ - The end of a training step. - - Args: - epoch_id(int): The current epoch ID. - step_id(int): The current step ID. - metrics(list): A list of fetched tensor. The order of this list is same - as the :code:`train_func` returns. - """ - - def __init__(self, epoch_id, step_id, metrics): - self.epoch = epoch_id - self.step = step_id - self.metrics = metrics - - -class CheckpointConfig(object): - """ - Parameter object for :code:`save_checkpoint` and - :code:`fluid.Trainer`. Used to configuration how to save checkpoint. - - Args: - checkpoint_dir(str): Directory path to save check point. Default is the - current directory. - - max_num_checkpoints(int): The max number of local check points. - epoch_interval(int): Every number of epoch to save check point. - step_interval(int): Every number of step to save check point. - - Examples: - >>> config = fluid.CheckpointConfig("./checkpoints") - >>> trainer = fluid.Trainer(train_func=train_program, - >>> place=place, - >>> optimizer_func=optimizer_func, - >>> checkpoint_config=config) - >>> trainer.train(...) - """ - - def __init__(self, - checkpoint_dir=None, - max_num_checkpoints=3, - epoch_interval=1, - step_interval=10): - - assert epoch_interval >= 1 - assert step_interval >= 1 - - self.checkpoint_dir = checkpoint_dir \ - if checkpoint_dir is not None else os.getcwd() - self.max_num_checkpoints = max_num_checkpoints - self.epoch_interval = epoch_interval - self.step_interval = step_interval - self.epoch_id = 0 - self.step_id = 0 - self.load_serial = None - self.pserver_id = None - self.lookup_table_name = None - - -def check_and_get_place(place): - """ - Check the type of place or get the default place - Args: - place(None|core.CUDAPlace|core.CPUPlace): the place that trainer will be executed on. - - Raises: - TypeError if the type mismatched. - - Returns: - the original place if it is not None. - if fluid is compiled with CUDA, returns CUDAPlace(0) by default. - Otherwise returns CPUPlace by default. - """ - if place is None: - if core.is_compiled_with_cuda(): - return core.CUDAPlace(0) - else: - return core.CPUPlace() - else: - if not isinstance(place, core.CUDAPlace) and not isinstance( - place, core.CPUPlace): - raise TypeError("Place should be either CUDAPlace or CPUPlace") - return place - - -class Trainer(object): - """ - A trainer wraps MultiGPU/MultiNode training loops and can be used to train a - simple neural network easily. - - This API takes a :code:`train_func`. A :code:`train_func` is a function that - return loss as it first return value. 
The reset value can be fetched by - EndStepEvent.metrics - - This API also takes a :code:`optimizer_func` that will return an optimizer - instance. - - For example, to train a MLP for MNIST dataset, the sample program is - - >>> import paddle.fluid as fluid - >>> - >>> def mlp(image, layer_sizes=[200, 100], activation="relu", num_classes=10): - >>> hidden = image - >>> for layer_size in layer_sizes: - >>> hidden = fluid.layers.fc(input=hidden, size=layer_size, act=activation) - >>> return fluid.layers.fc(input=hidden, size=num_classes, act="softmax") - >>> - >>> def train_mnist_mlp(): - >>> img = fluid.layers.data(name='image', shape=[784]) - >>> label = fluid.layers.data(name='label', shape=[1], dtype='int64') - >>> prediction = mlp(img) - >>> return fluid.layers.mean(fluid.layers.cross_entropy(prediction, label)) - >>> - >>> def optimizer(): - >>> return fluid.optimizer.Adam() - >>> - >>> trainer = Trainer(train_func=train_mnist_mlp, - >>> optimizer_func=optimizer, - >>> place=fluid.CUDAPlace(0), - >>> parallel=True) - >>> - >>> def train_callback(event): - >>> if isinstance(event, fluid.EndStepEvent): - >>> print "Epoch ID", event.epoch, "Step ID",\ - >>> event.step, "AvgLoss", event.metrics[0] - >>> elif isinstance(event, fluid.EndEpochEvent): - >>> trainer.save_params("./model_{0}".format(event.epoch)) - >>> - >>> trainer.train(num_epochs=100, event_handler=train_callback) - - For more example, please see :ref:`api_guide_high_level_api`. - - - Args: - train_func(callable): A function which will return loss. The loss must be - a scalar tensor. - optimizer_func(callable): A function that returns an Optimizer object. - place(CUDAPlace|CPUPlace): The device place of this trainer. If - :code:`parallel=True,` all CUDA Places will be used if :code:`place` - is a :code:`CUDAPlace`. - parallel(bool): True if use multiple devices. - checkpoint_config(CheckpointConfig): Configuration about how to save - checkpoints. - """ - - def __init__(self, - train_func, - optimizer_func, - param_path=None, - place=None, - parallel=False, - checkpoint_config=None): - self.__stop = False - self.parallel = parallel - - # config for checkpoint - # only chief worker will save variables - self.trainer_id = 0 - self.checkpoint_cfg = checkpoint_config - if self.checkpoint_cfg: - assert isinstance(self.checkpoint_cfg, CheckpointConfig) - serial = _get_latest_checkpoint_serial( - self.checkpoint_cfg.checkpoint_dir) - self.checkpoint_cfg.load_serial = serial if serial >= 0 else None - - self.scope = core.Scope() - - # 1. we need to generate a framework.Program by calling - # program_func. Reference: fluid.program_guard in - # test_word2vec.py - - self.startup_program = framework.Program() - self.train_program = framework.Program() - - with framework.program_guard(self.train_program, self.startup_program): - program_func_outs = train_func() - self.train_func_outputs = program_func_outs if isinstance( - program_func_outs, list) else [program_func_outs] - self.test_program = self.train_program.clone(for_test=True) - - # The first element of program_func_outs is loss. - loss = self.train_func_outputs[0] - - optimizer = optimizer_func() - if not isinstance(optimizer, opt_module.Optimizer): - raise TypeError( - "The optimizer should be an instance of Optimizer") - optimize_ops, params_grads = optimizer.minimize(loss) - - self.place = check_and_get_place(place) - - self._dist_transpile_if_necessary(optimize_ops, params_grads) - - # 2. 
move the default_main_program to self.program and run the - # default_startup program on an empty core.Scope() - # Run startup program - with self._prog_and_scope_guard(): - exe = executor.Executor(place) - exe.run(self.startup_program) - - if self.checkpoint_cfg and self.checkpoint_cfg.load_serial is not None: - self._load_checkpoint() - - if param_path and os.path.isdir(param_path): - with self._prog_and_scope_guard(): - # load params from param_path into scope - io.load_persistables( - executor=exe, - dirname=param_path, - main_program=self.startup_program) - - def _transpile_nccl2_dist(self): - # PADDLE_TRAINER_IPS - if "PADDLE_TRAINER_IPS" not in os.environ: - self.nccl_id_var = None - else: - self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) - port = os.getenv("PADDLE_PSERVER_PORT") - worker_ips = os.getenv("PADDLE_TRAINER_IPS") - worker_endpoints = [] - for ip in worker_ips.split(","): - worker_endpoints.append(':'.join([ip, port])) - self.num_trainers = len(worker_endpoints) - current_endpoint = os.getenv("PADDLE_CURRENT_IP") + ":" + port - worker_endpoints.remove(current_endpoint) - # TODO(wuyi): use self.nccl_id_var, self.num_trainers and self.trainer_id - # in ParallelExecutor to start - # distributed training using NCCL2 - self.nccl_id_var = self.startup_program.global_block().create_var( - name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) - self.startup_program.global_block().append_op( - type="gen_nccl_id", - inputs={}, - outputs={"NCCLID": self.nccl_id_var}, - attrs={ - "endpoint": current_endpoint, - "endpoint_list": worker_endpoints, - "trainer_id": self.trainer_id - }) - - def _dist_transpile_if_necessary(self, optimize_ops, params_grads): - self._transpile_nccl2_dist() - if self.nccl_id_var != None: - return - - if "PADDLE_TRAINING_ROLE" not in os.environ: - return - - # the port of all pservers, needed by both trainer and pserver - port = os.getenv("PADDLE_PSERVER_PORT", "6174") - # comma separated ips of all pservers, needed by trainer and - # pserver - pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "") - eplist = [] - for ip in pserver_ips.split(","): - eplist.append(':'.join([ip, port])) - pserver_endpoints = ",".join(eplist) - # total number of workers/trainers in the job, needed by - # trainer and pserver - trainers = int(os.getenv("PADDLE_TRAINERS")) - # the IP of the local machine, needed by pserver only - current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port - # the unique trainer id, starting from 0, needed by trainer - # only - self.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) - - # the role, should be either PSERVER or TRAINER - training_role = os.getenv("PADDLE_TRAINING_ROLE") - with self._prog_and_scope_guard(): - t = distribute_transpiler.DistributeTranspiler() - t.transpile( - self.trainer_id, pservers=pserver_endpoints, trainers=trainers) - if training_role == "PSERVER": - if self.checkpoint_cfg: - pserver_id = eplist.index(current_endpoint) - self.checkpoint_cfg.pserver_id = pserver_id - if t.has_distributed_lookup_table: - self.checkpoint_cfg.lookup_table_name = t.table_name - - self.train_program = t.get_pserver_program(current_endpoint) - self.startup_program = t.get_startup_program(current_endpoint, - self.train_program) - elif training_role == "TRAINER": - self.train_program = t.get_trainer_program() - else: - raise ValueError( - 'TRAINING_ROLE environment variable must be either TRAINER or PSERVER' - ) - - def stop(self): - """ - stop training - """ - self.__stop = True - - def train(self, num_epochs, 
event_handler, reader=None, feed_order=None): - """ - Start the train loop to train the model. - - Args: - num_epochs(int): The number of epoch. An epoch will process all data in reader - event_handler(callable): The event handler. A function with type (ev:Event)->void - reader(callable): A reader creator object. See also - :ref:`api_guide_python_reader` . - feed_order(list): Feeding order of reader. None will following the defining - order in program - - Returns: - None - """ - training_role = os.getenv("PADDLE_TRAINING_ROLE", "") - if training_role == "PSERVER": - with self._prog_and_scope_guard(): - exe = executor.Executor(self.place) - exe.run() - return - if self.parallel: - self._train_by_parallel_executor(num_epochs, event_handler, reader, - feed_order) - else: - self._train_by_executor(num_epochs, event_handler, reader, - feed_order) - - def test(self, reader, feed_order): - """ - Test the model on given test data - - Args: - reader(callable): The reader that yields test data. - feed_order(list): Feeding order of reader. None will following the - defining order in program - """ - - return self._test_by_executor(reader, feed_order, - self.train_func_outputs) - - def save_params(self, param_path): - """ - Save all parameters into :code:`param_path`. - - Args: - param_path(str): The path to save parameters. - - Returns: - None - """ - with self._prog_and_scope_guard(): - exe = executor.Executor(self.place) - io.save_persistables(exe, dirname=param_path) - - def save_inference_model(self, param_path, feeded_var_names, - target_var_indexes): - """ - Save model for cpp inference into :code:`param_path`. - - Args: - param_path(str): The path to save parameters. - feeded_var_names(list(str)): The name of the vars that you - need to feed in before run program. - target_var_indexes(list(int)): the index of target var that - you need to return in trainer.train_func. - Returns: - None - """ - with self._prog_and_scope_guard(): - exe = executor.Executor(self.place) - target_vars = [ - self.train_func_outputs[index] for index in target_var_indexes - ] - io.save_inference_model(param_path, feeded_var_names, target_vars, - exe) - - @contextlib.contextmanager - def _prog_and_scope_guard(self): - with framework.program_guard( - main_program=self.train_program, - startup_program=self.startup_program): - with executor.scope_guard(self.scope): - yield - - def _train_by_executor(self, num_epochs, event_handler, reader, feed_order): - """ - Train by Executor and single device. 
- - Args: - num_epochs: - event_handler: - reader: - feed_order: - - Returns: - - """ - with self._prog_and_scope_guard(): - feed_var_list = build_feed_var_list(self.train_program, feed_order) - feeder = data_feeder.DataFeeder( - feed_list=feed_var_list, place=self.place) - exe = executor.Executor(self.place) - reader = feeder.decorate_reader(reader, multi_devices=False) - self._train_by_any_executor(event_handler, exe, num_epochs, reader) - - def _train_by_any_executor(self, event_handler, exe, num_epochs, reader): - if self.checkpoint_cfg: - epochs = [ - epoch_id for epoch_id in range(num_epochs) - if epoch_id >= self.checkpoint_cfg.epoch_id - ] - else: - epochs = [epoch_id for epoch_id in range(num_epochs)] - - for epoch_id in epochs: - event_handler(BeginEpochEvent(epoch_id)) - for step_id, data in enumerate(reader()): - if self.__stop: - if self.checkpoint_cfg: - self._clean_checkpoint() - return - - if self.checkpoint_cfg and self.checkpoint_cfg.load_serial \ - and self.checkpoint_cfg.step_id >= step_id and self.checkpoint_cfg.epoch_id == epoch_id: - continue - - begin_event = BeginStepEvent(epoch_id, step_id) - event_handler(begin_event) - if begin_event.fetch_metrics: - metrics = exe.run(feed=data, - fetch_list=[ - var.name - for var in self.train_func_outputs - ]) - else: - metrics = exe.run(feed=data, fetch_list=[]) - - if self.checkpoint_cfg: - self._save_checkpoint(epoch_id, step_id) - event_handler(EndStepEvent(epoch_id, step_id, metrics)) - event_handler(EndEpochEvent(epoch_id)) - if self.checkpoint_cfg: - self._clean_checkpoint() - - def _test_by_executor(self, reader, feed_order, fetch_list): - with executor.scope_guard(self.scope): - feed_var_list = build_feed_var_list(self.test_program, feed_order) - feeder = data_feeder.DataFeeder( - feed_list=feed_var_list, place=self.place) - exe = executor.Executor(self.place) - accumulated = len(fetch_list) * [0] - count = 0 - for data in reader(): - outs = exe.run(program=self.test_program, - feed=feeder.feed(data), - fetch_list=fetch_list) - accumulated = [x[0] + x[1][0] for x in zip(accumulated, outs)] - count += 1 - - return [x / count for x in accumulated] - - def _train_by_parallel_executor(self, num_epochs, event_handler, reader, - feed_order): - with self._prog_and_scope_guard(): - pe = self._get_or_create_parallel_executor() - feed_var_list = build_feed_var_list(self.train_program, feed_order) - feeder = data_feeder.DataFeeder( - feed_list=feed_var_list, place=self.place) - reader = feeder.decorate_reader(reader, multi_devices=True) - self._train_by_any_executor(event_handler, pe, num_epochs, reader) - - def _get_parallel_executor(self): - return getattr(self, 'parallel_executor', None) - - def _get_or_create_parallel_executor(self): - if self._get_parallel_executor() is None: - self.parallel_executor = parallel_executor.ParallelExecutor( - use_cuda=isinstance(self.place, core.CUDAPlace), - loss_name=self.train_func_outputs[0].name) - return self._get_parallel_executor() - - def _clean_checkpoint(self): - assert self.checkpoint_cfg - clean_checkpoint(checkpoint_dir=self.checkpoint_cfg.checkpoint_dir) - - def _get_checkpoint_load_args(self): - """ - epoch_id and step_id are runtime arguments, they are not variables, will load them independently. - """ - return ["epoch_id", "step_id"] - - def _get_checkpoint_save_args(self, epoch_id, step_id): - """ - epoch_id and step_id are runtime arguments, they are not variables, will save them independently. 
- """ - trainer_args = {} - trainer_args["epoch_id"] = epoch_id - trainer_args["step_id"] = step_id - return trainer_args - - def _save_checkpoint(self, epoch_id, step_id): - assert self.checkpoint_cfg - - if epoch_id % self.checkpoint_cfg.epoch_interval == 0 \ - and step_id % self.checkpoint_cfg.step_interval == 0: - exe = executor.Executor(self.place) - save_checkpoint( - executor=exe, - checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, - trainer_id=self.trainer_id, - trainer_args=self._get_checkpoint_save_args(epoch_id, step_id), - main_program=self.train_program, - max_num_checkpoints=self.checkpoint_cfg.max_num_checkpoints) - - def _load_checkpoint(self): - with self._prog_and_scope_guard(): - exe = executor.Executor(self.place) - load_checkpoint( - executor=exe, - checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, - main_program=self.startup_program) - - if not self.checkpoint_cfg.pserver_id: - load_trainer_args = self._get_checkpoint_load_args() - trainer_args = load_checkpoint( - executor=exe, - checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, - main_program=self.startup_program, - role_id=self.trainer_id, - is_trainer=True, - load_trainer_args=load_trainer_args) - - if len(trainer_args) != 2: - raise ValueError( - "the return trainer_args length do not equal _get_checkpoint_load_args" - ) - self.checkpoint_cfg.epoch_id = int(trainer_args[0]) - self.checkpoint_cfg.step_id = int(trainer_args[1]) - else: - if self.checkpoint_cfg.lookup_table_name: - load_checkpoint( - executor=exe, - checkpoint_dir=self.checkpoint_cfg.checkpoint_dir, - main_program=self.startup_program, - role_id=self.checkpoint_cfg.pserver_id, - is_trainer=False, - load_trainer_args=None, - load_lookup_table=self.checkpoint_cfg.lookup_table_name) - - -def build_feed_var_list(program, feed_order): - if not isinstance(program, framework.Program): - raise TypeError("The 'program' should be an object of Program") - - if isinstance(feed_order, list): - feed_var_list = [ - program.global_block().var(var_name) for var_name in feed_order - ] - else: - if not isinstance(feed_order, dict): - raise TypeError( - "The 'feed_order' should be either None, list or dict.") - if not sorted(feed_order.values()) == list(range(len(feed_order))): - raise ValueError( - "The values of 'feed_order' should be a permutation of [0, len(feed_order))" - ) - sorted_pair_list = sorted( - six.iteritems(feed_order), key=lambda item: item[1]) - feed_var_list = [ - program.global_block().var(pair[0]) for pair in sorted_pair_list - ] - return feed_var_list - - -# move Checkpoint APIs from io.py to trainer.py, make all of them are private. -SUCCESS_MARK_FILENAME = "_SUCCESS" -CHECKPOINT_PREFIX = "checkpoint" -MODEL_DIR = "__model__" -LOOKUP_TABLE_DIR = "__lookup_table__" -TRAINER_PREFIX = "trainer" -CHECKPOINT_SEPARATOR = "_" - - -def save_checkpoint(executor, - checkpoint_dir, - trainer_id, - main_program, - trainer_args=None, - max_num_checkpoints=3, - lookup_table=None, - pserver_endpoints=None): - """ - This function filters out all checkpoint variables from the give - main_program and then saves these variables to the `checkpoint_dir` - directory. - - In the training precess, we generally save a checkpoint in each - iteration. So there might be a lot of checkpoints in the - `checkpoint_dir`. To avoid them taking too much disk space, the - `max_num_checkpoints` are introduced to limit the total number of - checkpoints. If the number of existing checkpints is greater than - the `max_num_checkpoints`, oldest ones will be scroll deleted. 
- - A variable is a checkpoint variable and will be saved if it meets - all following conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for save checkpoint. - checkpoint_dir(str): The folder where to save checkpoints. - trainer_id(int): currect trainer id, if id is equal to 0, the trainer - is chief. - trainer_args(dict|None): Current training arguments. Such as 'epoch_id' - and 'step_id'. - Defaut: None - main_program(Program): The program whose checkpoint variables will - be saved. - max_num_checkpoints(int): The max number of total number of existing - checkpoints. - Default: 3 - lookup_table(string|None): the lookup table name, when use distribute - lookup table, we can get lookup table name by DistributeTranspiler. - table_name - pserver_endpoints(list|None): the parameter server ip:port list. - when use distribute lookup table, we can get pserver_endpoints by - distribute arguments. - - Returns: - None - - Raises: - ValueError: If `checkpoint_dir` is None. - AssertionError: If `trainer_args` is not a dict. - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - path = "./checkpoints" - prog = fluid.default_main_program() - trainer_args = {"epoch_id": 200, - "step_id": 20} # just an example - table_name = "share_w" - ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] - - save_checkpoint(executor=exe, - checkpoint_dir=path, - trainer_id=0, - trainer_args=trainer_args, - main_program=prog, - max_num_checkpoints=3, - lookup_table=table_name, - pserver_endpoints = ps_endpoints) - """ - if checkpoint_dir is None: - raise ValueError("'checkpoint_dir' should not be None") - - if main_program is None: - raise ValueError('main_program should not be None.') - - if trainer_args: - assert isinstance(trainer_args, dict) - - is_chief = trainer_id == 0 - - _make_chekcpoint_dirs(checkpoint_dir) - serial = _get_latest_checkpoint_serial(checkpoint_dir) + 1 - cur_dir = _get_serial_dir(checkpoint_dir, serial) - - _save_trainer_args(cur_dir, trainer_id, trainer_args) - - if is_chief: - _save_persist_vars_without_grad(executor, cur_dir, main_program) - - if is_chief and lookup_table and pserver_endpoints: - _save_pserver_vars_by_notify(executor, cur_dir, lookup_table, - pserver_endpoints) - - _scroll_delete(checkpoint_dir, max_num_checkpoints) - - -def load_checkpoint(executor, - checkpoint_dir, - main_program, - role_id=0, - is_trainer=True, - load_trainer_args=None, - load_lookup_table=None): - """ - This function filters out all checkpoint variables from the give - main_program and then try to load these variables from the - `checkpoint_dir` directory. - - In the training precess, we generally save a checkpoint in each - iteration. So there are more than one checkpoint in the - `checkpoint_dir` (each checkpoint has its own sub folder), use - `serial` to specify which serial of checkpoint you would like to - load. - - A variable is a checkpoint variable and will be loaded if it meets - all following conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for loading checkpoint. - checkpoint_dir(str): The folder where all checkpoints are. - serial(int): The serial of checkpoint you would like to load. 
- main_program(Program): The program whose checkpoint variables will - be loaded. - role_id(int): the trainer id or the parameter server id. - is_trainer(bool): trainer is True and parameter server is False. - load_trainer_args(list|None): list about load trainer args. - load_lookup_table(str|None): the lookup table name - - Returns: - None - - Raises: - ValueError: If `checkpoint_dir` is None. - ValueError: If `main_program` is None. - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - path = "./checkpoints" - prog = fluid.default_main_program() - load_checkpoint(executor=exe, checkpoint_dir=path, - serial=9, main_program=prog) - - # In this example, `load_checkpoint` function - # will first filters out all checkpoint variables in the default - # main program, and then try to load these variables form the - # folder "./checkpoints/checkpoint_9/__model__". - """ - - if checkpoint_dir is None: - raise ValueError("'checkpoint_dir' should not be None") - - serial = _get_latest_checkpoint_serial(checkpoint_dir) - - # there are nothing need to be loaded - if serial is None or serial < 0: - return - - if main_program is None: - raise ValueError('main_program should not be None.') - - if is_trainer and load_trainer_args is None: - cur_dir = _get_serial_dir(checkpoint_dir, serial) - _load_persist_vars_without_grad(executor, cur_dir, main_program, True) - return - - if is_trainer and load_trainer_args: - return _load_trainer_args(checkpoint_dir, serial, role_id, - load_trainer_args) - - if not is_trainer and load_lookup_table: - _load_lookup_table_vars(executor, checkpoint_dir, main_program, role_id, - load_lookup_table) - - -def clean_checkpoint(checkpoint_dir, delete_dir=False): - """ - clean the checkpoint dir, when the train exits normally, - the trainer will call clean_checkpoint to delete checkpoint directory saved before. - delete_dir only works when the directory is empty, otherwise, OSError is raised. - - : param checkpoint_dir - : param delete_dir - """ - - if checkpoint_dir is None: - raise ValueError("'checkpoint_dir' should not be None") - _scroll_delete(checkpoint_dir, max_num_checkpoints=0) - - if delete_dir and not os.listdir(checkpoint_dir): - os.rmdir(checkpoint_dir) - - -def _load_persist_vars_without_grad(executor, - dirname, - program, - has_model_dir=False): - """ - This function filters out all checkpoint variables from the give - program and then trys to load these variables from the given directory. - - A variable is a checkpoint variable if it meets all following - conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for loading variables. - dirname(str): The directory path. - program(Program): The program whose checkpoint variables will - be loaded. - has_model_dir(bool): if True, the function loads variables - from a sub directory named '__model__'. - Default: False - - Returns: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - _load_persist_vars_without_grad(executor=exe, - dirname=param_path, program=prog, has_model_dir=True) - - # In this example, `_load_persist_vars_without_grad` function - # will first filters out all checkpoint variables in the default - # main program, and then trys to load these variables form the - # folder "./my_paddle_model/__model__". 
- """ - - if has_model_dir: - dirname = _get_model_dir(dirname) - - io.load_vars( - executor, - dirname=dirname, - main_program=program, - predicate=_is_checkpoint_var, - filename=None) - - -def _load_lookup_table_vars(executor, dirname, program, pserver_id, table_name): - """ - The parameter server will load lookup table's local file in - selectedrows variable. - - Args: - executor(Executor): The executor to run for loading persistable variables - dirname(str): The directory path - main_program(Program): Find the variable named table_name in main_program - pserver_id(int): the serial number in pserver_endpoints list - table_name(str): lookup table name - - Returns: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - dirname = "./checkpoints/checkpoint_9/" - prog = fluid.default_main_program() - pserver_id = 1 - table_name = "share_w" - _load_lookup_table_vars(executor=exe, - dirname=dirname, program=prog, pserver_id=pserver_id, - table_name=table_name) - """ - - for var in program.list_vars(): - if var.name == table_name: - lookup_table_var = var - break - - assert lookup_table_var is not None - - lookup_table_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) - table_file = table_name + CHECKPOINT_SEPARATOR + str(pserver_id) - - load_prog = framework.Program() - load_block = load_prog.global_block() - - load_block.append_op( - type='load', - inputs={}, - outputs={'Out': [lookup_table_var]}, - attrs={'file_path': os.path.join(lookup_table_dir, table_file)}) - - executor.run(load_prog) - - -def _save_persist_vars_without_grad(executor, dirname, program): - """ - This function filters out all checkpoint variables from the give - program and then save these variables to a sub-folder '__model__' of - the given directory. - - A variable is a checkpoint variable if it meets all following - conditions: - 1. It's persistable. - 2. It's type is not FEED_MINIBATCH nor FETCH_LIST nor RAW. - 3. It's name contains no "@GRAD" nor ".trainer_" nor ".block". - - Args: - executor(Executor): The executor to run for saving variables. - dirname(str): The directory path. - program(Program): The program whose checkpoint variables will - be saved. - - Returns: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - _save_persist_vars_without_grad(executor=exe, - dirname=param_path, program=prog) - - # In this example, `_save_persist_vars_without_grad` function - # will first filters out all checkpoint variables in the default - # main program, and then saves these variables to the folder - # "./my_paddle_model/__model__". - """ - cur_dir = _get_model_dir(dirname) - io.save_vars( - executor, - dirname=cur_dir, - main_program=program, - vars=None, - predicate=_is_checkpoint_var, - filename=None) - _write_success(cur_dir) - - -def _save_pserver_vars_by_notify(executor, dirname, lookup_table, - ps_endpoint_list): - """ - This function will send checkpoint notify message from Trainer 0 - to all the pservers. - The checkpoint notify message contains lookup table name, - the absolute path on pserver to save lookup_table. - - Args: - executor(Executor): The executor to run for send checkpoint notify. - dirname(str): The folder where to save checkpoints. - lookup_table(string): the lookup table name, when use distribute - lookup table, we can get lookup table name by DistributeTranspiler. - table_name - ps_endpoint_list(list): the parameter server ip:port list. 
- when use distribute lookup table, we can get ps_endpoint_list by - distribute arguments. - Return: - None - - Examples: - .. code-block:: python - - exe = fluid.Executor(fluid.CPUPlace()) - param_path = "./my_paddle_model" - prog = fluid.default_main_program() - table_name = "share_w" - ps_endpoints = ["127.0.0.1:6000","127.0.0.1:6001"] - - _save_pserver_vars_by_notify(executor=exe, - dirname=param_path, lookup_table=table_name, - ps_endpoint_list=ps_endpoints) - """ - cur_dir = _get_lookuptable_dir(dirname) - - checkpoint_notify_program = framework.Program() - checkpoint_notify_block = checkpoint_notify_program.global_block() - - attrs = {} - attrs['epmap'] = ps_endpoint_list - attrs['dir'] = cur_dir - attrs['lookup_table'] = lookup_table - - checkpoint_notify_block.append_op( - type='checkpoint_notify', inputs={}, outputs={}, attrs=attrs) - executor.run(checkpoint_notify_program) - - -def _save_trainer_args(dirname, trainer_id, trainer_args): - assert isinstance(trainer_args, dict) - - cur_dir = _get_trainer_dir(dirname, trainer_id) - - for name, value in six.iteritems(trainer_args): - args_file = os.path.join(cur_dir, name) - with open(args_file, 'w') as f: - f.write(str(value)) - _write_success(cur_dir) - - -def _load_trainer_args(checkpoint_dir, serial, trainer_id, trainer_args): - """ - trainer will load some args from it's independent directory, - such as epoch_id and step_id. - - Args: - checkpoint_dir(str): The folder where all checkpoints are. - serial(int): The serial of checkpoint you would like to load. - trainer_id(int): current trainer id. - trainer_args(list): list about load trainer args - Return: - None - - Examples: - .. code-block:: python - - param_path = "./checkpoint/" - serial = 7 - trainer_id = 2 - trainer_args = ["epoch_id", "step_id"] - - _load_trainer_args(checkpoint_dir=param_path, serial=serial, - trainer_id=trainer_id, trainer_args=trainer_args) - """ - assert isinstance(trainer_args, list) - - cur_dir = _get_serial_dir(checkpoint_dir, serial) - cur_dir = _get_trainer_dir(cur_dir, trainer_id) - - ret_values = [] - - for arg in trainer_args: - cur_file = os.path.join(cur_dir, arg) - with open(cur_file, 'r') as f: - contents = f.read() - ret_values.append(contents.strip()) - return ret_values - - -def _is_checkpoint_var(var): - """ - the checkpoint will not save or load all the variables. - var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. - - : param var(Variable) - """ - if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.RAW: - return False - # @GRAD are named for gradient variables, checkpoint will not save it. - if "@GRAD" in var.name: - return False - # .trainer_ are named for distribute train variables, checkpoint will not save it. - if ".trainer_" in var.name: - return False - - # .block is named for distribute train variables, checkpoint will not save it. - if ".block" in var.name: - return False - - return var.persistable - - -def _make_chekcpoint_dirs(dirs): - """ - _make_chekcpoint_dirs will makdir local directory directly, when the directory is exist, it will igore it. 
- """ - assert dirs is not None - - if os.path.isfile(dirs): - raise OSError(errno.ENOTDIR, "dirs path shoule be a Directory.", dirs) - - if not os.path.isdir(dirs): - try: - os.makedirs(dirs) - except OSError as err: - if err.errno != errno.EEXIST: - raise err - - -def _get_dir_serial(dirname): - _, serial = dirname.split(CHECKPOINT_SEPARATOR) - - try: - serial_num = int(serial) - except ValueError: - serial_num = -1 - return serial_num - - -def _get_serial_dir(dirname, serial): - serial_folder = CHECKPOINT_PREFIX + CHECKPOINT_SEPARATOR + str(serial) - serial_dir = os.path.join(dirname, serial_folder) - _make_chekcpoint_dirs(serial_dir) - - return serial_dir - - -def _get_model_dir(dirname): - model_dir = os.path.join(dirname, MODEL_DIR) - _make_chekcpoint_dirs(model_dir) - return model_dir - - -def _get_lookuptable_dir(dirname): - lookuptable_dir = os.path.join(dirname, LOOKUP_TABLE_DIR) - _make_chekcpoint_dirs(lookuptable_dir) - return lookuptable_dir - - -def _get_trainer_dir(dirname, trainer_id): - trainer_folder = TRAINER_PREFIX + CHECKPOINT_SEPARATOR + str(trainer_id) - trainer_dir = os.path.join(dirname, trainer_folder) - _make_chekcpoint_dirs(trainer_dir) - return trainer_dir - - -def _scroll_delete(dirname, max_num_checkpoints=3): - dirs = os.listdir(dirname) - serial_map = {} - for serial in dirs: - serial_num = _get_dir_serial(serial) - serial_map[serial_num] = serial - - if len(list(serial_map.keys())) <= max_num_checkpoints: - return - - serials = list(serial_map.keys()) - serials.sort(reverse=True) - serials = serials[max_num_checkpoints:] - for serial in serials: - cur_dir = _get_serial_dir(dirname, serial) - try: - shutil.rmtree(cur_dir) - except OSError as err: - if err.errno != errno.ENOENT: - raise err - - -def _write_success(dirname): - """ - write an empty file named "_SUCCESS" in checkpoint dir, indicate this checkpoint is correct. - - : param dirname - """ - success_file = os.path.join(dirname, SUCCESS_MARK_FILENAME) - with open(success_file, 'a') as f: - now = time.ctime() - f.write(now) - - -def _get_latest_checkpoint_serial(checkpoint_dir): - """ - get the latest file in checkpoint directory, the _SUCCESS file must exist in the directory - - : param checkpoint_dir - """ - if not checkpoint_dir: - return -1 - - def has_success(checkpoint_dir, cur_dir): - """ - is _SUCCESS in this dir - """ - - serial = _get_dir_serial(cur_dir) - if serial == -1 or not os.path.isdir( - os.path.join(checkpoint_dir, cur_dir)): - return -1 - - success_path = os.path.join( - _get_serial_dir(checkpoint_dir, serial), MODEL_DIR, - SUCCESS_MARK_FILENAME) - if os.path.isfile(success_path): - return serial - - if not os.path.isdir(checkpoint_dir): - return -1 - - current_dir = -1 - dirs = os.listdir(checkpoint_dir) - for cur_dir in dirs: - success_num = has_success(checkpoint_dir, cur_dir) - if success_num > current_dir: - current_dir = success_num - return current_dir +# NOTE: Trainer is moved into fluid.contrib.trainer. 
+__all__ = []
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 3f8c7b844a9fdc8404560ba4c78f9d328af2852a..43071def7a906e585909e50e4c0c52c56d981cde 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -136,6 +136,8 @@ class DistributeTranspilerConfig(object):
     slice_var_up = True
     split_method = None
     min_block_size = 8192
+    # supported modes: pserver, nccl2
+    mode = "pserver"
     print_log = False


@@ -144,27 +146,30 @@ class DistributeTranspiler(object):
     **DistributeTranspiler**

     Convert the fluid program to distributed data-parallelism programs.
+    Supports two modes: pserver mode and nccl2 mode.

-    The main_program will be transformed to use a remote parameter server
-    to do parameter optimization. And the optimization graph will be put
-    into a parameter server program.
+    In pserver mode, the main_program will be transformed to use a remote
+    parameter server to do parameter optimization, and the optimization
+    graph will be put into a parameter server program.
+
+    In nccl2 mode, the transpiler will append an NCCL_ID broadcasting
+    op in startup_program to share the NCCL_ID across the job nodes.
+    After transpiling in nccl2 mode, you ***must*** pass the trainer_id
+    and num_trainers arguments to ParallelExecutor to enable NCCL2
+    distributed mode.

     Examples:
         .. code-block:: python

-            # Define your model before these codes.
-            port = os.getenv("PADDLE_PSERVER_PORT", "6174")
-            pserver_ips = os.getenv("PADDLE_PSERVER_IPS", "")
-            eplist = []
-            for ip in pserver_ips.split(","):
-                eplist.append(':'.join([ip, port]))
-            pserver_endpoints = ",".join(eplist)
-            trainers = int(os.getenv("PADDLE_TRAINERS"))
-            current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
-            trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
+            # for pserver mode
+            pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
+            trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174"
+            current_endpoint = "192.168.0.1:6174"
+            trainer_id = 0
+            trainers = 4
             role = os.getenv("PADDLE_TRAINING_ROLE")

-            t = distribute_transpiler.DistributeTranspiler()
+            t = fluid.DistributeTranspiler()
             t.transpile(
                 trainer_id, pservers=pserver_endpoints, trainers=trainers)
             if role == "PSERVER":
@@ -173,6 +178,18 @@ class DistributeTranspiler(object):
                 pserver_program)
             elif role == "TRAINER":
                 trainer_program = t.get_trainer_program()
+
+            # for nccl2 mode
+            config = fluid.DistributeTranspilerConfig()
+            config.mode = "nccl2"
+            t = fluid.DistributeTranspiler(config=config)
+            t.transpile(trainer_id, trainers=trainer_endpoints, current_endpoint=current_endpoint)
+            exe = fluid.ParallelExecutor(
+                use_cuda,
+                loss_name=loss_var.name,
+                num_trainers=len(trainer_endpoints.split(",")),
+                trainer_id=trainer_id
+            )
     """

     def __init__(self, config=None):
@@ -190,13 +207,41 @@ class DistributeTranspiler(object):
         assert (self.config.min_block_size >= 8192)
         assert (self.config.split_method.__bases__[0] == PSDispatcher)

+    def _transpile_nccl2(self,
+                         trainer_id,
+                         trainers,
+                         current_endpoint,
+                         startup_program=None):
+        if not startup_program:
+            startup_program = default_startup_program()
+        if trainer_id >= 0:
+            worker_endpoints = trainers.split(",")
+            # send NCCL_ID to others or recv from trainer 0
+            worker_endpoints.remove(current_endpoint)
+
+            nccl_id_var = startup_program.global_block().create_var(
+                name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW)
+            startup_program.global_block().append_op(
+                type="gen_nccl_id",
+                inputs={},
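+                # On trainer 0 this op generates the NCCL ID and sends it to
+                # every endpoint in endpoint_list; on the other trainers it
+                # receives the ID from trainer 0 into the NCCLID variable.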
outputs={"NCCLID": nccl_id_var}, + attrs={ + "endpoint": current_endpoint, + "endpoint_list": worker_endpoints, + "trainer_id": trainer_id + }) + return nccl_id_var + else: + raise ValueError("must set trainer_id > 0") + def transpile(self, trainer_id, program=None, pservers="127.0.0.1:6174", trainers=1, sync_mode=True, - startup_program=None): + startup_program=None, + current_endpoint="127.0.0.1:6174"): """ Run the transpiler. @@ -207,10 +252,15 @@ class DistributeTranspiler(object): default is fluid.default_main_program(). pservers (str): comma separated ip:port string for the pserver list. - trainers (int): number of trainers in the distributed job. + trainers (int|str): in pserver mode this is the number of + trainers, in nccl2 mode this is a string of trainer + endpoints. sync_mode (bool): Do sync training or not, default is True. startup_program (Program|None): startup_program to transpile, default is fluid.default_main_program(). + current_endpoint (str): need pass current endpoint when + transpile as nccl2 distributed mode. In pserver mode + this argument is not used. """ if program is None: program = default_main_program() @@ -220,6 +270,15 @@ class DistributeTranspiler(object): self.startup_program = startup_program self.origin_startup_program = self.startup_program.clone() + if self.config.mode == "nccl2": + assert (isinstance(trainers, str)) + self._transpile_nccl2( + trainer_id, + trainers, + current_endpoint, + startup_program=startup_program) + return + self.trainer_num = trainers self.sync_mode = sync_mode self.trainer_id = trainer_id @@ -1082,7 +1141,7 @@ to transpile() call.") if self.sync_mode else [] }, attrs={ - "sync_mode": False, + "sync_mode": self.sync_mode, "epmap": pserver_endpoints, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE, OP_ROLE_VAR_ATTR_NAME: [