diff --git a/CMakeLists.txt b/CMakeLists.txt index 4117f077219d3b8fc097631073eafa748ff918bc..c23be24c6c8971746f0e7fce7209da3dd3b667ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,8 +61,10 @@ option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF) +option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debugging." OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) +option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocol" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) @@ -131,6 +133,10 @@ if (NOT DEFINED WITH_MKLDNN) set(WITH_MKLDNN OFF) endif() endif() + +if (REPLACE_ENFORCE_GLOG) + add_definitions("-DREPLACE_ENFORCE_GLOG") +endif() ######################################################################################## include(external/mklml) # download mklml package @@ -153,12 +159,24 @@ include(external/cares) if(WITH_DISTRIBUTE) if(WITH_GRPC) include(external/grpc) + message(STATUS "Use grpc framework.") else() + message(STATUS "Use brpc framework.") include(external/leveldb) include(external/brpc) endif() endif() +if(WITH_BRPC_RDMA) + message(STATUS "Use brpc with rdma.") + if(WITH_GRPC) + message(FATAL_ERROR "Can't use grpc with brpc rdma.") + endif() + if(NOT WITH_DISTRIBUTE) + message(FATAL_ERROR "Can't use brpc rdma in a non-distributed build.") + endif() +endif() + include(external/snappy) # download snappy include(external/snappystream) include(external/threadpool) diff --git a/README.md b/README.md index 8d89c6b1ec9e4aefbd64328dedb4e8c7cc50c21b..63abca069a6629ac59739224ded9cd9f06207d0a 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,6 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) -[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 6a8b15a6b60a2e5635dc78fc877f0c8da9a2a998..e4af34d10ed92c501dd805addb62747c91c00978 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -174,3 +174,7 @@ endif(WITH_GOLANG) if(WITH_GRPC) add_definitions(-DPADDLE_WITH_GRPC) endif(WITH_GRPC) + +if(WITH_BRPC_RDMA) + add_definitions(-DPADDLE_WITH_BRPC_RDMA) +endif(WITH_BRPC_RDMA) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 8e2c913b2caae0c4eeb844d2b51a8975e81c1592..30b227b6452abf44171a1a4e04569e66b16e67a4 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -14,6 +14,15 @@ INCLUDE(ExternalProject) +find_library(SSL_LIBRARY NAMES ssl) +ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY}) + +find_library(CRYPTO_LIBRARY NAMES crypto) +ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY}) + + SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) SET(BRPC_INCLUDE_DIR "${BRPC_INSTALL_DIR}/include" CACHE PATH "brpc include directory." FORCE) @@ -22,14 +31,14 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args -set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf") +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib") # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/brpc/brpc" - GIT_TAG "6d153dd7ff00f960ae6895c9c5fff0ce9f07aff2" + GIT_REPOSITORY "https://github.com/gongweibao/brpc" + GIT_TAG "7dc04defad1fd4173aae170c3fcbde131b65155a" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -42,6 +51,8 @@ ExternalProject_Add( -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_PREFIX_PATH=${prefix_path} -DBRPC_WITH_GLOG=ON + -DIOBUF_WITH_HUGE_BLOCK=ON + -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA} ${EXTERNAL_OPTIONAL_ARGS} LIST_SEPARATOR | CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${BRPC_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) -ADD_DEPENDENCIES(extern_brpc protobuf leveldb gflags glog gtest snappy) +ADD_DEPENDENCIES(extern_brpc protobuf ssl crypto leveldb gflags glog gtest snappy) ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) ADD_DEPENDENCIES(brpc extern_brpc) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9c42044ec163e9db1dd21d5c3915b010c30fdf1c..fd7fc16bff5651f022b484623243048fbd225b5a 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID) set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt") endif(NOT APPLE AND NOT ANDROID) +set_property(GLOBAL PROPERTY FLUID_MODULES "") +# find_fluid_modules collects all fluid modules, used for the paddle fluid +# static library and for building inference libs +function(find_fluid_modules TARGET_NAME) + get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) + string(FIND "${__target_path}" "fluid" pos) + if(pos GREATER 1) + get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) + set(fluid_modules ${fluid_modules} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") + endif() +endfunction(find_fluid_modules) + function(merge_static_libs TARGET_NAME) set(libs ${ARGN}) list(REMOVE_DUPLICATES libs) @@ -250,6 +264,7 @@ function(cc_test TARGET_NAME) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (${cc_test_SERIAL}) set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) +
set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) endif() endif() endfunction(cc_test) @@ -314,6 +329,7 @@ function(nv_test TARGET_NAME) add_test(${TARGET_NAME} ${TARGET_NAME}) if (nv_test_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1) + set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) endif() endif() endfunction(nv_test) @@ -561,7 +577,7 @@ function(py_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} + COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 850098297e1456487cb8a7b83dffd3d2b0478689..c6979713231f631f8757e4139d6f685d4554b54e 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -12,19 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -set_property(GLOBAL PROPERTY FLUID_MODULES "") -# find all fluid modules is used for paddle fluid static library -function(find_fluid_modules TARGET_NAME) - get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) - string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) - string(FIND "${__target_path}" "fluid" pos) - if(pos GREATER 1) - get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) - set(fluid_modules ${fluid_modules} ${TARGET_NAME}) - set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") - endif() -endfunction(find_fluid_modules) - # make package for paddle fluid shared and static library function(copy TARGET) set(options "") @@ -154,7 +141,7 @@ set(inference_deps paddle_fluid_shared paddle_fluid) if(WITH_CONTRIB) message(STATUS "installing contrib") set(contrib_dst_dir "${FLUID_INSTALL_DIR}/contrib/inference") - if (WITH_ANAKIN) + if (WITH_ANAKIN AND WITH_GPU) copy(contrib_anakin_inference_lib DEPS paddle_inference_api inference_anakin_api SRCS ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libinference_anakin_api* # compiled anakin api @@ -163,9 +150,9 @@ if(WITH_CONTRIB) list(APPEND inference_deps contrib_anakin_inference_lib) endif() - copy(contrib_inference_lib DEPS paddle_inference_api + copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h - ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.* + ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api* DSTS ${contrib_dst_dir} ${contrib_dst_dir}) list(APPEND inference_deps contrib_inference_lib) endif() diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt index 2cd6ab2bbf042bced41957193a0269f477eb10d0..a8bbb4eb8081420ae0bbaf761bd27303c0d043cb 100644 --- a/paddle/contrib/inference/CMakeLists.txt +++ b/paddle/contrib/inference/CMakeLists.txt @@ -46,6 +46,10 @@ cc_library(paddle_inference_api SRCS paddle_inference_api.cc paddle_inference_api_impl.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) +cc_library(paddle_inference_api_shared SHARED + SRCS paddle_inference_api.cc paddle_inference_api_impl.cc + DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) + cc_test(test_paddle_inference_api SRCS 
test_paddle_inference_api.cc DEPS paddle_inference_api) diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index bc48fd3b479157d4aea390cd5f4dc61ea46dca4b..cd00b7de7338982308acfa1f1e8c38e010c6a43b 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -147,9 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, "Input tensor type is not supported: ", in.type().name()); memory::data_type out_type = in_type; - auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format()); + auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); auto out_format = - MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); + platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); void* in_data = GetDataFromTensor(in, in_type); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 67f91e4e48d3e11ed493c5e6943cb9071aff60c4..90bb206ec6b698bc23ad1a5c9609a25186ec6de8 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -62,12 +62,6 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { return MKLDNNDataType::data_undef; } -inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size, - MKLDNNFormat default_format) { - return (dims_size == 1 - ? mkldnn::memory::format::x - : dims_size == 2 ? mkldnn::memory::format::nc : default_format); -} #endif void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 635eb404d4a94e9a765f511bb249ce0cce22c477..82872224501709080ff02a13464d58543a0abda8 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -18,6 +18,10 @@ limitations under the License. */ #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_type_transform.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace framework { @@ -48,8 +52,8 @@ void TransformData(const OpKernelType &expected_kernel_type, // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel // Just set layout/format. No real transform occur - auto out_format = - MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin)); + auto out_format = platform::MKLDNNFormatForSize(in.dims().size(), + ToMKLDNNFormat(lin)); out.ShareDataWith(input_tensor); out.set_layout(DataLayout::kMKLDNN); diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 87b0ff0c80270c5b29f81dde66fb16c5fc2b9e54..39287501385844a7ee1225d132c9372325135342 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -20,9 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" -#ifdef PADDLE_WITH_DISTRIBUTE -#include "paddle/fluid/operators/distributed/grpc_client.h" -#endif +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index d29d8ce1c561e45980d10c17c984ca2ed3b453f3..5373d769a4993bb378b30c3b23885c072b778e5c 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -68,7 +68,7 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { // only print first ten elements int64_t size = t.numel() < 10 ? t.numel() : 10; for (int64_t i = 0; i < size; ++i) { - if (t.type().hash_code() == typeid(float).hash_code()) { + if (t.type().hash_code() == typeid(float).hash_code()) { // NOLINT os << t.data()[i] << " "; } else if (t.type().hash_code() == typeid(int64_t).hash_code()) { os << t.data()[i] << " "; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index aa1a42fc97310e4c149f2018f1706d8e42630a98..4586183d8d206d07f8dcdc000a5ce0bc65d847d5 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -748,10 +748,6 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &var->Get(); } else if (var->IsType()) { t = &(var->Get().value()); - } else if (var->IsType()) { - const LoDTensorArray& arr = var->Get(); - PADDLE_ENFORCE(arr.size() > 0); - t = &(arr[0]); } if (t != nullptr) { int tmp = static_cast(ToDataType(t->type())); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 751b10eeeed10828c08ada4173300c07f81c093e..b53a6f43fbd1f23e69d23ad0fcc54d5c25d352a3 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -253,9 +253,6 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( t->set_lod(lod_tensors[j].lod()); } } - for (auto &p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } } ParallelExecutor::~ParallelExecutor() { diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 96114678a9992f2975c4173c7cc003114f04d8df..7f678f869aac4616c8bca440d0431f765da41dd6 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -23,9 +23,9 @@ namespace framework { template inline const T* Tensor::data() const { check_memory_size(); - PADDLE_ENFORCE(std::is_same::value || - holder_->type() == std::type_index(typeid(T)), - "Tensor holds the wrong type, it holds %s", + bool valid = std::is_same::value || + holder_->type() == std::type_index(typeid(T)); + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", this->holder_->type().name()); return reinterpret_cast( @@ -37,9 +37,9 @@ inline bool Tensor::IsInitialized() const { return holder_ != nullptr; } template inline T* Tensor::data() { check_memory_size(); - PADDLE_ENFORCE(std::is_same::value || - holder_->type() == std::type_index(typeid(T)), - "Tensor holds the wrong type, it holds %s", + bool valid = std::is_same::value || + holder_->type() == std::type_index(typeid(T)); + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", this->holder_->type().name()); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); diff --git 
a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index e5bc74755f46449296a153e8b330968e6d9f1e1d..f98011e896f4033ef210e0eb69f93ce7800a3cd6 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); auto stream = reinterpret_cast(ctx).stream(); - memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream); + if (platform::is_same_place(src_place, dst_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + if (platform::is_same_place(ctx_place, src_place)) { + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + } else if (platform::is_same_place(ctx_place, dst_place)) { + platform::DeviceContextPool::Instance().Get(src.place())->Wait(); + memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, + stream); + } else { + PADDLE_THROW("ctx does not belong to dst_gpu_place or src_gpu_place."); + } + } } #endif } @@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); const platform::DeviceContext* dev_ctx; - if (platform::is_gpu_place(src.place())) { - dev_ctx = pool.Get(src.place()); - } else { + if (platform::is_gpu_place(dst_place)) { dev_ctx = pool.Get(dst_place); + } else { + dev_ctx = pool.Get(src.place()); } TensorCopy(src, dst_place, *dev_ctx, dst); } diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index dca279b69382b80e055f661cefe84b81326704b5..4457382ade37a12f5f3613fc4113fbf1f6f91124 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -23,10 +23,25 @@ limitations under the License. */ namespace paddle { namespace framework { +// NOTE(zcd): Because TensorCopy is an async operation, when src_place +// and dst_place are two different GPUs, TensorCopy performs a wait on the +// src context to ensure that the operation is carried out correctly. +// If ctx_place and src_place are the same, src_ctx.Wait() is added +// after memory::Copy; if ctx_place and dst_place are the same, +// src_ctx.Wait() is added before memory::Copy. void TensorCopy(const Tensor& src, const platform::Place& dst_place, const platform::DeviceContext& ctx, Tensor* dst); + +// NOTE(zcd): If src.place() and dst_place are two different GPUs, +// the copy operation is carried out on dst_place's stream. This is +// very important, because TensorCopy is an async operator, and in most +// cases dst is to be used in dst_place's stream once this copy operator +// returns. If the copy were carried out on src_place's stream instead, +// the copy might not yet be complete when dst is used in dst_place's +// stream.
void TensorCopy(const Tensor& src, const platform::Place& dst_place, Tensor* dst); + void TensorCopySync(const Tensor& src, const platform::Place& dst_place, Tensor* dst); diff --git a/paddle/fluid/inference/analysis/README.md b/paddle/fluid/inference/analysis/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4c5de189cd1eab1ba3de0b2cdfd2294d139ceab2 --- /dev/null +++ b/paddle/fluid/inference/analysis/README.md @@ -0,0 +1,57 @@ +# Inference Analysis + +The `inference/analysis` module is used to analyze and optimize the inference program. +It borrows some philosophy from `LLVM/analysis`, +and makes the various optimization features pluggable so that they can co-exist in a pipeline. + +We borrowed some concepts from LLVM, such as + +- [Pass](./pass.h)es to implement optimizations that traverse the inference program, +- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program, +- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph. + +There are some other basic concepts: + +- [Node](./node.h), the node in a `DataFlowGraph`, + - `Function`, the Operator in Fluid, + - `Value`, the Variable in Fluid; +- [Argument](./argument.h), the argument that is treated as the input and output of all `Pass`es in the pipeline. + +## How it works + +The `inference/analysis` module arranges all the passes into a pipeline, and works in the following way: + +1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc, +2. Call the middle passes one by one; the same `DataFlowGraph` is passed across all the passes, +3. Transform a new ProgramDesc from the modified `DataFlowGraph`. + +New optimization features can be added as independent `Pass`es and controlled by gflags; +each pass generates unified debug information or a visualization for easier debugging. + +## Supported Passes + +### `FluidToDataFlowGraphPass` +Transforms the fluid `ProgramDesc` into a `DataFlowGraph` that gives an abstract representation for all the middle passes; +this should be the first pass of the pipeline. + +### `DataFlowGraphToFluidPass` +Generates a final `ProgramDesc` from a data flow graph; this should be the last pass of the pipeline. + +### `TensorRTSubgraphNodeMarkPass` +Marks the `Node`s that are supported by TensorRT; +this pass generates a visualization file which can be used for debugging. + +### `TensorRTSubGraphPass` +Splits out the sub-graphs that can be accelerated by TensorRT. + +### `DFG_GraphvizDrawPass` +This pass is just for debugging; it visualizes the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool. + +It can be used as a helper class that draws the modified graph after each pass. + +## Utilities + +There are some helper functions/classes for analysis. + +- [dot.h](./dot.h) gives an easy-to-use interface for generating `DOT` code, +- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms; it uses `iterator`s to make the algorithms easy to share across different passes. diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 5d85530969c5bec1c84d5f5b0d2626431a9e1c63..a4625f008c15300b88ef0bce71cd7d8aa473c9a8 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ b/paddle/fluid/inference/analysis/analyzer.cc @@ -13,6 +13,7 @@ // limitations under the License.
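To make the NOTE(zcd) comments in tensor_util.h above concrete, here is a minimal caller-side sketch based only on the `TensorCopy`/`TensorCopySync` declarations in this patch; the helper `CopyThenUse` is hypothetical and for illustration, not code from the patch:

```cpp
#include "paddle/fluid/framework/tensor_util.h"

// Hypothetical helper, for illustration only.
void CopyThenUse(const paddle::framework::Tensor& src,
                 const paddle::platform::Place& dst_place,
                 paddle::framework::Tensor* dst) {
  namespace fw = paddle::framework;

  // Async variant: per the NOTE above, the copy is enqueued on dst_place's
  // stream, so dst may only be consumed by later work on that same stream,
  // where stream ordering guarantees the copy has finished.
  fw::TensorCopy(src, dst_place, dst);

  // Sync variant: waits for the copy to complete, so dst can then be read
  // from any stream or from the host.
  fw::TensorCopySync(src, dst_place, dst);
}
```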
#include "paddle/fluid/inference/analysis/analyzer.h" +#include #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" @@ -79,4 +80,4 @@ void Analyzer::Run(Argument* argument) { } // namespace analysis } // namespace inference -} // namespace paddle \ No newline at end of file +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index f290a3777d5be2ef64667d8c17ec59adddc3ef1b..e9e14fb1947da059c8d126d3da182ce446f6421e 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once + /* * This file contains Analyzer, an class that exposed as a library that analyze * and optimize diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h index 30c60661f3492034248e164a70a682bae3819d23..a4fefc83e0c551d52bec87299bcbc966e7a2dbf7 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph.h +++ b/paddle/fluid/inference/analysis/data_flow_graph.h @@ -138,7 +138,7 @@ struct GraphTraits { // sub-graph is the inputs nodes and output nodes that doesn't inside the // sub-graph. static std::pair, std::vector> -ExtractInputAndOutputOfSubGraph(std::vector &graph) { +ExtractInputAndOutputOfSubGraph(std::vector &graph) { // NOLINT std::unordered_set nodes(graph.begin(), graph.end()); std::unordered_set inputs; std::unordered_set outputs; diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc index e74efd17b834db1d0314c8b7082f3e9c15d6eda3..29ca008123addf07959b965a4b54bf55b18c401d 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" +#include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/proto_desc.h" @@ -150,13 +151,14 @@ namespace { class DFG_DebuggerPass : public DFG_GraphvizDrawPass { public: using Config = DFG_GraphvizDrawPass::Config; - DFG_DebuggerPass(const Config& config) : DFG_GraphvizDrawPass(config) {} + explicit DFG_DebuggerPass(const Config& config) + : DFG_GraphvizDrawPass(config) {} std::string repr() const override { return "dfg-to-fluid-debuger-pass"; } bool Finalize() override { return true; } }; -} +} // namespace Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const { return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config( diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h index 1726e056ed37e2e5fbe2042851ca9bd188806bac..edc84b02ed20991e3e7c6c437d2b1fac169bae03 100644 --- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h +++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h @@ -19,6 +19,7 @@ #pragma once +#include #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/inference/analysis/data_flow_graph.h" #include "paddle/fluid/inference/analysis/pass.h" diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h index b064782586f6243353eda67ac8db040509716b20..17445ab4407a159ca11345bc9a9226b3ad0044f0 100644 --- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h +++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h @@ -46,7 +46,7 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass { const bool display_deleted_node; }; - DFG_GraphvizDrawPass(const Config &config) : config_(config) {} + explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {} bool Initialize(Argument *argument) override { return true; } void Run(DataFlowGraph *graph) override; diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc index 5d7eb43b7cbd7bc45b5f0c940bf80ad72348e1b9..e918622d74cfb11d83090555be2a768cc14e7742 100644 --- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc +++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include -#include "analyzer.h" +#include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" @@ -88,7 +88,8 @@ namespace { class DFG_DebuggerPass : public DFG_GraphvizDrawPass { public: using Config = DFG_GraphvizDrawPass::Config; - DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {} + explicit DFG_DebuggerPass(const Config &config) + : DFG_GraphvizDrawPass(config) {} std::string repr() const override { return "fluid-to-dfg-debuger-pass"; } bool Finalize() override { return true; } }; diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc index 6caba8f04237e014c5ddf1a3a077bcbadb0ddb71..dac1c509d728114bd24a2ea1150c407646026fd4 100644 --- a/paddle/fluid/inference/analysis/pass_manager_tester.cc +++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc @@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/inference/analysis/pass_manager.h" +#include + #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h" +#include "paddle/fluid/inference/analysis/pass_manager.h" #include "paddle/fluid/inference/analysis/ut_helper.h" -#include - namespace paddle { namespace inference { namespace analysis { diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc index 5ad092a9ed201e5e6ab7770bcfd9ddf871779c12..f736e385c11add152dc9ab9485bf1de40f80b2f3 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc @@ -12,10 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" +#include + #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h" #include "paddle/fluid/inference/analysis/node_attr_flags.h" +#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h" namespace paddle { namespace inference { @@ -29,7 +31,7 @@ void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) { class DfgDebuggerPass : public DFG_GraphvizDrawPass { public: - DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config) + explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config) : DFG_GraphvizDrawPass(config) {} std::string repr() const override { diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h index 6cfac55d3b7b501e8ccc141cb7309f1428478672..c558a6ebbde371071c7330a14cc986bf764d1773 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h @@ -16,6 +16,10 @@ * This file defines TensorRTSubgraphNodeMarkPass which helps to mark the ops * that supported by TensorRT engine. 
*/ + +#pragma once + +#include #include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h" @@ -30,7 +34,8 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { public: using teller_t = SubGraphSplitter::NodeInsideSubgraphTeller; - TensorRTSubgraphNodeMarkPass(const teller_t& teller) : teller_(teller) {} + explicit TensorRTSubgraphNodeMarkPass(const teller_t& teller) + : teller_(teller) {} bool Initialize(Argument* argument) override { return true; } @@ -38,8 +43,10 @@ class TensorRTSubgraphNodeMarkPass : public DataFlowGraphPass { // sub-graph into TensorRT. void Run(DataFlowGraph* graph) override; - std::string repr() const { return "tensorrt-sub-subgraph-mark"; } - std::string description() const { return "tensorrt sub-graph mark pass"; } + std::string repr() const override { return "tensorrt-sub-subgraph-mark"; } + std::string description() const override { + return "tensorrt sub-graph mark pass"; + } Pass* CreateGraphvizDebugerPass() const override; bool Finalize() override; diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h index 11e088069538414c79371b920cb8fa1509b24bb1..c6741a92095d33d261a4e1667c87a8ca02e51a9f 100644 --- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include "paddle/fluid/inference/analysis/node.h" #include "paddle/fluid/inference/analysis/pass.h" #include "paddle/fluid/inference/analysis/subgraph_splitter.h" @@ -30,7 +31,7 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { // Tell whether to transform a sub-graph into TensorRT. using NodeInsideSubgraphTeller = SubGraphFuse::NodeInsideSubgraphTeller; - TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); + explicit TensorRTSubGraphPass(const NodeInsideSubgraphTeller& teller); bool Initialize(Argument* argument) override { return true; } @@ -40,8 +41,8 @@ class TensorRTSubGraphPass : public DataFlowGraphPass { bool Finalize() override { return true; } - std::string repr() const { return "tensorrt-sub-graph"; } - std::string description() const { return "tensorrt sub graph pass"; } + std::string repr() const override { return "tensorrt-sub-graph"; } + std::string description() const override { return "tensorrt sub graph pass"; } private: NodeInsideSubgraphTeller node_inside_subgraph_teller_; @@ -49,4 +50,4 @@ } // namespace analysis } // namespace inference -} // paddle +} // namespace paddle diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 0c74f62de5c6f5d432ee928945db6dcf385ca209..bd98ed81899440a46415d30b6d74fec2dac4c155 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -20,6 +20,12 @@ limitations under the License. */ #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/gpu_info.h" +DEFINE_bool(init_allocated_mem, false, + "Some op implementations mistakenly assume that memory allocated by " + "BuddyAllocator is always zeroed. To catch such errors early, set " + "init_allocated_mem to initialize allocated memory with a small " + "value during unit testing."); DECLARE_double(fraction_of_gpu_memory_to_use); namespace paddle { @@ -41,6 +47,9 @@ template <> void* Alloc(platform::CPUPlace place, size_t size) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void* p = GetCPUBuddyAllocator()->Alloc(size); + if (FLAGS_init_allocated_mem) { + memset(p, 0xEF, size); + } VLOG(10) << " pointer=" << p; return p; } @@ -104,6 +113,9 @@ void* Alloc(platform::CUDAPlace place, size_t size) { LOG(WARNING) << "GPU memory used: " << Used(place); platform::SetDeviceId(cur_dev); } + if (FLAGS_init_allocated_mem) { + cudaMemset(ptr, 0xEF, size); + } return ptr; } @@ -137,6 +149,9 @@ void* Alloc(platform::CUDAPinnedPlace place, LOG(WARNING) << "cudaMallocHost Cannot allocate " << size << " bytes in CUDAPinnedPlace"; } + if (FLAGS_init_allocated_mem) { + memset(ptr, 0xEF, size); + } return ptr; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9dc39ad0ddf8c5de3e1960a1171431e026de35ae..ab1d2143330fb8cbfd535758a83bc71de939c4e0 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -184,6 +184,7 @@ else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() +set(DISTRIBUTE_DEPS "") if(WITH_DISTRIBUTE) add_subdirectory(distributed) @@ -192,6 +193,18 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) else() set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib) + if(WITH_BRPC_RDMA) + find_library(IBVERBS_LIBRARY NAMES ibverbs) + ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY}) + + + find_library(RDMACM_LIBRARY NAMES rdmacm) + ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY}) + + set(DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} ibverbs rdmacm) + endif() endif() set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") @@ -205,7 +218,7 @@ if(WITH_DISTRIBUTE) # listen_and_serv_op sum_op executor SERIAL) if(WITH_GPU) set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op executor SERIAL) + cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL) if(WITH_GRPC) op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc) else() @@ -297,6 +310,7 @@ foreach(src ${DETECTION_LIBRARY}) endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") +set(GLOB_DISTRIBUTE_DEPS ${DISTRIBUTE_DEPS} CACHE INTERNAL "distributed dependency") cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc index 6ecb43c49c30f9da2a273d506f7b85c0a4f5fa2c..9ab2179b5fe689762704039c5f67dd080e530aa5 100644 --- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc @@ -115,9 +115,12 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu; // create mkldnn memory from input x tensor - auto src_memory
= - memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, - to_void_cast(x_data)); + mkldnn::memory::format input_format = + platform::MKLDNNFormatForSize(src_tz.size(), x->format()); + + auto src_memory = memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data)); // create primitive descriptor for batch norm forward using bn_fwd_types = bn_type_traits; @@ -251,15 +254,21 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { using bn_bwd_types = bn_type_traits; // create mkldnn memory from input diff_y tensor - auto user_diff_dst_memory = - memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()}, - mkldnn_engine}, - to_void_cast(diff_y_data)); + + mkldnn::memory::format dst_format = + platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format()); + + auto user_diff_dst_memory = memory( + {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine}, + to_void_cast(diff_y_data)); // create mkldnn memory from input x tensor - auto src_memory = - memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine}, - to_void_cast(x_data)); + mkldnn::memory::format input_format = + platform::MKLDNNFormatForSize(src_tz.size(), x->format()); + + auto src_memory = memory( + {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine}, + to_void_cast(x_data)); // for diff_dst, try to use same format as dst in forward pass auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc(); diff --git a/paddle/fluid/operators/detail/macros.h b/paddle/fluid/operators/detail/macros.h index b9e385994efcea0388756e8bd780ebfc719ed08d..6f4a15caa5542a45cd8e26a72b055ca8948069d0 100644 --- a/paddle/fluid/operators/detail/macros.h +++ b/paddle/fluid/operators/detail/macros.h @@ -14,14 +14,22 @@ #pragma once +#ifdef PADDLE_WITH_DISTRIBUTE + #ifdef PADDLE_WITH_GRPC + #include "paddle/fluid/operators/distributed/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_server.h" -#define RPCSERVER_T distributed::AsyncGRPCServer -#define RPCCLIENT_T distributed::GRPCClient -#else +#define RPCSERVER_T paddle::operators::distributed::AsyncGRPCServer +#define RPCCLIENT_T paddle::operators::distributed::GRPCClient + +#else // PADDLE_WITH_GRPC + #include "paddle/fluid/operators/distributed/brpc_client.h" #include "paddle/fluid/operators/distributed/brpc_server.h" -#define RPCSERVER_T distributed::AsyncBRPCServer -#define RPCCLIENT_T distributed::BRPCClient -#endif +#define RPCSERVER_T paddle::operators::distributed::AsyncBRPCServer +#define RPCCLIENT_T paddle::operators::distributed::BRPCClient + +#endif // PADDLE_WITH_GRPC + +#endif // PADDLE_WITH_DISTRIBUTE diff --git a/paddle/fluid/operators/distributed/rpc_client.cc b/paddle/fluid/operators/distributed/rpc_client.cc index 382b65d637c0be6ac432d2b5feb44e0df3c6fd6e..b5ec9fe5367beb97b3cc7298102deff1e8ca4ec9 100644 --- a/paddle/fluid/operators/distributed/rpc_client.cc +++ b/paddle/fluid/operators/distributed/rpc_client.cc @@ -16,7 +16,7 @@ #include "gflags/gflags.h" // default to 3min to avoid temporary network failures.
-DEFINE_int32(rpc_deadline, 30000, "deadline timeouts for rpc"); +DEFINE_int32(rpc_deadline, 180000, "deadline timeouts for rpc"); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fill_zeros_like_op.cc b/paddle/fluid/operators/fill_zeros_like_op.cc index a9d47c017275193cdacc7db8f31e8e874b9b84de..d67bec36b3248be8602da562a88aeb58f5effe39 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.cc +++ b/paddle/fluid/operators/fill_zeros_like_op.cc @@ -26,12 +26,8 @@ class FillZerosLikeOp : public framework::OperatorWithKernel { "Input(X) of FillZerosLikeOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of FillZerosLikeOp should not be null."); - - if (ctx->IsRuntime() && - ctx->GetOutputsVarType("Out")[0] == - framework::proto::VarType::LOD_TENSOR_ARRAY) { - return; // skip runtime infershape when is tensor array; - } + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); } }; @@ -43,7 +39,7 @@ class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( FillZerosLike Operator. -Fill up a variable with zeros, supporting both LoDTensor and LoDTensorArray. +Fill up a variable with zeros. The output will have the same size as the input. )DOC"); diff --git a/paddle/fluid/operators/fill_zeros_like_op.h b/paddle/fluid/operators/fill_zeros_like_op.h index daa6521b32e583f733bc040afb61bf13c4236731..4bbe0df6b6890122381c87494e510cf125792377 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.h +++ b/paddle/fluid/operators/fill_zeros_like_op.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -24,29 +23,12 @@ template class FillZerosLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto var = context.InputVar("X"); - if (var->IsType()) { - auto& input = *context.Input("X"); - auto& output = *context.Output("Out"); - output.Resize(input.dims()); - output.set_lod(input.lod()); - output.mutable_data(context.GetPlace()); - math::SetConstant setter; - setter(context.template device_context(), &(output), - static_cast(0)); - } else if (var->IsType()) { - auto& input = *context.Input("X"); - auto& output = *context.Output("Out"); - output.resize(input.size()); - for (auto i = 0; i < input.size(); i++) { - output[i].Resize(input[i].dims()); - output[i].set_lod(input[i].lod()); - output[i].mutable_data(context.GetPlace()); - math::SetConstant setter; - setter(context.template device_context(), &(output[i]), - static_cast(0)); - } - } + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + math::SetConstant setter; + setter(context.template device_context(), out, + static_cast(0)); } }; diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a34e4371cccfd1be0d173fa11595e4368eb65b85..70bc9c4e8340b8e02ef2826d828faff3f6d11965 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -113,7 +113,11 @@ template inline typename std::enable_if::type throw_on_error( bool stat, const Args&... 
args) { if (UNLIKELY(!(stat))) { +#ifndef REPLACE_ENFORCE_GLOG throw std::runtime_error(string::Sprintf(args...)); +#else + LOG(FATAL) << string::Sprintf(args...); +#endif } } @@ -123,8 +127,12 @@ template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { if (UNLIKELY(e)) { +#ifndef REPLACE_ENFORCE_GLOG throw thrust::system_error(e, thrust::cuda_category(), string::Sprintf(args...)); +#else + LOG(FATAL) << string::Sprintf(args...); +#endif } } @@ -132,8 +140,12 @@ template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { if (stat != CURAND_STATUS_SUCCESS) { +#ifndef REPLACE_ENFORCE_GLOG throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), string::Sprintf(args...)); +#else + LOG(FATAL) << string::Sprintf(args...); +#endif } } @@ -143,8 +155,12 @@ inline typename std::enable_if::type throw_on_error( if (stat == CUDNN_STATUS_SUCCESS) { return; } else { +#ifndef REPLACE_ENFORCE_GLOG throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + string::Sprintf(args...)); +#else + LOG(FATAL) << string::Sprintf(args...); +#endif } } @@ -173,7 +189,11 @@ inline typename std::enable_if::type throw_on_error( } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { err = "CUBLAS: license error, "; } +#ifndef REPLACE_ENFORCE_GLOG throw std::runtime_error(err + string::Sprintf(args...)); +#else + LOG(FATAL) << err << string::Sprintf(args...); +#endif } #ifndef __APPLE__ @@ -183,8 +203,13 @@ inline typename std::enable_if::type throw_on_error( if (stat == ncclSuccess) { return; } else { +#ifndef REPLACE_ENFORCE_GLOG throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) + string::Sprintf(args...)); +#else + LOG(FATAL) << platform::dynload::ncclGetErrorString(stat) + << string::Sprintf(args...); +#endif } } #endif // __APPLE__ @@ -203,6 +228,7 @@ inline void throw_on_error(T e) { __FILE__, __LINE__); \ } while (false) +#ifndef REPLACE_ENFORCE_GLOG #define PADDLE_ENFORCE(...) \ do { \ try { \ @@ -212,6 +238,9 @@ inline void throw_on_error(T e) { __FILE__, __LINE__); \ } \ } while (false) +#else +#define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); +#endif /* * Some enforce helpers here, usage: diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index ed99932546446eb877c9701de15e2d37d29b5f88..33fec2c1073819d88d85a8872227adcb9df3e8f4 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -14,6 +14,7 @@ limitations under the License. 
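The enforce.h edits above all apply one pattern: when REPLACE_ENFORCE_GLOG is defined (the new CMake option at the top of this patch), each failure path goes through glog's LOG(FATAL) instead of throwing, so the process aborts with the message in the log rather than unwinding. A condensed, self-contained sketch of that pattern (the real code formats messages with string::Sprintf and has per-status-type overloads):

```cpp
#include <stdexcept>
#include <glog/logging.h>

// Condensed sketch of the REPLACE_ENFORCE_GLOG switch in enforce.h.
inline void ThrowOnError(bool stat, const char* msg) {
  if (!stat) {
#ifndef REPLACE_ENFORCE_GLOG
    throw std::runtime_error(msg);  // default: unwind with an exception
#else
    LOG(FATAL) << msg;  // glog logs the message and aborts the process
#endif
  }
}
```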
*/ #pragma once #include +#include #include #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/place.h" @@ -182,10 +183,11 @@ class MKLDNNHandler { } std::shared_ptr AcquireMemory( - mkldnn::memory::primitive_desc& mpd, - mkldnn::memory::primitive_desc& user_mpd, + mkldnn::memory::primitive_desc& mpd, // NOLINT + mkldnn::memory::primitive_desc& user_mpd, // NOLINT const std::shared_ptr user_memory_p, - const std::string& suffix, std::vector& pipeline) { + const std::string& suffix, + std::vector& pipeline) { // NOLINT // create reorder primitive if the input format is not the preferred one auto local_key = key_ + suffix; auto key_reorder_p = key_ + suffix + "reorder_p"; @@ -218,7 +220,7 @@ class MKLDNNHandler { return target_memory_p; } - static std::string GetHash(mkldnn::memory::dims& operand_dims, + static std::string GetHash(mkldnn::memory::dims& operand_dims, // NOLINT const std::string& suffix) { auto dims2str = [](const mkldnn::memory::dims& operand_dims) { std::string dstr = ""; @@ -227,8 +229,9 @@ class MKLDNNHandler { } return dstr; }; + return dims2str(operand_dims) + suffix; - }; + } protected: const MKLDNNDeviceContext& dev_ctx_; @@ -237,5 +240,15 @@ class MKLDNNHandler { bool is_reusing_; }; +inline mkldnn::memory::format MKLDNNFormatForSize( + size_t dims_size, mkldnn::memory::format data_format) { + if (dims_size == 1) { + return mkldnn::memory::format::x; + } else if (dims_size == 2) { + return mkldnn::memory::format::nc; + } + return data_format; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/string/printf.h b/paddle/fluid/string/printf.h index 062095a1c3e977c0bcc89346ead765acb023bcf7..e0f6202506868fd0dcbe93297785682edbcfe792 100644 --- a/paddle/fluid/string/printf.h +++ b/paddle/fluid/string/printf.h @@ -84,7 +84,7 @@ void Fprintf(std::ostream& out, const char* fmt, const Args&... args) { } template -std::string Sprintf(const char* fmt, const Args&... args) { +std::string Sprintf(const char* fmt = "", const Args&... args) { std::ostringstream oss; Fprintf(oss, fmt, args...); return oss.str(); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 45af83708ea63fc1b6aa86f1e8423bb44b7388a6..3034c1a0875a71421bcba172c16ee32d809df152 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -118,7 +118,8 @@ def __bootstrap__(): read_env_flags = [ 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir', - 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb' + 'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb', + 'init_allocated_mem' ] if core.is_compiled_with_cuda(): read_env_flags += [ diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 61c01b3b0056648b6cc67430d5d3bfffc8412928..bcf520d5a4e3bbe1d949d08f42199dd8c5cdc947 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -95,7 +95,6 @@ __all__ = [ 'relu', 'log', 'crop', - 'fill_zeros_like', ] @@ -5185,40 +5184,3 @@ def crop(x, shape=None, offsets=None, name=None): outputs={'Out': out}, attrs=None if len(attrs) == 0 else attrs) return out - - -def fill_zeros_like(x): - """ - This layer takes an input and outputs a variable that has the same structure as - the input and with all the element values as zero. The variable can be a Tensor - or TensorArray. - - .. code-block:: text - - - Given - X = [[0, 1, 2, 0], - [0, 3, 4, 0], - [0, 0, 0, 0]], - output is: - Out = [[0, 0, 0, 0], - [0, 0, 0, 0], - [0, 0, 0, 0]]. 
- - Args: - x (Variable): The input variable, which could be a tensor or tensor array - - Returns: - Variable: The zero-filled variable, which has the same type and shape as - the input variable. - - Examples: - - .. code-block:: python - y = fluid.layers.fill_zeros_like(x) - """ - helper = LayerHelper('fill_zeros_like', **locals()) - out = helper.create_tmp_variable(dtype=x.dtype) - helper.append_op( - type='fill_zeros_like', inputs={'X': [x]}, outputs={'Out': [out]}) - return out diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index 74f96f456a8dc917b715d0f4308bb5ea41947f0b..71bf5f8b3a9b17f24ce35220a9348bb871852623 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -110,14 +110,23 @@ def infer(use_cuda, save_dirname=None): # The input's dimension should be 2-D and the second dim is 13 # The input data should be >= 0 batch_size = 10 - tensor_x = numpy.random.uniform(0, 10, - [batch_size, 13]).astype("float32") + + test_reader = paddle.batch( + paddle.dataset.uci_housing.test(), batch_size=batch_size) + + test_data = test_reader().next() + test_feat = numpy.array( + [data[0] for data in test_data]).astype("float32") + test_label = numpy.array( + [data[1] for data in test_data]).astype("float32") + assert feed_target_names[0] == 'x' results = exe.run(inference_program, - feed={feed_target_names[0]: tensor_x}, + feed={feed_target_names[0]: numpy.array(test_feat)}, fetch_list=fetch_targets) print("infer shape: ", results[0].shape) print("infer results: ", results[0]) + print("ground truth: ", test_label) def main(use_cuda, is_local=True): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 5f27864c140573086d07415f83caca708889a068..f6c8dcabcbc592024188f4742e6c532a704d2289 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -52,3 +52,4 @@ py_test_modules(test_parallel_executor_crf MODULES test_parallel_executor_crf SE py_test_modules(test_parallel_executor_fetch_feed MODULES test_parallel_executor_fetch_feed SERIAL) set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 180) +set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 180) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 21f2037ad408b0a92718c0ea2bae5e8bf563c665..cddf00765f4894126988c794763c34629449e8e6 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -18,6 +18,8 @@ import unittest import paddle.fluid as fluid import time import numpy as np +import math +import sys __all__ = ['TestParallelExecutorBase'] @@ -93,6 +95,12 @@ class TestParallelExecutorBase(unittest.TestCase): print "%.4f Instance per second" % ( (batch_size * iter + 2) / (end - begin)) + avg_last_loss_val = np.array(last_loss).mean() + avg_first_loss_val = np.array(first_loss).mean() + if math.isnan(float(avg_last_loss_val)) or math.isnan( + float(avg_first_loss_val)): + sys.exit("got NaN loss, training failed.") + print first_loss, last_loss # self.assertGreater(first_loss[0], last_loss[0]) return first_loss, last_loss diff --git a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py 
b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py new file mode 100644 index 0000000000000000000000000000000000000000..712fd5849d80b1915ae3b2ae5108bedee8d88a2c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py @@ -0,0 +1,203 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import argparse +import time +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +from paddle.fluid import core +import unittest +from multiprocessing import Process +import os +import signal + +IS_SPARSE = True +EMBED_SIZE = 32 +HIDDEN_SIZE = 256 +N = 5 +BATCH_SIZE = 32 +ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy + + +def get_model(): + def __network__(words): + embed_first = fluid.layers.embedding( + input=words[0], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_second = fluid.layers.embedding( + input=words[1], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_third = fluid.layers.embedding( + input=words[2], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + embed_forth = fluid.layers.embedding( + input=words[3], + size=[dict_size, EMBED_SIZE], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr='shared_w') + + concat_embed = fluid.layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], axis=1) + hidden1 = fluid.layers.fc(input=concat_embed, + size=HIDDEN_SIZE, + act='sigmoid') + predict_word = fluid.layers.fc(input=hidden1, + size=dict_size, + act='softmax') + cost = fluid.layers.cross_entropy(input=predict_word, label=words[4]) + avg_cost = fluid.layers.mean(cost) + return avg_cost, predict_word + + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + + first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') + second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') + third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') + next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') + avg_cost, predict_word = __network__( + [first_word, second_word, third_word, forth_word, next_word]) + + inference_program = paddle.fluid.default_main_program().clone() + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) + sgd_optimizer.minimize(avg_cost) + + train_reader = paddle.batch( + paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) + test_reader = paddle.batch( + paddle.dataset.imikolov.test(word_dict, N), BATCH_SIZE) + + return inference_program, avg_cost, train_reader, test_reader, predict_word + + +def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers): + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id=trainer_id, + program=main_program, + 
+
+
+def get_transpiler(trainer_id, main_program, pserver_endpoints, trainers):
+    t = fluid.DistributeTranspiler()
+    t.transpile(
+        trainer_id=trainer_id,
+        program=main_program,
+        pservers=pserver_endpoints,
+        trainers=trainers)
+    return t
+
+
+def run_pserver(pserver_endpoints, trainers, current_endpoint):
+    get_model()
+    t = get_transpiler(0,
+                       fluid.default_main_program(), pserver_endpoints,
+                       trainers)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    startup_prog = t.get_startup_program(current_endpoint, pserver_prog)
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    exe.run(startup_prog)
+
+    exe.run(pserver_prog)
+
+
+class TestDistWord2vec(unittest.TestCase):
+    def setUp(self):
+        self._trainers = 1
+        self._pservers = 1
+        self._ps_endpoints = "127.0.0.1:9123"
+
+    def start_pserver(self, endpoint):
+        p = Process(
+            target=run_pserver,
+            args=(self._ps_endpoints, self._trainers, endpoint))
+        p.start()
+        return p.pid
+
+    def _wait_ps_ready(self, pid):
+        retry_times = 5
+        while True:
+            assert retry_times >= 0, "wait ps ready failed"
+            time.sleep(1)
+            try:
+                # the listen_and_serv_op would touch a file which contains the listen port
+                # on the /tmp directory until it was ready to process all the RPC call.
+                os.stat("/tmp/paddle.%d.port" % pid)
+                return
+            except os.error:
+                retry_times -= 1
+
+    def stop_pserver(self, pid):
+        os.kill(pid, signal.SIGKILL)
+
+    def test_with_place(self):
+        p = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+
+        pserver_pid = self.start_pserver(self._ps_endpoints)
+        self._wait_ps_ready(pserver_pid)
+
+        self.run_trainer(p, 0)
+
+        self.stop_pserver(pserver_pid)
+
+    def run_trainer(self, place, trainer_id):
+        test_program, avg_cost, train_reader, test_reader, predict = get_model()
+        t = get_transpiler(trainer_id,
+                           fluid.default_main_program(), self._ps_endpoints,
+                           self._trainers)
+
+        trainer_prog = t.get_trainer_program()
+
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        use_gpu = True if core.is_compiled_with_cuda() else False
+
+        exec_strategy = ExecutionStrategy()
+        exec_strategy.use_cuda = use_gpu
+        train_exe = fluid.ParallelExecutor(
+            use_cuda=use_gpu,
+            main_program=trainer_prog,
+            loss_name=avg_cost.name,
+            exec_strategy=exec_strategy)
+
+        feed_var_list = [
+            var for var in trainer_prog.global_block().vars.itervalues()
+            if var.is_data
+        ]
+
+        feeder = fluid.DataFeeder(feed_var_list, place)
+        for pass_id in xrange(10):
+            for batch_id, data in enumerate(train_reader()):
+                avg_loss_np = train_exe.run(feed=feeder.feed(data),
+                                            fetch_list=[avg_cost.name])
+                loss = np.array(avg_loss_np).mean()
+                if float(loss) < 5.0:
+                    return
+                if math.isnan(loss):
+                    raise AssertionError("got NaN loss, training failed.")
+
+
+if __name__ == "__main__":
+    unittest.main()
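For readers following the new test: the transpiler flow above is the standard fluid recipe for one pserver and one trainer. Each role transpiles the same main program, then extracts its own program from the result. Stripped of the test scaffolding, and with the endpoint and counts taken from the test, it reduces to roughly this sketch:

    import paddle.fluid as fluid

    t = fluid.DistributeTranspiler()
    t.transpile(
        trainer_id=0,                          # rank of this trainer process
        program=fluid.default_main_program(),
        pservers="127.0.0.1:9123",             # comma-separated pserver endpoints
        trainers=1)                            # total number of trainer processes

    # On the parameter-server process: run the generated listen-and-serve program.
    pserver_prog = t.get_pserver_program("127.0.0.1:9123")
    startup_prog = t.get_startup_program("127.0.0.1:9123", pserver_prog)

    # On the trainer process: run the rewritten main program.
    trainer_prog = t.get_trainer_program()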
diff --git a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py
deleted file mode 100644
index 23871508d8042ade5253c2f0b3bc9f32ec71a135..0000000000000000000000000000000000000000
--- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op_for_array.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-import paddle.fluid.core as core
-import numpy
-import paddle.fluid.layers as layers
-from paddle.fluid.framework import Program, program_guard
-from paddle.fluid.executor import Executor
-
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-
-
-class TestFillZerosLikeOpForTensorArray(unittest.TestCase):
-    def place(self):
-        return core.CPUPlace()
-
-    def test_zero_filling_lod_tensor_array(self):
-        tensor = core.LoDTensor()
-        tensor.set(
-            numpy.arange(20).reshape(20, 1).astype('int32'), self.place())
-        tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]])
-
-        expect = [
-            numpy.array(
-                [0, 0, 0, 0, 0], dtype='int32'), numpy.array(
-                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype='int32'),
-            numpy.array(
-                [0, 0, 0], dtype='int32')
-        ]
-
-        lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]]
-        self.main(
-            tensor=tensor,
-            expect_array=expect,
-            expect_lod=lod,
-            expect_max_len=3)
-
-    def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0):
-        place = self.place()
-        program = Program()
-        with program_guard(program):
-            x = layers.data(name='x', shape=[10])
-            x.persistable = True
-            table = layers.lod_rank_table(x, level=level)
-            max_len = layers.max_sequence_len(table)
-            max_len.persistable = True
-            array = layers.lod_tensor_to_array(x, table)
-            array = layers.fill_zeros_like(array)
-            array.persistable = True
-
-            result = layers.array_to_lod_tensor(array, table)
-            result.persistable = True
-        exe = Executor(place)
-        scope = core.Scope()
-        exe.run(program, feed={'x': tensor}, scope=scope)
-        var = scope.find_var(array.name)
-        array = var.get_lod_tensor_array()
-        if expect_array is not None and expect_lod is not None:
-            self.check_array_same(array, expect_array, expect_lod)
-
-        self.assertEqual(
-            numpy.array(scope.find_var(max_len.name).get_tensor())[0],
-            expect_max_len)
-
-    def check_array_same(self, array, expect_tensor, expect_lod):
-        self.assertEqual(len(expect_tensor), len(array))
-        for i, exp in enumerate(zip(expect_tensor, expect_lod)):
-            exp_tensor, exp_lod = exp
-            exp_tensor = numpy.expand_dims(exp_tensor, axis=1)
-            self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i])))
-            self.assertEqual(exp_lod, array[i].lod())
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index 252793944462244539084a288e5259f216359650..9a2733927d38f1a2b1af92fcc12f036158b4d06f 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -16,6 +16,8 @@ import paddle.fluid as fluid
 import numpy as np
 import unittest
 import os
+import sys
+import math
 
 
 def simple_fc_net():
@@ -73,6 +75,14 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
                 train_loss, = train_exe.run([loss.name],
                                             feed=feed_dict)
 
+                avg_test_loss_val = np.array(test_loss).mean()
+                if math.isnan(float(avg_test_loss_val)):
+                    sys.exit("got NaN loss, testing failed.")
+
+                avg_train_loss_val = np.array(train_loss).mean()
+                if math.isnan(float(avg_train_loss_val)):
+                    sys.exit("got NaN loss, training failed.")
+
                 self.assertTrue(
                     np.allclose(
                         train_loss, test_loss, atol=1e-8),