diff --git a/README.md b/README.md
index 8d89c6b1ec9e4aefbd64328dedb4e8c7cc50c21b..63abca069a6629ac59739224ded9cd9f06207d0a 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,6 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
 [![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
 [![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
-[![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
 
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 9c42044ec163e9db1dd21d5c3915b010c30fdf1c..fd7fc16bff5651f022b484623243048fbd225b5a 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -96,6 +96,20 @@ if(NOT APPLE AND NOT ANDROID)
     set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
 endif(NOT APPLE AND NOT ANDROID)
 
+set_property(GLOBAL PROPERTY FLUID_MODULES "")
+# find all fluid modules is used for paddle fluid static library
+# for building inference libs
+function(find_fluid_modules TARGET_NAME)
+  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
+  string(FIND "${__target_path}" "fluid" pos)
+  if(pos GREATER 1)
+    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
+    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
+  endif()
+endfunction(find_fluid_modules)
+
 function(merge_static_libs TARGET_NAME)
   set(libs ${ARGN})
   list(REMOVE_DUPLICATES libs)
@@ -250,6 +264,7 @@ function(cc_test TARGET_NAME)
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     if (${cc_test_SERIAL})
         set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     endif()
   endif()
 endfunction(cc_test)
@@ -314,6 +329,7 @@ function(nv_test TARGET_NAME)
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
         set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     endif()
   endif()
 endfunction(nv_test)
@@ -561,7 +577,7 @@ function(py_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
+             COMMAND env FLAGS_init_allocated_mem=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
   endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 850098297e1456487cb8a7b83dffd3d2b0478689..0c720faa353438b76a72e1574cb90931ddd0cf73 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -12,19 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set_property(GLOBAL PROPERTY FLUID_MODULES "")
-# find all fluid modules is used for paddle fluid static library
-function(find_fluid_modules TARGET_NAME)
-  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
-  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "fluid" pos)
-  if(pos GREATER 1)
-    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
-    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
-    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
-  endif()
-endfunction(find_fluid_modules)
-
 # make package for paddle fluid shared and static library
 function(copy TARGET)
     set(options "")
@@ -163,9 +150,9 @@ if(WITH_CONTRIB)
         list(APPEND inference_deps contrib_anakin_inference_lib)
    endif()
 
-  copy(contrib_inference_lib DEPS paddle_inference_api
+  copy(contrib_inference_lib DEPS paddle_inference_api paddle_inference_api_shared
         SRCS ${PADDLE_SOURCE_DIR}/paddle/contrib/inference/paddle_inference_api.h
-        ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api.*
+        ${PADDLE_BINARY_DIR}/paddle/contrib/inference/libpaddle_inference_api*
         DSTS ${contrib_dst_dir} ${contrib_dst_dir})
   list(APPEND inference_deps contrib_inference_lib)
 endif()
diff --git a/doc/fluid/design/multi_devices/kernel_selection.md b/doc/fluid/design/multi_devices/kernel_selection.md
index 967317d5d2eeb818ab14faabca342cc8c4ed717e..4d2aab87b8cf30d03075e96cc4c67070efaf963a 100644
--- a/doc/fluid/design/multi_devices/kernel_selection.md
+++ b/doc/fluid/design/multi_devices/kernel_selection.md
@@ -74,10 +74,10 @@ void OperatorWithKernel::Run(
     auto kernel_type_for_var = this->GetKernelTypeForVar(...);
     if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
       auto* trans_var = new_scope.Var(var_name);
-      auto* out = DataTransform(expected_kernel_key,
+      auto* out = TransformData(expected_kernel_key,
                                 kernel_type_for_var,
                                 *tensor_in);
-      CopyVariableWithTensor(...);
+      SetTensorToVariable(...);
     }
   }
 
diff --git a/paddle/contrib/inference/CMakeLists.txt b/paddle/contrib/inference/CMakeLists.txt
index 2cd6ab2bbf042bced41957193a0269f477eb10d0..a8bbb4eb8081420ae0bbaf761bd27303c0d043cb 100644
--- a/paddle/contrib/inference/CMakeLists.txt
+++ b/paddle/contrib/inference/CMakeLists.txt
@@ -46,6 +46,10 @@ cc_library(paddle_inference_api
     SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
     DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
 
+cc_library(paddle_inference_api_shared SHARED
+    SRCS paddle_inference_api.cc paddle_inference_api_impl.cc
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+
 cc_test(test_paddle_inference_api
         SRCS test_paddle_inference_api.cc
         DEPS paddle_inference_api)
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index bc48fd3b479157d4aea390cd5f4dc61ea46dca4b..cd00b7de7338982308acfa1f1e8c38e010c6a43b 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -147,9 +147,9 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
                  "Input tensor type is not supported: ", in.type().name());
   memory::data_type out_type = in_type;
 
-  auto in_format = MKLDNNFormatForSize(in_tz.size(), in.format());
+  auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format());
   auto out_format =
-      MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
+      platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout));
 
   void* in_data = GetDataFromTensor(in, in_type);
 
diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h
index 67f91e4e48d3e11ed493c5e6943cb9071aff60c4..90bb206ec6b698bc23ad1a5c9609a25186ec6de8 100644
--- a/paddle/fluid/framework/data_layout_transform.h
+++ b/paddle/fluid/framework/data_layout_transform.h
@@ -62,12 +62,6 @@ inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) {
   return MKLDNNDataType::data_undef;
 }
 
-inline MKLDNNFormat MKLDNNFormatForSize(size_t dims_size,
-                                        MKLDNNFormat default_format) {
-  return (dims_size == 1
-              ? mkldnn::memory::format::x
-              : dims_size == 2 ? mkldnn::memory::format::nc : default_format);
-}
 #endif
 
 void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var,
diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc
index 5f15e20c78fd5a333523fe9e73542c037a161cae..82872224501709080ff02a13464d58543a0abda8 100644
--- a/paddle/fluid/framework/data_transform.cc
+++ b/paddle/fluid/framework/data_transform.cc
@@ -18,17 +18,21 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_layout_transform.h"
 #include "paddle/fluid/framework/data_type_transform.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/fluid/platform/mkldnn_helper.h"
+#endif
+
 namespace paddle {
 namespace framework {
 
-static void PassTensorData(Tensor* from, Tensor* to) {
+static void PassTensorData(Tensor *from, Tensor *to) {
   to->ShareDataWith(*from);
   *from = Tensor();
 }
 
-void DataTransform(const OpKernelType& expected_kernel_type,
-                   const OpKernelType& kernel_type_for_var,
-                   const Tensor& input_tensor, Tensor* output_tensor) {
+void TransformData(const OpKernelType &expected_kernel_type,
+                   const OpKernelType &kernel_type_for_var,
+                   const Tensor &input_tensor, Tensor *output_tensor) {
   bool transformed = false;
   Tensor in;
   in.ShareDataWith(input_tensor);
@@ -48,8 +52,8 @@ void DataTransform(const OpKernelType& expected_kernel_type,
         // Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
         // Just set layout/format. No real transform occur
 
-        auto out_format =
-            MKLDNNFormatForSize(in.dims().size(), ToMKLDNNFormat(lin));
+        auto out_format = platform::MKLDNNFormatForSize(in.dims().size(),
+                                                        ToMKLDNNFormat(lin));
 
         out.ShareDataWith(input_tensor);
         out.set_layout(DataLayout::kMKLDNN);
@@ -89,17 +93,17 @@ void DataTransform(const OpKernelType& expected_kernel_type,
   output_tensor->ShareDataWith(in);
 }
 
-void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
-                            Variable* out_var) {
+void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
+                         Variable *out_var) {
   if (in_var.IsType<LoDTensor>()) {
-    auto& in_lod_tensor = in_var.Get<LoDTensor>();
-    auto* tran_lod_tensor = out_var->GetMutable<LoDTensor>();
+    auto &in_lod_tensor = in_var.Get<LoDTensor>();
+    auto *tran_lod_tensor = out_var->GetMutable<LoDTensor>();
     tran_lod_tensor->set_lod(in_lod_tensor.lod());
     tran_lod_tensor->set_layout(in_lod_tensor.layout());
     tran_lod_tensor->ShareDataWith(tensor);
   } else if (in_var.IsType<SelectedRows>()) {
-    auto& in_selected_rows = in_var.Get<SelectedRows>();
-    auto* trans_selected_rows = out_var->GetMutable<SelectedRows>();
+    auto &in_selected_rows = in_var.Get<SelectedRows>();
+    auto *trans_selected_rows = out_var->GetMutable<SelectedRows>();
     trans_selected_rows->set_height(in_selected_rows.height());
     trans_selected_rows->set_rows(in_selected_rows.rows());
     trans_selected_rows->mutable_value()->ShareDataWith(tensor);
diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h
index dee5d8c7c1126013742460df1d94bb364220ad09..ae3ab051bda2e698801cc6fe6e3ddddf039f5385 100644
--- a/paddle/fluid/framework/data_transform.h
+++ b/paddle/fluid/framework/data_transform.h
@@ -30,12 +30,15 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-void DataTransform(const OpKernelType& expected_kernel_type,
-                   const OpKernelType& kernel_type_for_var,
-                   const Tensor& input_tensor, Tensor* out);
-
-void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
-                            Variable* out_var);
+void TransformData(const OpKernelType &expected_kernel_type,
+                   const OpKernelType &kernel_type_for_var,
+                   const Tensor &input_tensor, Tensor *out);
+
+/**
+ * Set OutVar from InVar, except the tensor is shared with `tensor`
+ */
+void SetTensorToVariable(const Variable &in_var, const Tensor &tensor,
+                         Variable *out_var);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h
index f51a184e7bae2283f335fe9462a77b9c5fb831a5..c59b232191c49ccb47bb9f51dcaf2fd9280fae19 100644
--- a/paddle/fluid/framework/op_kernel_type.h
+++ b/paddle/fluid/framework/op_kernel_type.h
@@ -97,7 +97,7 @@ inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
   return ret;
 }
 
-inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
+inline bool NeedTransform(const OpKernelType& l, const OpKernelType& r) {
   return (!platform::places_are_same_class(l.place_, r.place_)) ||
          (l.data_type_ != r.data_type_) ||
          NeedTransformLayout(l.data_layout_, r.data_layout_);
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index c1329b06d7e9bcd6604fed14cefa305339c5c4b8..aa1a42fc97310e4c149f2018f1706d8e42630a98 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -620,8 +620,6 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
         "There are no kernels which are registered in the %s operator.", type_);
   }
 
-  ExecutionContext ctx(*this, scope, *dev_ctx);
-
   OpKernelMap& kernels = kernels_iter->second;
 
   // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
@@ -631,7 +629,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
   //   Do selection
   // }
 
-  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
+  auto expected_kernel_key =
+      this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx));
   VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
 
   auto kernel_iter = kernels.find(expected_kernel_key);
@@ -640,56 +639,34 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
                  KernelTypeToString(expected_kernel_key));
   }
 
-  // do data transform
-  Scope& new_scope = scope.NewScope();
+  // do data transformScope &transfer_scope;
+  std::vector<std::string> transfered_inplace_vars;
+  auto* transfer_scope =
+      TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars);
 
-  std::vector<std::string> inplace_vars;
-  for (auto& var_name_item : this->Inputs()) {
-    for (auto& var_name : var_name_item.second) {
-      auto* var = scope.FindVar(var_name);
-      if (var && VarIsTensor(var)) {
-        auto* tensor_in = GetTensorFromVar(var);
-        if (tensor_in->IsInitialized()) {
-          auto kernel_type_for_var = this->GetKernelTypeForVar(
-              var_name_item.first, *tensor_in, expected_kernel_key);
-          if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) {
-            auto out_var_names = OutputVars(true);
-            if (std::find(out_var_names.begin(), out_var_names.end(),
-                          var_name) != out_var_names.end()) {
-              inplace_vars.push_back(var_name);
-            }
-            VLOG(3) << "Transform Variable " << var_name << " from "
-                    << kernel_type_for_var << " to " << expected_kernel_key;
-            auto* trans_var = new_scope.Var(var_name);
-            std::shared_ptr<Tensor> out(new Tensor);
-            DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in,
-                          out.get());
-            CopyVariableWithTensor(*var, *(out.get()), trans_var);
-          }
-        }
-      }
-    }
+  // exec scope is the scope that kernel actually executed on.
+  const Scope& exec_scope =
+      (transfer_scope == nullptr ? scope : *transfer_scope);
+
+  if (!(expected_kernel_key.place_ == dev_ctx->GetPlace())) {
+    dev_ctx = pool.Get(expected_kernel_key.place_);
   }
 
-  auto* new_dev_ctx = pool.Get(expected_kernel_key.place_);
-  kernel_iter->second->Compute(
-      ExecutionContext(*this, new_scope, *new_dev_ctx));
+  kernel_iter->second->Compute(ExecutionContext(*this, exec_scope, *dev_ctx));
 
-  for (auto& var_name : inplace_vars) {
-    VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
-    auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
-    auto* transformed_tensor = GetTensorFromVar(new_scope.FindVar(var_name));
-    original_tensor->ShareDataWith(*transformed_tensor);
+  if (!transfered_inplace_vars.empty()) {
+    // there is inplace variable has been transfered.
+    TransferInplaceVarsBack(scope, transfered_inplace_vars, *transfer_scope);
   }
 
   /*For profiling/benchmark only*/
   if (FLAGS_benchmark) {
-    new_dev_ctx->Wait();
+    dev_ctx->Wait();
   }
 
   if (FLAGS_check_nan_inf) {
     for (auto& vname : OutputVars(true)) {
-      auto* var = new_scope.FindVar(vname);
+      auto* var = exec_scope.FindVar(vname);
       if (var == nullptr) continue;
       if (var->IsType<framework::LoDTensor>()) {
         CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
@@ -697,6 +674,64 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
     }
   }
 }
+void OperatorWithKernel::TransferInplaceVarsBack(
+    const Scope& scope, const std::vector<std::string>& inplace_vars,
+    const Scope& transfer_scope) const {
+  for (auto& var_name : inplace_vars) {
+    VLOG(3) << "share inplace var " + var_name + " back to it's original scope";
+    auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name));
+    auto* transformed_tensor =
+        GetTensorFromVar(transfer_scope.FindVar(var_name));
+    original_tensor->ShareDataWith(*transformed_tensor);
+  }
+}
+
+Scope* OperatorWithKernel::TryTransferData(
+    const Scope& scope, const OpKernelType& expected_kernel_key,
+    std::vector<std::string>* transfered_inplace_vars) const {
+  Scope* new_scope = nullptr;
+  for (auto& var_name_item : Inputs()) {
+    for (auto& var_name : var_name_item.second) {
+      auto* var = scope.FindVar(var_name);
+      // Only tensor can be tranfer to another device.
+      if (var == nullptr || !VarIsTensor(var)) {
+        continue;
+      }
+
+      auto* tensor_in = GetTensorFromVar(var);
+      if (!tensor_in->IsInitialized()) {
+        continue;
+      }
+
+      auto kernel_type_for_var = GetKernelTypeForVar(
+          var_name_item.first, *tensor_in, expected_kernel_key);
+
+      if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
+        continue;
+      }
+
+      auto out_var_names = OutputVars(true);
+      if (std::find(out_var_names.begin(), out_var_names.end(), var_name) !=
+          out_var_names.end()) {
+        transfered_inplace_vars->emplace_back(var_name);
+      }
+
+      VLOG(3) << "Transform Variable " << var_name << " from "
+              << kernel_type_for_var << " to " << expected_kernel_key;
+
+      if (new_scope == nullptr) {
+        new_scope = &scope.NewScope();
+      }
+
+      auto* trans_var = new_scope->Var(var_name);
+      Tensor out;
+      TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out);
+      SetTensorToVariable(*var, out, trans_var);
+    }
+  }
+
+  return new_scope;
+}
 
 proto::VarType::Type OperatorWithKernel::IndicateDataType(
     const ExecutionContext& ctx) const {
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index b1d75d0d0ff3dccc67a1e833ccfe03a4cad8df39..1550d5df172f0599e1b42e7f1ccf51ac4dd1e0c3 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -384,6 +384,20 @@ class OperatorWithKernel : public OperatorBase {
   // same.
   proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const;
   void RunImpl(const Scope& scope, const platform::Place& place) const final;
+
+  /**
+   * Transfer data from scope to a transfered scope. If there is no data need to
+   * be tranfered, it returns nullptr.
+   *
+   * * transfered_inplace_vars is a output vector.
+   */
+  Scope* TryTransferData(
+      const Scope& scope, const OpKernelType& expected_kernel_key,
+      std::vector<std::string>* transfered_inplace_vars) const;
+
+  void TransferInplaceVarsBack(const Scope& scope,
+                               const std::vector<std::string>& inplace_vars,
+                               const Scope& exec_scope) const;
 };
 
 extern bool OpSupportGPU(const std::string& op_type);
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 751b10eeeed10828c08ada4173300c07f81c093e..b53a6f43fbd1f23e69d23ad0fcc54d5c25d352a3 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -253,9 +253,6 @@ void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes(
       t->set_lod(lod_tensors[j].lod());
     }
   }
-  for (auto &p : member_->places_) {
-    platform::DeviceContextPool::Instance().Get(p)->Wait();
-  }
 }
 
 ParallelExecutor::~ParallelExecutor() {
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index e5bc74755f46449296a153e8b330968e6d9f1e1d..f98011e896f4033ef210e0eb69f93ce7800a3cd6 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -69,7 +69,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
     PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream();
-    memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
+    if (platform::is_same_place(src_place, dst_place)) {
+      memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+                   stream);
+    } else {
+      if (platform::is_same_place(ctx_place, src_place)) {
+        memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+                     stream);
+        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
+      } else if (platform::is_same_place(ctx_place, dst_place)) {
+        platform::DeviceContextPool::Instance().Get(src.place())->Wait();
+        memory::Copy(dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+                     stream);
+      } else {
+        PADDLE_THROW("ctx is not belong to dst_gpu_place or src_gpu_place.");
+      }
+    }
   }
 #endif
 }
@@ -78,10 +93,10 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst) {
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   const platform::DeviceContext* dev_ctx;
-  if (platform::is_gpu_place(src.place())) {
-    dev_ctx = pool.Get(src.place());
-  } else {
+  if (platform::is_gpu_place(dst_place)) {
     dev_ctx = pool.Get(dst_place);
+  } else {
+    dev_ctx = pool.Get(src.place());
   }
   TensorCopy(src, dst_place, *dev_ctx, dst);
 }
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index dca279b69382b80e055f661cefe84b81326704b5..4457382ade37a12f5f3613fc4113fbf1f6f91124 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -23,10 +23,25 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+// NOTE(zcd): Because TensorCopy is an async operation, when the src_place
+// and dst_place are two different GPU, to ensure that the operation can
+// be carried out correctly, there is a src_ctx wait operation in TensorCopy.
+// If ctx_place and src_place are the same, src_ctx.Wait() is added
+// after memory::Copy; if ctx_place and dst_place are the same,
+// src_ctx.Wait() is added before memory::Copy.
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 const platform::DeviceContext& ctx, Tensor* dst);
+
+// NOTE(zcd): If the src.place() and dst_place are two different GPU,
+// the copy operation is carried out on the dst_place's stream. This is
+// very important, because TensorCopy is an async operator, and in most
+// case, once this copy operator returns, dst is to be used in dst_place's
+// stream, if this copy operation is carried out on the src_place's stream,
+// when dst is used in dst_place's stream the copy operation may be
+// not completed.
 void TensorCopy(const Tensor& src, const platform::Place& dst_place,
                 Tensor* dst);
+
 void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
                     Tensor* dst);
 
diff --git a/paddle/fluid/inference/analysis/README.md b/paddle/fluid/inference/analysis/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4c5de189cd1eab1ba3de0b2cdfd2294d139ceab2
--- /dev/null
+++ b/paddle/fluid/inference/analysis/README.md
@@ -0,0 +1,57 @@
+# Inference Analysis
+
+The `inference/analysis` module is used to analyze and optimize the inference program,
+it references some philosophy from `LLVM/analysis`, 
+and make the various optimization features be pluggable and co-exist in a pipeline.
+
+We borrowed some concepts from LLVM, such as
+
+- [Pass](./pass.h)es to implement optimization that traverse the inference program,
+- [DataFlowGraph](./data_flow_graph.h) to represent the data flow graph built from a program,
+- [PassManager](./pass_manager.h) to manage a sequence of `Pass`es over a graph.
+
+There are some other basic concepts here
+
+- [Node](./node.h), the node in a `DataFlowGraph`,
+  - `Function`, the Operator in Fluid,
+  - `Value`, the Variable in Fluid;
+- [Argument](./argument.h), the argument that treat as the input and output of all `Pass`es in the pipeline,
+
+## How it works
+
+The `inference/analysis` module make all the passes in a pipeline, and works in such way:
+
+1. Build a `DataFlowGraph` from a Fluid inference ProgramDesc,
+2. Call the middle passes one by one, the same `DataFlowGraph` is passed across all the passes,
+3. Transform a new ProgramDesc from the modified `DataFlowGraph`.
+
+The new optimization features can be added as an independent `Pass` and controlled by gflags,
+each pass will generate unified debug information or visualization for better debugging.
+
+## Supported Passes
+
+### `FluidToDataFlowGraphPass`
+Transform the fluid `ProgramDesc` to a `DataFlowGraph` to give an abstract representation for all the middle passes, 
+this should be the first pass of the pipeline.
+
+### `DataFlowGraphToFluidPass`
+Generate a final `ProgramDesc` from a data flow graph, this should be the last pass of the pipeline.
+
+### `TensorRTSubgraphNodeMarkPass`
+Mark the `Node` that are supported by TensorRT, 
+this pass will generate a visualization file which can be used for debugging.
+
+### `TensorRTSubGraphPass`
+Split the sub-graph that are can be accelerated by TensorRT.
+
+### `DFG_GraphvizDrawPass`
+This pass is just for debug, it will visualize the `DataFlowGraph` using the [graphviz](http://www.graphviz.org) tool.
+
+It can be used as a helper class that draws the modified graph after each pass.
+
+## Utilities
+
+There is some helper function/class for analysis.
+
+- [dot.h](./dot.h) give a easy to use interface for generating `DOT` codes,
+- [graph_traits.h](./graph_traits.h) contains the graph traversal algorithms, it uses `iterator` to make the algorithms easy to share across different passes.
diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc
index 5d85530969c5bec1c84d5f5b0d2626431a9e1c63..a4625f008c15300b88ef0bce71cd7d8aa473c9a8 100644
--- a/paddle/fluid/inference/analysis/analyzer.cc
+++ b/paddle/fluid/inference/analysis/analyzer.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/analyzer.h"
+#include <string>
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
@@ -79,4 +80,4 @@ void Analyzer::Run(Argument* argument) {
 
 }  // namespace analysis
 }  // namespace inference
-}  // namespace paddle
\ No newline at end of file
+}  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index f290a3777d5be2ef64667d8c17ec59adddc3ef1b..e9e14fb1947da059c8d126d3da182ce446f6421e 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#pragma once
+
 /*
  * This file contains Analyzer, an class that exposed as a library that analyze
  * and optimize
diff --git a/paddle/fluid/inference/analysis/data_flow_graph.h b/paddle/fluid/inference/analysis/data_flow_graph.h
index 30c60661f3492034248e164a70a682bae3819d23..a4fefc83e0c551d52bec87299bcbc966e7a2dbf7 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -138,7 +138,7 @@ struct GraphTraits<DataFlowGraph> {
 // sub-graph is the inputs nodes and output nodes that doesn't inside the
 // sub-graph.
 static std::pair<std::vector<Node *>, std::vector<Node *>>
-ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {
+ExtractInputAndOutputOfSubGraph(std::vector<Node *> &graph) {  // NOLINT
   std::unordered_set<Node *> nodes(graph.begin(), graph.end());
   std::unordered_set<Node *> inputs;
   std::unordered_set<Node *> outputs;
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
index e74efd17b834db1d0314c8b7082f3e9c15d6eda3..29ca008123addf07959b965a4b54bf55b18c401d 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/proto_desc.h"
@@ -150,13 +151,14 @@ namespace {
 class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
  public:
   using Config = DFG_GraphvizDrawPass::Config;
-  DFG_DebuggerPass(const Config& config) : DFG_GraphvizDrawPass(config) {}
+  explicit DFG_DebuggerPass(const Config& config)
+      : DFG_GraphvizDrawPass(config) {}
 
   std::string repr() const override { return "dfg-to-fluid-debuger-pass"; }
 
   bool Finalize() override { return true; }
 };
-}
+}  // namespace
 
 Pass* DataFlowGraphToFluidPass::CreateGraphvizDebugerPass() const {
   return new DFG_DebuggerPass(DFG_GraphvizDrawPass::Config(
diff --git a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
index 1726e056ed37e2e5fbe2042851ca9bd188806bac..edc84b02ed20991e3e7c6c437d2b1fac169bae03 100644
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h
@@ -19,6 +19,7 @@
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/inference/analysis/data_flow_graph.h"
 #include "paddle/fluid/inference/analysis/pass.h"
diff --git a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
index b064782586f6243353eda67ac8db040509716b20..17445ab4407a159ca11345bc9a9226b3ad0044f0 100644
--- a/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
+++ b/paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h
@@ -46,7 +46,7 @@ class DFG_GraphvizDrawPass : public DataFlowGraphPass {
     const bool display_deleted_node;
   };
 
-  DFG_GraphvizDrawPass(const Config &config) : config_(config) {}
+  explicit DFG_GraphvizDrawPass(const Config &config) : config_(config) {}
 
   bool Initialize(Argument *argument) override { return true; }
   void Run(DataFlowGraph *graph) override;
diff --git a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
index 5d7eb43b7cbd7bc45b5f0c940bf80ad72348e1b9..e918622d74cfb11d83090555be2a768cc14e7742 100644
--- a/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
+++ b/paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#include "analyzer.h"
+#include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
 
@@ -88,7 +88,8 @@ namespace {
 class DFG_DebuggerPass : public DFG_GraphvizDrawPass {
  public:
   using Config = DFG_GraphvizDrawPass::Config;
-  DFG_DebuggerPass(const Config &config) : DFG_GraphvizDrawPass(config) {}
+  explicit DFG_DebuggerPass(const Config &config)
+      : DFG_GraphvizDrawPass(config) {}
   std::string repr() const override { return "fluid-to-dfg-debuger-pass"; }
   bool Finalize() override { return true; }
 };
diff --git a/paddle/fluid/inference/analysis/pass_manager_tester.cc b/paddle/fluid/inference/analysis/pass_manager_tester.cc
index 6caba8f04237e014c5ddf1a3a077bcbadb0ddb71..dac1c509d728114bd24a2ea1150c407646026fd4 100644
--- a/paddle/fluid/inference/analysis/pass_manager_tester.cc
+++ b/paddle/fluid/inference/analysis/pass_manager_tester.cc
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/inference/analysis/pass_manager.h"
+#include <gtest/gtest.h>
+
 #include "paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/fluid_to_data_flow_graph_pass.h"
+#include "paddle/fluid/inference/analysis/pass_manager.h"
 #include "paddle/fluid/inference/analysis/ut_helper.h"
 
-#include <gtest/gtest.h>
-
 namespace paddle {
 namespace inference {
 namespace analysis {
diff --git a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
index 5ad092a9ed201e5e6ab7770bcfd9ddf871779c12..f736e385c11add152dc9ab9485bf1de40f80b2f3 100644
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.cc
@@ -12,10 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
+#include <string>
+
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/analysis/dfg_graphviz_draw_pass.h"
 #include "paddle/fluid/inference/analysis/node_attr_flags.h"
+#include "paddle/fluid/inference/analysis/tensorrt_subgraph_node_mark_pass.h"
 
 namespace paddle {
 namespace inference {
@@ -29,7 +31,7 @@ void TensorRTSubgraphNodeMarkPass::Run(DataFlowGraph *graph) {
 
 class DfgDebuggerPass : public DFG_GraphvizDrawPass {
  public:
-  DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
+  explicit DfgDebuggerPass(const DFG_GraphvizDrawPass::Config &config)
       : DFG_GraphvizDrawPass(config) {}
 
   std::string repr() const override {
diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc
index 0c74f62de5c6f5d432ee928945db6dcf385ca209..bd98ed81899440a46415d30b6d74fec2dac4c155 100644
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
@@ -20,6 +20,12 @@ limitations under the License. */
 #include "paddle/fluid/memory/detail/system_allocator.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
+DEFINE_bool(init_allocated_mem, false,
+            "It is a mistake that the values of the memory allocated by "
+            "BuddyAllocator are always zeroed in some op's implementation. "
+            "To find this error in time, we use init_allocated_mem to indicate "
+            "that initializing the allocated memory with a small value "
+            "during unit testing.");
 DECLARE_double(fraction_of_gpu_memory_to_use);
 
 namespace paddle {
@@ -41,6 +47,9 @@ template <>
 void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
   VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
   void* p = GetCPUBuddyAllocator()->Alloc(size);
+  if (FLAGS_init_allocated_mem) {
+    memset(p, 0xEF, size);
+  }
   VLOG(10) << "  pointer=" << p;
   return p;
 }
@@ -104,6 +113,9 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
     LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
     platform::SetDeviceId(cur_dev);
   }
+  if (FLAGS_init_allocated_mem) {
+    cudaMemset(ptr, 0xEF, size);
+  }
   return ptr;
 }
 
@@ -137,6 +149,9 @@ void* Alloc<platform::CUDAPinnedPlace>(platform::CUDAPinnedPlace place,
     LOG(WARNING) << "cudaMallocHost Cannot allocate " << size
                  << " bytes in CUDAPinnedPlace";
   }
+  if (FLAGS_init_allocated_mem) {
+    memset(ptr, 0xEF, size);
+  }
   return ptr;
 }
 
diff --git a/paddle/fluid/operators/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
index 6ecb43c49c30f9da2a273d506f7b85c0a4f5fa2c..9ab2179b5fe689762704039c5f67dd080e530aa5 100644
--- a/paddle/fluid/operators/batch_norm_mkldnn_op.cc
+++ b/paddle/fluid/operators/batch_norm_mkldnn_op.cc
@@ -115,9 +115,12 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
     if (fuse_with_relu) flags |= mkldnn::fuse_bn_relu;
 
     // create mkldnn memory from input x tensor
-    auto src_memory =
-        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
-               to_void_cast(x_data));
+    mkldnn::memory::format input_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), x->format());
+
+    auto src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
+        to_void_cast(x_data));
 
     // create primitive descriptor for batch norm forward
     using bn_fwd_types = bn_type_traits<mkldnn::batch_normalization_forward>;
@@ -251,15 +254,21 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
     using bn_bwd_types = bn_type_traits<mkldnn::batch_normalization_backward>;
 
     // create mkldnn memory from input diff_y tensor
-    auto user_diff_dst_memory =
-        memory({{{diff_dst_tz}, memory::data_type::f32, diff_y->format()},
-                mkldnn_engine},
-               to_void_cast(diff_y_data));
+
+    mkldnn::memory::format dst_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), diff_y->format());
+
+    auto user_diff_dst_memory = memory(
+        {{{diff_dst_tz}, memory::data_type::f32, dst_format}, mkldnn_engine},
+        to_void_cast(diff_y_data));
 
     // create mkldnn memory from input x tensor
-    auto src_memory =
-        memory({{{src_tz}, memory::data_type::f32, x->format()}, mkldnn_engine},
-               to_void_cast(x_data));
+    mkldnn::memory::format input_format =
+        platform::MKLDNNFormatForSize(src_tz.size(), x->format());
+
+    auto src_memory = memory(
+        {{{src_tz}, memory::data_type::f32, input_format}, mkldnn_engine},
+        to_void_cast(x_data));
 
     // for diff_dst, try to use same format as dst in forward pass
     auto diff_dst_pd = batch_norm_fwd_pd.get()->dst_primitive_desc();
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index ed99932546446eb877c9701de15e2d37d29b5f88..a6cccc31219104767ac38bdebeb1d4c0e8c2ac01 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -228,7 +228,7 @@ class MKLDNNHandler {
       return dstr;
     };
     return dims2str(operand_dims) + suffix;
-  };
+  }
 
  protected:
   const MKLDNNDeviceContext& dev_ctx_;
@@ -237,5 +237,15 @@ class MKLDNNHandler {
   bool is_reusing_;
 };
 
+inline mkldnn::memory::format MKLDNNFormatForSize(
+    size_t dims_size, mkldnn::memory::format data_format) {
+  if (dims_size == 1) {
+    return mkldnn::memory::format::x;
+  } else if (dims_size == 2) {
+    return mkldnn::memory::format::nc;
+  }
+  return data_format;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 45af83708ea63fc1b6aa86f1e8423bb44b7388a6..3034c1a0875a71421bcba172c16ee32d809df152 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -118,7 +118,8 @@ def __bootstrap__():
 
     read_env_flags = [
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
-        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb'
+        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
+        'init_allocated_mem'
     ]
     if core.is_compiled_with_cuda():
         read_env_flags += [
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 21f2037ad408b0a92718c0ea2bae5e8bf563c665..cddf00765f4894126988c794763c34629449e8e6 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -18,6 +18,8 @@ import unittest
 import paddle.fluid as fluid
 import time
 import numpy as np
+import math
+import sys
 
 __all__ = ['TestParallelExecutorBase']
 
@@ -93,6 +95,12 @@ class TestParallelExecutorBase(unittest.TestCase):
                 print "%.4f Instance per second" % (
                     (batch_size * iter + 2) / (end - begin))
 
+            avg_last_loss_val = np.array(last_loss).mean()
+            avg_first_loss_val = np.array(first_loss).mean()
+            if math.isnan(float(avg_last_loss_val)) or math.isnan(
+                    float(avg_first_loss_val)):
+                sys.exit("got NaN loss, training failed.")
+
             print first_loss, last_loss
             # self.assertGreater(first_loss[0], last_loss[0])
             return first_loss, last_loss
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index 252793944462244539084a288e5259f216359650..9a2733927d38f1a2b1af92fcc12f036158b4d06f 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -16,6 +16,8 @@ import paddle.fluid as fluid
 import numpy as np
 import unittest
 import os
+import sys
+import math
 
 
 def simple_fc_net():
@@ -73,6 +75,14 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
 
                 train_loss, = train_exe.run([loss.name], feed=feed_dict)
 
+                avg_test_loss_val = np.array(test_loss).mean()
+                if math.isnan(float(avg_test_loss_val)):
+                    sys.exit("got NaN loss, testing failed.")
+
+                avg_train_loss_val = np.array(train_loss).mean()
+                if math.isnan(float(avg_train_loss_val)):
+                    sys.exit("got NaN loss, training failed.")
+
                 self.assertTrue(
                     np.allclose(
                         train_loss, test_loss, atol=1e-8),