diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py
index f8aed5a5e06c5e29dbdfb5db9f2ea0344c7eed6d..6b22f8f520e3d9c6c89d41a7455a6f9ebbad6d80 100644
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -85,8 +85,7 @@ def dist_transpile(trainer_id, args):
         trainer_id,
         pservers=pserver_endpoints,
         trainers=trainers,
-        sync_mode=not args.async_mode,
-        slice_var_up=not args.no_split_var)
+        sync_mode=not args.async_mode)
     if training_role == "PSERVER":
         pserver_program = t.get_pserver_program(current_endpoint)
         pserver_startup_program = t.get_startup_program(current_endpoint,
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
index 82437a84248fece843c3659c9422d9b579b5066f..7fb67afbe15a5a019c978092d5ba3a4a0f66d996 100644
--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -50,7 +50,7 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
     BUILD_IN_SOURCE 1
-    PATCH_COMMAND git apply ${PADDLE_SOURCE_DIR}/patches/grpc/fix_too_early_destory.patch
+    PATCH_COMMAND cp ${PADDLE_SOURCE_DIR}/patches/grpc/grpc_library.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/grpc_library.h && cp ${PADDLE_SOURCE_DIR}/patches/grpc/completion_queue.h ${GRPC_SOURCES_DIR}/src/extern_grpc/include/grpcpp/impl/codegen/completion_queue.h
     # NOTE(yuyang18):
     # Disable -Werror, otherwise the compile will fail in MacOS.
     # It seems that we cannot configure that by make command.
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index eafb11b6f21e226fc68556a78d675dea94080140..07bab994d354df834d0667c69f307b2d7684fb22 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -263,7 +263,7 @@ function(cc_test TARGET_NAME)
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     if (${cc_test_SERIAL})
-        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     endif()
   endif()
@@ -328,7 +328,7 @@ function(nv_test TARGET_NAME)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_test(${TARGET_NAME} ${TARGET_NAME})
     if (nv_test_SERIAL)
-        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     endif()
   endif()
diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index e2c58cd56055455e7fedc598ca8f56183d4b51dc..aeb081e76e5bc5b9d3d81ce625195c800174ab6c 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -148,18 +148,11 @@ if (WITH_ANAKIN AND WITH_GPU)
      list(APPEND inference_deps anakin_inference_lib)
 endif()
 
-copy(inference_api_lib DEPS paddle_inference_api paddle_inference_api_shared
-  SRCS ${src_dir}/${module}/paddle_inference_api.h 
-       ${src_dir}/${module}/demo_ci
-       ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/libpaddle_inference_api*
-  DSTS ${dst_dir}/inference ${dst_dir}/inference ${dst_dir}/inference
-)
-list(APPEND inference_deps inference_api_lib)
-
 set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
   SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module}
+       ${src_dir}/${module}/api/paddle_inference_api.h ${src_dir}/${module}/api/demo_ci
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )
 
 set(module "platform")
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 93ec047c8012e41cc9dfb651e8de2b4749f93299..df2a7bf90d9be480c514d9dc70571c7f56fd8db2 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -8,9 +8,9 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
 cc_library(data_type SRCS data_type.cc DEPS framework_proto ddim device_context)
 if(WITH_GPU)
-  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
 endif()
 
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@@ -110,7 +110,7 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
 
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-      
+
 # cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 2f2869b1634256c3745e733bb1b99bfe4ddf8924..b7b67916205689753bc3f9fe844945ee3e78eeb4 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -715,6 +715,7 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
       result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
       node->Op()->Type(), places_[op_dev_id]));
 
+  // TODO(panyx0718): This might not be needed anymore.
   if (node->Op()->Type() == "send_barrier") {
     ConnectOp(result, result->Get<GraphOps>("ops").back().get(), "send");
   } else if (node->Op()->Type() == "recv") {
diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc
index 740acfafb7594d8d9f3ca5439323ce76c5ed271a..f870fb2b9cf805aba84d6f4573b0574ff361e71c 100644
--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -24,6 +24,68 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+std::vector<std::string> FindDistTrainSendVars(
+    const std::vector<ir::Node *> &nodes) {
+  std::vector<std::string> send_vars;
+  // since parameters are all in block 0,
+  // it's enough to only scan send ops in block 0
+  for (auto &node : nodes) {
+    auto op_vars = node->Op()->InputArgumentNames();
+    send_vars.reserve(send_vars.size() +
+                      std::distance(op_vars.begin(), op_vars.end()));
+    send_vars.insert(send_vars.end(), op_vars.begin(), op_vars.end());
+  }
+  return send_vars;
+}
+
+std::vector<std::string> FindDistTrainRecvVars(
+    const std::vector<ir::Node *> &nodes) {
+  std::vector<std::string> recv_vars;
+  for (auto &node : nodes) {
+    auto op_vars = node->Op()->OutputArgumentNames();
+    recv_vars.reserve(recv_vars.size() +
+                      std::distance(op_vars.begin(), op_vars.end()));
+    recv_vars.insert(recv_vars.end(), op_vars.begin(), op_vars.end());
+  }
+  return recv_vars;
+}
+
+bool IsDistTrainOp(ir::Node *node, const std::vector<std::string> &send_vars,
+                   const std::vector<std::string> &recv_vars) {
+  if (send_vars.size() == 0 || recv_vars.size() == 0) {
+    return false;
+  }
+
+  /**
+   * Check any of opvars contains `.block` and in sendvars
+   */
+  auto checker = [](const std::vector<std::string> &opvars,
+                    const std::vector<std::string> &rpc_vars) -> bool {
+    for (auto &var : opvars) {
+      // a variable name with the suffix `.block` means it's a splited
+      // variable by (DistributeTranspiler)
+      // [python/paddle/fluid/transpiler/distribute_transpiler.py]
+      if (var.find(".block") != std::string::npos &&
+          std::find(rpc_vars.begin(), rpc_vars.end(), var) != rpc_vars.end()) {
+        return true;
+      }
+    }
+    return false;
+  };
+
+  std::vector<std::string> input_var_names;
+  std::vector<std::string> output_var_names;
+  for (ir::Node *input : node->inputs) {
+    input_var_names.push_back(input->Name());
+  }
+  for (ir::Node *output : node->outputs) {
+    output_var_names.push_back(output->Name());
+  }
+
+  return checker(output_var_names, send_vars) ||
+         checker(input_var_names, recv_vars);
+}
+
 Graph::Graph(const ProgramDesc &program) : program_(program) {
   VLOG(3) << "block in program:" << program_.Size();
   std::unordered_map<std::string, VarDesc *> all_vars;
@@ -61,6 +123,64 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
       var->inputs.push_back(node);
     }
   }
+
+  std::vector<ir::Node *> send_ops;
+  ir::Node *send_bar = nullptr;
+  std::vector<ir::Node *> recv_ops;
+  ir::Node *fetch_bar = nullptr;
+  for (ir::Node *node : Nodes()) {
+    if (node->Name() == "send") {
+      send_ops.push_back(node);
+    } else if (node->Name() == "send_barrier") {
+      PADDLE_ENFORCE(!send_bar, "only has one send barrier");
+      send_bar = node;
+    } else if (node->Name() == "recv") {
+      recv_ops.push_back(node);
+    } else if (node->Name() == "fetch_barrier") {
+      PADDLE_ENFORCE(!fetch_bar, "only has one fetch barrier");
+      fetch_bar = node;
+    }
+  }
+  if (send_bar) {
+    for (ir::Node *send : send_ops) {
+      ir::Node *dep_var = CreateControlDepVar();
+      send->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(send);
+      send_bar->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(send_bar);
+    }
+    for (ir::Node *recv : recv_ops) {
+      ir::Node *dep_var = CreateControlDepVar();
+      recv->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(recv);
+      send_bar->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(send_bar);
+    }
+  }
+  if (fetch_bar) {
+    for (ir::Node *recv : recv_ops) {
+      ir::Node *dep_var = CreateControlDepVar();
+      recv->outputs.push_back(dep_var);
+      dep_var->inputs.push_back(recv);
+      fetch_bar->inputs.push_back(dep_var);
+      dep_var->outputs.push_back(fetch_bar);
+    }
+  }
+
+  std::vector<std::string> send_vars = FindDistTrainSendVars(send_ops);
+  std::vector<std::string> recv_vars = FindDistTrainRecvVars(recv_ops);
+  for (ir::Node *node : Nodes()) {
+    if (IsDistTrainOp(node, send_vars, recv_vars)) {
+      if (fetch_bar && node->Name() == "concat") {
+        ir::Node *dep_var = CreateControlDepVar();
+        fetch_bar->outputs.push_back(dep_var);
+        dep_var->inputs.push_back(fetch_bar);
+        node->inputs.push_back(dep_var);
+        dep_var->outputs.push_back(node);
+      }
+    }
+  }
+
   /**
    * We only handle write after read(WAR), since it should not have a write
    * after write in program. If there are write after write operators, we need
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index d1dc5fcd97b77fb7707c7d48f6eaeef140d3f306..7c1c29fd9a81c558f7fd05abf52cd0a6dd522190 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -679,6 +679,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope,
       if (var == nullptr) continue;
       if (var->IsType<framework::LoDTensor>()) {
         CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+      } else if (var->IsType<framework::SelectedRows>()) {
+        CheckTensorNANOrInf(vname, var->Get<framework::SelectedRows>().value());
       }
     }
   }
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index 16c9c81258a9fdb7730b9b3e34be990798c91639..ba7645aa02413f28a648f35e381da7824604a455 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -14,8 +14,15 @@ cc_library(paddle_fluid_api
 
 get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
 
+# paddle_fluid_origin exclude inference api interface
+cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api)
+
+if(NOT APPLE)
+  add_subdirectory(api)
+endif()
+
 # Create static library
-cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api)
+cc_library(paddle_fluid DEPS ${fluid_modules} paddle_fluid_api paddle_inference_api)
 if(NOT APPLE)
   # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac.
   set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym")
@@ -24,7 +31,7 @@ endif()
 
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
-    SRCS io.cc
+    SRCS io.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/api_impl.cc
     DEPS ${fluid_modules} paddle_fluid_api)
 
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
@@ -32,12 +39,21 @@ if(NOT APPLE)
   # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.
   set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.map")
   set_target_properties(paddle_fluid_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  # check symbol hidden
+  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
+    "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
+    " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_fluid.so\" RESULT_VARIABLE symbol_res)\n"
+    "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
+    "  message(FATAL_ERROR \"Check symbol failed.\")\n"
+    "endif()\n")
+  add_custom_command(
+    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
+    COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
+    DEPENDS paddle_fluid_shared)
+  add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
 endif()
 
 if(WITH_TESTING)
   # both tests/book and analysis depends the models that generated by python/paddle/fluid/tests/book
   add_subdirectory(tests/book)
 endif()
-if(NOT APPLE)
-  add_subdirectory(api)
-endif()
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 7e4b3e9a2dcae6b34d1af089bc7da55e09315c58..3e60a61793339990648737c3d549d46cc5f5a887 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -42,35 +42,8 @@ function(inference_api_test TARGET_NAME)
     endif(WITH_TESTING)
 endfunction(inference_api_test)
 
-cc_library(paddle_inference_api
-    SRCS api.cc api_impl.cc
-    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
-if(NOT APPLE)
-  set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/api.sym")
-  set_target_properties(paddle_inference_api PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-endif()
-
-# Here the shared library doesn't depend on other fluid libraries, or double free will occur.
-cc_library(paddle_inference_api_shared SHARED
-    SRCS api.cc api_impl.cc)
-add_dependencies(paddle_inference_api_shared ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
-set_target_properties(paddle_inference_api_shared PROPERTIES OUTPUT_NAME paddle_inference_api)
+cc_library(paddle_inference_api SRCS api.cc api_impl.cc DEPS lod_tensor)
 
-if(NOT APPLE)
-  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/api.map")
-  set_target_properties(paddle_inference_api_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
-  FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake
-    "execute_process(COMMAND bash -c \"${CMAKE_CURRENT_SOURCE_DIR}/check_symbol.sh"
-    " ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_inference_api.so\" RESULT_VARIABLE symbol_res)\n"
-    "if(NOT \"\${symbol_res}\" STREQUAL \"0\")\n"
-    "  message(FATAL_ERROR \"Check symbol failed.\")\n"
-    "endif()\n")
-  add_custom_command(
-    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol"
-    COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/check_symbol.cmake"
-    DEPENDS paddle_inference_api_shared)
-  add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol")
-endif()
 
 cc_test(test_paddle_inference_api
         SRCS api_tester.cc
diff --git a/paddle/fluid/inference/api/api.map b/paddle/fluid/inference/api/api.map
deleted file mode 100644
index 5203784dc1fcb672eb6a26d9dfd3ffbe02e08038..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/api.map
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-	global:
-		*paddle*;
-	local:
-		*;
-};
diff --git a/paddle/fluid/inference/api/api.sym b/paddle/fluid/inference/api/api.sym
deleted file mode 100644
index ef2a04d788aa86b7f6a61c4af479d70d1137f374..0000000000000000000000000000000000000000
--- a/paddle/fluid/inference/api/api.sym
+++ /dev/null
@@ -1 +0,0 @@
-*paddle*
diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
index 7f9bb4b33e97b5ea37e9216b00ce0c82ca3ce230..ba73a6eaa6fc885b6b56c2d6330394e2f9c384bf 100644
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@@ -55,11 +55,9 @@ endif()
 # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a
 if(WITH_STATIC_LIB)
   set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.a
       ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.a)
 else()
   set(DEPS
-      ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_inference_api.so
       ${PADDLE_LIB}/paddle/fluid/inference/libpaddle_fluid.so)
 endif()
 set(EXTERNAL_LIB "-lrt -ldl -lpthread")
diff --git a/paddle/fluid/inference/api/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh
similarity index 64%
rename from paddle/fluid/inference/api/check_symbol.sh
rename to paddle/fluid/inference/check_symbol.sh
index 6547ca1413649968e8a0be146915e07192a99898..12b7b3e7e5982f193e48596b867953fc93841b61 100755
--- a/paddle/fluid/inference/api/check_symbol.sh
+++ b/paddle/fluid/inference/check_symbol.sh
@@ -3,8 +3,8 @@
 lib=$1
 if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi
 
-num_paddle_syms=$(nm -D --defined-only ${lib} | grep paddle | wc -l)
-num_google_syms=$(nm -D --defined-only ${lib} | grep google | wc -l)
+num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l)
+num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep T | wc -l)
 
 if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi
 if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index 3e5e6e7fc3acfb60bf1cf884598e43e021a4b69e..7f5e5a622b9e8578f9b39aa9024d653d7b17138f 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -1,7 +1,7 @@
 # Add TRT tests
 nv_library(tensorrt_converter
   SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc
-  DEPS tensorrt_engine mul_op)
+  DEPS tensorrt_engine operator scope framework_proto op_registry)
 
 nv_test(test_op_converter SRCS test_op_converter.cc DEPS
   ${FLUID_CORE_MODULES} tensorrt_engine tensorrt_converter)
diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
index 3c342957360ad4192d838147bf37e84d233c2629..514eb659a8da73b6e56b5d17148ec0cb2aeaa135 100644
--- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/mul_op.cc
@@ -49,5 +49,4 @@ class MulOpConverter : public OpConverter {
 }  // namespace inference
 }  // namespace paddle
 
-USE_OP(mul);
 REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter);
diff --git a/paddle/fluid/inference/tests/book/CMakeLists.txt b/paddle/fluid/inference/tests/book/CMakeLists.txt
index 2fa5a9540ba1311c7f87e6675a53044b23dd8276..017fc4cd7b11c150cb941fffca2606a4d707330f 100644
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -17,7 +17,7 @@ function(inference_test TARGET_NAME)
     string(REGEX REPLACE "^_$" "" arg "${arg}")
     cc_test(test_inference_${TARGET_NAME}${arg}
         SRCS test_inference_${TARGET_NAME}.cc
-        DEPS paddle_fluid
+        DEPS paddle_fluid_origin
         ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
     set_tests_properties(test_inference_${TARGET_NAME}${arg}
         PROPERTIES DEPENDS test_${TARGET_NAME})
@@ -43,6 +43,6 @@ inference_test(word2vec)
 # TODO(TJ): clean me up
 cc_test(test_inference_nlp
   SRCS test_inference_nlp.cc
-  DEPS paddle_fluid
+  DEPS paddle_fluid_origin
   ARGS
   --model_path=${PADDLE_BINARY_DIR}/python/paddle/fluid/tests/book/recognize_digits_mlp.inference.model)
diff --git a/paddle/fluid/operators/.flatten_op.cc.swp b/paddle/fluid/operators/.flatten_op.cc.swp
new file mode 100644
index 0000000000000000000000000000000000000000..3395b6074b6a4c684a97674af702ca8b91dc85e9
Binary files /dev/null and b/paddle/fluid/operators/.flatten_op.cc.swp differ
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 2da52dbf48c870353a06efe29675f3b225aefa1d..4c3b8ec78190723598a56f7633764f10dd5047f3 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -271,6 +271,8 @@ op_library(parallel_do_op DEPS executor)
 op_library(unsqueeze_op DEPS reshape_op)
 op_library(squeeze_op DEPS reshape_op)
 op_library(extract_rows_op DEPS memory)
+op_library(flatten_op DEPS reshape_op)
+
 
 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc
index 265f964ddc682868c64669744b130aebbbf86692..b4f60c9ff9a41d5cb7dbe4e7a7694a84bab8e940 100644
--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -49,6 +49,7 @@ void GRPCClient::SendComplete() {
 }
 
 GRPCClient::~GRPCClient() {
+  stopped_ = true;
   Wait();
   cq_.Shutdown();
   {
@@ -275,7 +276,7 @@ void GRPCClient::Proceed() {
   void* tag = nullptr;
   bool ok = false;
 
-  while (cq_.Next(&tag, &ok)) {
+  while (!stopped_ && cq_.Next(&tag, &ok)) {
     BaseProcessor* c = static_cast<BaseProcessor*>(tag);
     GPR_ASSERT(ok);
     PADDLE_ENFORCE(c);
diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h
index 8351d825f817437e1b3691e916952dd9a86af491..0c95ffeb5ce7e1586c5968fb122acd12c0c0196e 100644
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -174,7 +174,7 @@ class CheckpointNotifyProcessor : public BaseProcessor {
 
 class GRPCClient : public RPCClient {
  public:
-  GRPCClient() : ok_(true), completed_(false) {}
+  GRPCClient() : ok_(true), completed_(false), stopped_(false) {}
   virtual ~GRPCClient();
 
   bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx,
@@ -237,6 +237,8 @@ class GRPCClient : public RPCClient {
   // mutex for sending complete message only once
   std::mutex completed_mutex_;
   bool completed_;
+
+  volatile bool stopped_;
 };
 
 }  // namespace distributed
diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fdda01381e117cecffb2a05f8399f3ad82a46339
--- /dev/null
+++ b/paddle/fluid/operators/flatten_op.cc
@@ -0,0 +1,169 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class FlattenOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input (X) of Flatten op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output (Output) of Flatten op should not be null.");
+    const auto &axis = ctx->Attrs().Get<int>("axis");
+    const auto &in_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(axis >= 0, "The axis should be greater than or equal to 0.");
+    PADDLE_ENFORCE(
+        axis <= in_dims.size(),
+        "The axis should be less than or equal to input tensor's rank.");
+
+    const auto &out_dims = GetOutputShape(axis, in_dims);
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    if (in_dims[0] == out_dims[0]) {
+      // Only pass LoD when the first dimension of output and Input(X)
+      // are the same.
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+
+  static std::vector<int32_t> GetOutputShape(const int axis,
+                                             const framework::DDim &in_dims) {
+    int64_t outer = 1, inner = 1;
+    for (int i = 0; i < in_dims.size(); ++i) {
+      if (i < axis) {
+        outer *= in_dims[i];
+      } else {
+        inner *= in_dims[i];
+      }
+    }
+    std::vector<int32_t> out_shape(2);
+    out_shape[0] = outer;
+    out_shape[1] = inner;
+    return out_shape;
+  }
+};
+
+class FlattenOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &axis = Attr<int>("axis");
+    auto in_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    const auto &out_dims = FlattenOpInferShape::GetOutputShape(axis, in_dims);
+
+    framework::AttributeMap attrs;
+    attrs["shape"] = out_dims;
+    attrs["inplace"] = false;
+    // Invoke Reshape Op
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}}, attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+class FlattenOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) A tensor of rank >= axis.");
+    AddOutput("Out",
+              "A 2D tensor is reshaped input tensor. The input dimensions"
+              "up to axis are flattened to the outer dimension of the output"
+              "and the remaining input dimensions are flattened into the inner"
+              "dimension of the output.");
+    AddAttr<int>("axis",
+                 "(int)"
+                 "Indicate up to which input dimensions (exclusive) should be"
+                 "flattened to the outer dimension of the output. The value"
+                 "for axis must be in the range [0, R], where R is the rank of"
+                 "the input tensor. When axis = 0, the shape of the output"
+                 "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the"
+                 "input tensor is (d_0, d_1, ... d_n).")
+        .SetDefault(1);
+    AddComment(R"DOC(
+Flatten Operator
+
+Flattens the input tensor into a 2D matrix.
+
+Examples:
+Case 1:
+  Given
+    X.shape = (3, 100, 100, 4)
+  and
+    axis = 2
+  We get:
+    Out.shape = (3 * 100, 4 * 100)
+
+Case 2:
+  Given
+    X.shape = (3, 100, 100, 4)
+  and
+    axis = 0
+  We get:
+    Out.shape = (1, 3 * 100 * 100 * 4)
+)DOC");
+  }
+};
+
+class FlattenGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    context->SetOutputDim(framework::GradVarName("X"),
+                          context->GetInputDim("X"));
+    context->ShareLoD("X", framework::GradVarName("X"));
+  }
+};
+
+class FlattenGradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto dx_name = Output(framework::GradVarName("X"));
+    auto dout_name = Input(framework::GradVarName("Out"));
+    auto in_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    framework::AttributeMap attrs;
+    attrs["shape"] = framework::vectorize2int(in_dims);
+    attrs["inplace"] = false;
+
+    auto reshape_op = framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},
+        attrs);
+    reshape_op->Run(scope, place);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+USE_OP(reshape);
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(flatten, ops::FlattenOp, ops::FlattenOpMaker,
+                  ops::FlattenOpInferShape,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(flatten_grad, ops::FlattenGradOp, ops::FlattenGradInferShape);
diff --git a/paddle/fluid/operators/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt_engine_op.cc
index db641a4bc2c637e0babee6b6bc6e67b068759ff5..1172822e12222ded219104e3bad2613d30f891b8 100644
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
@@ -163,7 +163,4 @@ REGISTER_OP_CPU_KERNEL(
     ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int>,
     ops::TensorRTEngineKernel<paddle::platform::CPUDeviceContext, int64_t>);
 
-// A trick to compile with the needed TensorRT op converter.
-USE_TRT_CONVERTER(mul)
-
 #endif  // PADDLE_WITH_CUDA
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index e0d7937ae2f3ce4bda12f3771727e2992d63cb9b..a6f68f8b0c0a9b07c326888e30c0c911e7861607 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -60,3 +60,7 @@ cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
 
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
+
+IF(WITH_GPU)
+  nv_test(cuda_helper_test SRCS cuda_helper_test.cu)
+ENDIF()
diff --git a/paddle/fluid/platform/cuda_device_function.h b/paddle/fluid/platform/cuda_device_function.h
index ecec4178f2d9937920e52eb74bf9068b84e741a0..23457ff5fe1ec27094113ba0dde26adc64c716b5 100644
--- a/paddle/fluid/platform/cuda_device_function.h
+++ b/paddle/fluid/platform/cuda_device_function.h
@@ -14,6 +14,10 @@ limitations under the License. */
 
 #pragma once
 #include <cuda.h>
+// NOTE(): support float16 to half in header file.
+#define PADDLE_CUDA_FP16
+#include <cuda_fp16.h>
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace platform {
@@ -36,6 +40,18 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
 #endif
 }
 
+// CUDA 9.0 have native compatible float16 shfl_down
+#if CUDA_VERSION < 9000
+template <>
+__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
+                                                       float16 val, int delta,
+                                                       int width) {
+  half tmp = static_cast<half>(val);
+  __shfl_down(tmp, static_cast<unsigned>(delta), width);
+  return float16(tmp);
+}
+#endif
+
 template <typename T>
 __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
                                              int width = 32) {
@@ -46,6 +62,11 @@ __forceinline__ __device__ T CudaShuffleSync(unsigned mask, T val, int src_line,
 #endif
 }
 
+template <typename T>
+HOSTDEVICE T Infinity() {
+  return INFINITY;
+}
+
 template <typename T>
 __device__ T reduceSum(T val, int tid, int len) {
   // NOTE(zcd): The warp size should be taken from the
diff --git a/paddle/fluid/platform/cuda_helper_test.cu b/paddle/fluid/platform/cuda_helper_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4a47ba5ccad4de338844e60f6fcbd6b7c11e891b
--- /dev/null
+++ b/paddle/fluid/platform/cuda_helper_test.cu
@@ -0,0 +1,118 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <bitset>
+#include <iostream>
+#include <random>
+
+#define PADDLE_CUDA_FP16
+#include "paddle/fluid/platform/cuda_device_function.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+#include "paddle/fluid/platform/float16.h"
+
+using paddle::platform::PADDLE_CUDA_NUM_THREADS;
+using paddle::platform::float16;
+
+#define CUDA_ATOMIC_KERNEL(op, T)                                      \
+  __global__ void op##Kernel(const T* data_a, T* data_b, size_t num) { \
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;       \
+         i += blockDim.x * gridDim.x) {                                \
+      paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]);         \
+    }                                                                  \
+  }
+
+template <typename T>
+struct AddFunctor {
+  T operator()(const T& a, const T& b) { return a + b; }
+};
+
+template <typename T>
+struct SubFunctor {
+  T operator()(const T& a, const T& b) { return a - b; }
+};
+
+// NOTE(dzhwinter): the float16 add has small underflow/overflow
+// so we use EXPECT_NEAR to check the result.
+#define ARITHMETIC_KERNEL_LAUNCH(op, T)                                 \
+  void Test##T##op(size_t num) {                                        \
+    T *in1, *in2, *out;                                                 \
+    T *d_in1, *d_in2;                                                   \
+    size_t size = sizeof(T) * num;                                      \
+    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);                 \
+    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);                 \
+    in1 = reinterpret_cast<T*>(malloc(size));                           \
+    in2 = reinterpret_cast<T*>(malloc(size));                           \
+    out = reinterpret_cast<T*>(malloc(size));                           \
+    std::minstd_rand engine;                                            \
+    std::uniform_real_distribution<double> dist(0.0, 1.0);              \
+    for (size_t i = 0; i < num; ++i) {                                  \
+      in1[i] = static_cast<T>(dist(engine));                            \
+      in2[i] = static_cast<T>(dist(engine));                            \
+    }                                                                   \
+    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);               \
+    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);               \
+    op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);      \
+    cudaDeviceSynchronize();                                            \
+    cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);               \
+    cudaDeviceSynchronize();                                            \
+    for (size_t i = 0; i < num; ++i) {                                  \
+      EXPECT_NEAR(static_cast<float>(out[i]),                           \
+                  static_cast<float>(op##Functor<T>()(in1[i], in2[i])), \
+                  0.001);                                               \
+    }                                                                   \
+    free(in1);                                                          \
+    free(in2);                                                          \
+    free(out);                                                          \
+    cudaFree(d_in1);                                                    \
+    cudaFree(d_in2);                                                    \
+  }
+CUDA_ATOMIC_KERNEL(Add, float);
+CUDA_ATOMIC_KERNEL(Add, double);
+CUDA_ATOMIC_KERNEL(Add, float16);
+
+ARITHMETIC_KERNEL_LAUNCH(Add, float);
+ARITHMETIC_KERNEL_LAUNCH(Add, double);
+ARITHMETIC_KERNEL_LAUNCH(Add, float16);
+
+namespace paddle {
+namespace platform {
+USE_CUDA_ATOMIC(Sub, int);
+};
+};
+CUDA_ATOMIC_KERNEL(Sub, int);
+ARITHMETIC_KERNEL_LAUNCH(Sub, int);
+
+// cuda primitives
+TEST(CudaAtomic, Add) {
+  TestfloatAdd(static_cast<size_t>(10));
+  TestfloatAdd(static_cast<size_t>(1024 * 1024));
+  TestdoubleAdd(static_cast<size_t>(10));
+  TestdoubleAdd(static_cast<size_t>(1024 * 1024));
+}
+
+TEST(CudaAtomic, Sub) {
+  TestintSub(static_cast<size_t>(10));
+  TestintSub(static_cast<size_t>(1024 * 1024));
+}
+
+TEST(CudaAtomic, float16) {
+  using paddle::platform::float16;
+  Testfloat16Add(static_cast<size_t>(1));
+  Testfloat16Add(static_cast<size_t>(2));
+  Testfloat16Add(static_cast<size_t>(3));
+
+  Testfloat16Add(static_cast<size_t>(10));
+  Testfloat16Add(static_cast<size_t>(1024 * 1024));
+}
diff --git a/paddle/fluid/platform/cuda_primitives.h b/paddle/fluid/platform/cuda_primitives.h
index d535ed2f89df6a0b311ec068ecd92c8e3183cee7..94ce83975a7f13daa2b6a4d480cb22cc95811b9b 100644
--- a/paddle/fluid/platform/cuda_primitives.h
+++ b/paddle/fluid/platform/cuda_primitives.h
@@ -14,12 +14,14 @@ limitations under the License. */
 
 #pragma once
 #include <cuda.h>
+#include <stdio.h>
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 namespace platform {
 
 #define CUDA_ATOMIC_WRAPPER(op, T) \
-  __device__ __forceinline__ T CudaAtomic##op(T* address, const T val)
+  __device__ __forceinline__ T CudaAtomic##op(T *address, const T val)
 
 #define USE_CUDA_ATOMIC(op, T) \
   CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
@@ -42,17 +44,17 @@ CUDA_ATOMIC_WRAPPER(Add, int64_t) {
   static_assert(sizeof(int64_t) == sizeof(long long int),  // NOLINT
                 "long long should be int64");
   return CudaAtomicAdd(
-      reinterpret_cast<unsigned long long int*>(address),  // NOLINT
-      static_cast<unsigned long long int>(val));           // NOLINT
+      reinterpret_cast<unsigned long long int *>(address),  // NOLINT
+      static_cast<unsigned long long int>(val));            // NOLINT
 }
 
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
 USE_CUDA_ATOMIC(Add, double);
 #else
 CUDA_ATOMIC_WRAPPER(Add, double) {
-  unsigned long long int* address_as_ull =                 // NOLINT
-      reinterpret_cast<unsigned long long int*>(address);  // NOLINT
-  unsigned long long int old = *address_as_ull, assumed;   // NOLINT
+  unsigned long long int *address_as_ull =                  // NOLINT
+      reinterpret_cast<unsigned long long int *>(address);  // NOLINT
+  unsigned long long int old = *address_as_ull, assumed;    // NOLINT
 
   do {
     assumed = old;
@@ -64,6 +66,67 @@ CUDA_ATOMIC_WRAPPER(Add, double) {
 
   return __longlong_as_double(old);
 }
+#endif
+
+#ifdef PADDLE_CUDA_FP16
+// NOTE(dzhwinter): cuda do not have atomicCAS for half.
+// Just use the half address as a unsigned value address and
+// do the atomicCAS. According to the value store at high 16 bits
+// or low 16 bits, then do a different sum and CAS.
+// Given most warp-threads will failed on the atomicCAS, so this
+// implemented should be avoided in high concurrency. It's will be
+// slower than the way convert value into 32bits and do a full atomicCAS.
+
+// convert the value into float and do the add arithmetic.
+// then store the result into a uint32.
+inline __device__ uint32_t add_to_low_half(uint32_t val, float x) {
+  float16 low_half;
+  // the float16 in lower 16bits
+  low_half.x = static_cast<uint16_t>(val & 0xffffu);
+  low_half = static_cast<float16>(static_cast<float>(low_half) + x);
+  return (val & 0xffff0000u) | low_half.x;
+}
+
+inline __device__ uint32_t add_to_high_half(uint32_t val, float x) {
+  float16 high_half;
+  // the float16 in higher 16bits
+  high_half.x = static_cast<uint16_t>(val >> 16);
+  high_half = static_cast<float16>(static_cast<float>(high_half) + x);
+  return (val & 0xffffu) | (static_cast<uint32_t>(high_half.x) << 16);
+}
+
+CUDA_ATOMIC_WRAPPER(Add, float16) {
+  // concrete packed float16 value may exsits in lower or higher 16bits
+  // of the 32bits address.
+  uint32_t *address_as_ui =
+      reinterpret_cast<uint32_t *>(reinterpret_cast<char *>(address) -
+                                   (reinterpret_cast<size_t>(address) & 2));
+  float val_f = static_cast<float>(val);
+  uint32_t old = *address_as_ui;
+  uint32_t sum;
+  uint32_t newval;
+  uint32_t assumed;
+  if (((size_t)address & 2) == 0) {
+    // the float16 value stay at lower 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f));
+    } while (old != assumed);
+    float16 ret;
+    ret.x = old & 0xffffu;
+    return ret;
+  } else {
+    // the float16 value stay at higher 16 bits of the address.
+    do {
+      assumed = old;
+      old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f));
+    } while (old != assumed);
+    float16 ret;
+    ret.x = old >> 16;
+    return ret;
+  }
+}
+
 #endif
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index ffd183af68514dbb1a8b3de39000c9ca3f56ddc3..efb021c838e3680ab2cdd1c4b298cf7ec2186478 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -67,8 +67,11 @@ struct float16;
 }  // namespace platform
 }  // namespace paddle
 
+// NOTE():
+// Do not move the eigen.h header, otherwise the eigen_vector<bool> will failed.
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/hostdevice.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
 namespace platform {
@@ -898,6 +901,30 @@ struct is_pod<paddle::platform::float16> {
       is_standard_layout<paddle::platform::float16>::value;
 };
 
+template <>
+struct is_floating_point<paddle::platform::float16>
+    : std::integral_constant<
+          bool, std::is_same<paddle::platform::float16,
+                             typename std::remove_cv<
+                                 paddle::platform::float16>::type>::value> {};
+template <>
+struct is_signed<paddle::platform::float16> {
+  static const bool value = true;
+};
+
+template <>
+struct is_unsigned<paddle::platform::float16> {
+  static const bool value = false;
+};
+
+inline bool isnan(const paddle::platform::float16& a) {
+  return paddle::platform::isnan(a);
+}
+
+inline bool isinf(const paddle::platform::float16& a) {
+  return paddle::platform::isinf(a);
+}
+
 template <>
 struct numeric_limits<paddle::platform::float16> {
   static const bool is_specialized = true;
diff --git a/paddle/fluid/platform/float16_test.cc b/paddle/fluid/platform/float16_test.cc
index ede294be1e2e26693bd3ead2ccd5e6a6c8a075bc..27e930e6e0a76982b3f27619f38a4a08d82cafa1 100644
--- a/paddle/fluid/platform/float16_test.cc
+++ b/paddle/fluid/platform/float16_test.cc
@@ -141,10 +141,36 @@ TEST(float16, lod_tensor_cpu) {
   }
 }
 
+TEST(float16, floating) {
+  // compile time assert.
+  PADDLE_ASSERT(std::is_floating_point<float16>::value);
+}
+
 TEST(float16, print) {
   float16 a = float16(1.0f);
   std::cout << a << std::endl;
 }
 
+// CPU test
+TEST(float16, isinf) {
+  float16 a;
+  a.x = 0x7c00;
+  float16 b = float16(INFINITY);
+  float16 c = static_cast<float16>(INFINITY);
+  EXPECT_EQ(std::isinf(a), true);
+  EXPECT_EQ(std::isinf(b), true);
+  EXPECT_EQ(std::isinf(c), true);
+}
+
+TEST(float16, isnan) {
+  float16 a;
+  a.x = 0x7fff;
+  float16 b = float16(NAN);
+  float16 c = static_cast<float16>(NAN);
+  EXPECT_EQ(std::isnan(a), true);
+  EXPECT_EQ(std::isnan(b), true);
+  EXPECT_EQ(std::isnan(c), true);
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/float16_test.cu b/paddle/fluid/platform/float16_test.cu
index 1b9cf9b5d3fa2121b588c31d7cf2f4c50cb951bc..e2b7ca9b03809113c31af8ff4d3ad3713748f330 100644
--- a/paddle/fluid/platform/float16_test.cu
+++ b/paddle/fluid/platform/float16_test.cu
@@ -11,11 +11,13 @@ limitations under the License. */
 
 #include "paddle/fluid/platform/float16.h"
 
+#include <glog/logging.h>
 #include <gtest/gtest.h>
+#include <bitset>
+#include <iostream>
 
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/legacy/utils/Logging.h"
 
 #define ARITHMETIC_KERNEL(op_type, sign)                                 \
   __global__ void op_type(const half* in1, const half* in2, half* out) { \
@@ -241,6 +243,72 @@ TEST(float16, lod_tensor_on_gpu) {
   }
 }
 
+template <typename T>
+struct Functor {
+  bool operator()(const T& val) {
+    return std::type_index(typeid(T)) ==
+           std::type_index(typeid(platform::float16));
+  }
+};
+
+TEST(float16, typeid) {
+  // the framework heavily used typeid hash
+  Functor<float16> functor;
+  float16 a = float16(.0f);
+  Functor<int> functor2;
+  int b(0);
+
+  // compile time assert
+  PADDLE_ASSERT(functor(a) == true);
+  PADDLE_ASSERT(functor2(b) == false);
+}
+
+// GPU test
+TEST(float16, isinf) {
+  float16 a;
+  a.x = 0x7c00;
+  float16 b = float16(INFINITY);
+  // underflow to 0
+  float16 native_a(5e-40f);
+  // overflow to inf
+  float16 native_b(5e40f);
+  EXPECT_EQ(std::isinf(a), true);
+  EXPECT_EQ(std::isinf(b), true);
+  EXPECT_EQ(std::isinf(native_b), true);
+  EXPECT_EQ(native_a, float16(0));
+}
+
+TEST(float16, isnan) {
+  float16 a;
+  a.x = 0x7fff;
+  float16 b = float16(NAN);
+  float16 c = float16(5e40);
+  // inf * +-0 will get a nan
+  float16 d = c * float16(0);
+  EXPECT_EQ(std::isnan(a), true);
+  EXPECT_EQ(std::isnan(b), true);
+  EXPECT_EQ(std::isnan(d), true);
+}
+
+TEST(float16, cast) {
+  float16 a;
+  a.x = 0x0070;
+  auto b = a;
+  {
+    // change semantic, keep the same value
+    float16 c = reinterpret_cast<float16&>(reinterpret_cast<unsigned&>(b));
+    EXPECT_EQ(b, c);
+  }
+
+  {
+    // use uint32 low 16 bit store float16
+    uint32_t c = reinterpret_cast<uint32_t&>(b);
+    float16 d;
+    d.x = c;
+    EXPECT_EQ(b, d);
+  }
+}
+
 }  // namespace platform
 }  // namespace paddle
 #endif  // PADDLE_CUDA_FP16
diff --git a/patches/grpc/completion_queue.h b/patches/grpc/completion_queue.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e92c60ea2db00cc6e227830228888f9a06735c4
--- /dev/null
+++ b/patches/grpc/completion_queue.h
@@ -0,0 +1,386 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+/// A completion queue implements a concurrent producer-consumer queue, with
+/// two main API-exposed methods: \a Next and \a AsyncNext. These
+/// methods are the essential component of the gRPC C++ asynchronous API.
+/// There is also a \a Shutdown method to indicate that a given completion queue
+/// will no longer have regular events. This must be called before the
+/// completion queue is destroyed.
+/// All completion queue APIs are thread-safe and may be used concurrently with
+/// any other completion queue API invocation; it is acceptable to have
+/// multiple threads calling \a Next or \a AsyncNext on the same or different
+/// completion queues, or to call these methods concurrently with a \a Shutdown
+/// elsewhere.
+/// \remark{All other API calls on completion queue should be completed before
+/// a completion queue destructor is called.}
+#ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
+#define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
+
+#include <typeinfo>
+
+#include <grpc/impl/codegen/atm.h>
+#include <grpcpp/impl/codegen/completion_queue_tag.h>
+#include <grpcpp/impl/codegen/core_codegen_interface.h>
+#include <grpcpp/impl/codegen/grpc_library.h>
+#include <grpcpp/impl/codegen/status.h>
+#include <grpcpp/impl/codegen/time.h>
+
+struct grpc_completion_queue;
+
+namespace grpc {
+
+template <class R>
+class ClientReader;
+template <class W>
+class ClientWriter;
+template <class W, class R>
+class ClientReaderWriter;
+template <class R>
+class ServerReader;
+template <class W>
+class ServerWriter;
+namespace internal {
+template <class W, class R>
+class ServerReaderWriterBody;
+}  // namespace internal
+
+class Channel;
+class ChannelInterface;
+class ClientContext;
+class CompletionQueue;
+class Server;
+class ServerBuilder;
+class ServerContext;
+class ServerInterface;
+
+namespace internal {
+class CompletionQueueTag;
+class RpcMethod;
+template <class ServiceType, class RequestType, class ResponseType>
+class RpcMethodHandler;
+template <class ServiceType, class RequestType, class ResponseType>
+class ClientStreamingHandler;
+template <class ServiceType, class RequestType, class ResponseType>
+class ServerStreamingHandler;
+template <class ServiceType, class RequestType, class ResponseType>
+class BidiStreamingHandler;
+class UnknownMethodHandler;
+template <class Streamer, bool WriteNeeded>
+class TemplatedBidiStreamingHandler;
+template <class InputMessage, class OutputMessage>
+class BlockingUnaryCallImpl;
+}  // namespace internal
+
+extern CoreCodegenInterface* g_core_codegen_interface;
+
+/// A thin wrapper around \ref grpc_completion_queue (see \ref
+/// src/core/lib/surface/completion_queue.h).
+/// See \ref doc/cpp/perf_notes.md for notes on best practices for high
+/// performance servers.
+class CompletionQueue : private GrpcLibraryCodegen {
+ public:
+  /// Default constructor. Implicitly creates a \a grpc_completion_queue
+  /// instance.
+  CompletionQueue()
+      : CompletionQueue(grpc_completion_queue_attributes{
+            GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, GRPC_CQ_DEFAULT_POLLING}) {}
+
+  /// Wrap \a take, taking ownership of the instance.
+  ///
+  /// \param take The completion queue instance to wrap. Ownership is taken.
+  explicit CompletionQueue(grpc_completion_queue* take);
+
+  /// Destructor. Destroys the owned wrapped completion queue / instance.
+  ~CompletionQueue() {
+    if (typeid(*g_core_codegen_interface).hash_code() !=
+        typeid(CoreCodegenInterface).hash_code()) {
+      g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
+    }
+  }
+
+  /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT.
+  enum NextStatus {
+    SHUTDOWN,   ///< The completion queue has been shutdown and fully-drained
+    GOT_EVENT,  ///< Got a new event; \a tag will be filled in with its
+                ///< associated value; \a ok indicating its success.
+    TIMEOUT     ///< deadline was reached.
+  };
+
+  /// Read from the queue, blocking until an event is available or the queue is
+  /// shutting down.
+  ///
+  /// \param tag[out] Updated to point to the read event's tag.
+  /// \param ok[out] true if read a successful event, false otherwise.
+  ///
+  /// Note that each tag sent to the completion queue (through RPC operations
+  /// or alarms) will be delivered out of the completion queue by a call to
+  /// Next (or a related method), regardless of whether the operation succeeded
+  /// or not. Success here means that this operation completed in the normal
+  /// valid manner.
+  ///
+  /// Server-side RPC request: \a ok indicates that the RPC has indeed
+  /// been started. If it is false, the server has been Shutdown
+  /// before this particular call got matched to an incoming RPC.
+  ///
+  /// Client-side StartCall/RPC invocation: \a ok indicates that the RPC is
+  /// going to go to the wire. If it is false, it not going to the wire. This
+  /// would happen if the channel is either permanently broken or
+  /// transiently broken but with the fail-fast option. (Note that async unary
+  /// RPCs don't post a CQ tag at this point, nor do client-streaming
+  /// or bidi-streaming RPCs that have the initial metadata corked option set.)
+  ///
+  /// Client-side Write, Client-side WritesDone, Server-side Write,
+  /// Server-side Finish, Server-side SendInitialMetadata (which is
+  /// typically included in Write or Finish when not done explicitly):
+  /// \a ok means that the data/metadata/status/etc is going to go to the
+  /// wire. If it is false, it not going to the wire because the call
+  /// is already dead (i.e., canceled, deadline expired, other side
+  /// dropped the channel, etc).
+  ///
+  /// Client-side Read, Server-side Read, Client-side
+  /// RecvInitialMetadata (which is typically included in Read if not
+  /// done explicitly): \a ok indicates whether there is a valid message
+  /// that got read. If not, you know that there are certainly no more
+  /// messages that can ever be read from this stream. For the client-side
+  /// operations, this only happens because the call is dead. For the
+  /// server-sider operation, though, this could happen because the client
+  /// has done a WritesDone already.
+  ///
+  /// Client-side Finish: \a ok should always be true
+  ///
+  /// Server-side AsyncNotifyWhenDone: \a ok should always be true
+  ///
+  /// Alarm: \a ok is true if it expired, false if it was canceled
+  ///
+  /// \return true if got an event, false if the queue is fully drained and
+  ///         shut down.
+  bool Next(void** tag, bool* ok) {
+    return (AsyncNextInternal(tag,
+                              ok,
+                              g_core_codegen_interface->gpr_inf_future(
+                                  GPR_CLOCK_REALTIME)) != SHUTDOWN);
+  }
+
+  /// Read from the queue, blocking up to \a deadline (or the queue's shutdown).
+  /// Both \a tag and \a ok are updated upon success (if an event is available
+  /// within the \a deadline).  A \a tag points to an arbitrary location usually
+  /// employed to uniquely identify an event.
+  ///
+  /// \param tag[out] Upon sucess, updated to point to the event's tag.
+  /// \param ok[out] Upon sucess, true if a successful event, false otherwise
+  ///        See documentation for CompletionQueue::Next for explanation of ok
+  /// \param deadline[in] How long to block in wait for an event.
+  ///
+  /// \return The type of event read.
+  template <typename T>
+  NextStatus AsyncNext(void** tag, bool* ok, const T& deadline) {
+    TimePoint<T> deadline_tp(deadline);
+    return AsyncNextInternal(tag, ok, deadline_tp.raw_time());
+  }
+
+  /// EXPERIMENTAL
+  /// First executes \a F, then reads from the queue, blocking up to
+  /// \a deadline (or the queue's shutdown).
+  /// Both \a tag and \a ok are updated upon success (if an event is available
+  /// within the \a deadline).  A \a tag points to an arbitrary location usually
+  /// employed to uniquely identify an event.
+  ///
+  /// \param F[in] Function to execute before calling AsyncNext on this queue.
+  /// \param tag[out] Upon sucess, updated to point to the event's tag.
+  /// \param ok[out] Upon sucess, true if read a regular event, false otherwise.
+  /// \param deadline[in] How long to block in wait for an event.
+  ///
+  /// \return The type of event read.
+  template <typename T, typename F>
+  NextStatus DoThenAsyncNext(F&& f, void** tag, bool* ok, const T& deadline) {
+    CompletionQueueTLSCache cache = CompletionQueueTLSCache(this);
+    f();
+    if (cache.Flush(tag, ok)) {
+      return GOT_EVENT;
+    } else {
+      return AsyncNext(tag, ok, deadline);
+    }
+  }
+
+  /// Request the shutdown of the queue.
+  ///
+  /// \warning This method must be called at some point if this completion queue
+  /// is accessed with Next or AsyncNext. \a Next will not return false
+  /// until this method has been called and all pending tags have been drained.
+  /// (Likewise for \a AsyncNext returning \a NextStatus::SHUTDOWN .)
+  /// Only once either one of these methods does that (that is, once the queue
+  /// has been \em drained) can an instance of this class be destroyed.
+  /// Also note that applications must ensure that no work is enqueued on this
+  /// completion queue after this method is called.
+  void Shutdown();
+
+  /// Returns a \em raw pointer to the underlying \a grpc_completion_queue
+  /// instance.
+  ///
+  /// \warning Remember that the returned instance is owned. No transfer of
+  /// owership is performed.
+  grpc_completion_queue* cq() { return cq_; }
+
+ protected:
+  /// Private constructor of CompletionQueue only visible to friend classes
+  CompletionQueue(const grpc_completion_queue_attributes& attributes) {
+    cq_ = g_core_codegen_interface->grpc_completion_queue_create(
+        g_core_codegen_interface->grpc_completion_queue_factory_lookup(
+            &attributes),
+        &attributes,
+        NULL);
+    InitialAvalanching();  // reserve this for the future shutdown
+  }
+
+ private:
+  // Friend synchronous wrappers so that they can access Pluck(), which is
+  // a semi-private API geared towards the synchronous implementation.
+  template <class R>
+  friend class ::grpc::ClientReader;
+  template <class W>
+  friend class ::grpc::ClientWriter;
+  template <class W, class R>
+  friend class ::grpc::ClientReaderWriter;
+  template <class R>
+  friend class ::grpc::ServerReader;
+  template <class W>
+  friend class ::grpc::ServerWriter;
+  template <class W, class R>
+  friend class ::grpc::internal::ServerReaderWriterBody;
+  template <class ServiceType, class RequestType, class ResponseType>
+  friend class ::grpc::internal::RpcMethodHandler;
+  template <class ServiceType, class RequestType, class ResponseType>
+  friend class ::grpc::internal::ClientStreamingHandler;
+  template <class ServiceType, class RequestType, class ResponseType>
+  friend class ::grpc::internal::ServerStreamingHandler;
+  template <class Streamer, bool WriteNeeded>
+  friend class ::grpc::internal::TemplatedBidiStreamingHandler;
+  friend class ::grpc::internal::UnknownMethodHandler;
+  friend class ::grpc::Server;
+  friend class ::grpc::ServerContext;
+  friend class ::grpc::ServerInterface;
+  template <class InputMessage, class OutputMessage>
+  friend class ::grpc::internal::BlockingUnaryCallImpl;
+
+  /// EXPERIMENTAL
+  /// Creates a Thread Local cache to store the first event
+  /// On this completion queue queued from this thread.  Once
+  /// initialized, it must be flushed on the same thread.
+  class CompletionQueueTLSCache {
+   public:
+    CompletionQueueTLSCache(CompletionQueue* cq);
+    ~CompletionQueueTLSCache();
+    bool Flush(void** tag, bool* ok);
+
+   private:
+    CompletionQueue* cq_;
+    bool flushed_;
+  };
+
+  NextStatus AsyncNextInternal(void** tag, bool* ok, gpr_timespec deadline);
+
+  /// Wraps \a grpc_completion_queue_pluck.
+  /// \warning Must not be mixed with calls to \a Next.
+  bool Pluck(internal::CompletionQueueTag* tag) {
+    auto deadline =
+        g_core_codegen_interface->gpr_inf_future(GPR_CLOCK_REALTIME);
+    auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
+        cq_, tag, deadline, nullptr);
+    bool ok = ev.success != 0;
+    void* ignored = tag;
+    GPR_CODEGEN_ASSERT(tag->FinalizeResult(&ignored, &ok));
+    GPR_CODEGEN_ASSERT(ignored == tag);
+    // Ignore mutations by FinalizeResult: Pluck returns the C API status
+    return ev.success != 0;
+  }
+
+  /// Performs a single polling pluck on \a tag.
+  /// \warning Must not be mixed with calls to \a Next.
+  ///
+  /// TODO: sreek - This calls tag->FinalizeResult() even if the cq_ is already
+  /// shutdown. This is most likely a bug and if it is a bug, then change this
+  /// implementation to simple call the other TryPluck function with a zero
+  /// timeout. i.e:
+  ///      TryPluck(tag, gpr_time_0(GPR_CLOCK_REALTIME))
+  void TryPluck(internal::CompletionQueueTag* tag) {
+    auto deadline = g_core_codegen_interface->gpr_time_0(GPR_CLOCK_REALTIME);
+    auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
+        cq_, tag, deadline, nullptr);
+    if (ev.type == GRPC_QUEUE_TIMEOUT) return;
+    bool ok = ev.success != 0;
+    void* ignored = tag;
+    // the tag must be swallowed if using TryPluck
+    GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok));
+  }
+
+  /// Performs a single polling pluck on \a tag. Calls tag->FinalizeResult if
+  /// the pluck() was successful and returned the tag.
+  ///
+  /// This exects tag->FinalizeResult (if called) to return 'false' i.e expects
+  /// that the tag is internal not something that is returned to the user.
+  void TryPluck(internal::CompletionQueueTag* tag, gpr_timespec deadline) {
+    auto ev = g_core_codegen_interface->grpc_completion_queue_pluck(
+        cq_, tag, deadline, nullptr);
+    if (ev.type == GRPC_QUEUE_TIMEOUT || ev.type == GRPC_QUEUE_SHUTDOWN) {
+      return;
+    }
+
+    bool ok = ev.success != 0;
+    void* ignored = tag;
+    GPR_CODEGEN_ASSERT(!tag->FinalizeResult(&ignored, &ok));
+  }
+
+  /// Manage state of avalanching operations : completion queue tags that
+  /// trigger other completion queue operations. The underlying core completion
+  /// queue should not really shutdown until all avalanching operations have
+  /// been finalized. Note that we maintain the requirement that an avalanche
+  /// registration must take place before CQ shutdown (which must be maintained
+  /// elsehwere)
+  void InitialAvalanching() {
+    gpr_atm_rel_store(&avalanches_in_flight_, static_cast<gpr_atm>(1));
+  }
+  void RegisterAvalanching() {
+    gpr_atm_no_barrier_fetch_add(&avalanches_in_flight_,
+                                 static_cast<gpr_atm>(1));
+  }
+  void CompleteAvalanching();
+
+  grpc_completion_queue* cq_;  // owned
+
+  gpr_atm avalanches_in_flight_;
+};
+
+/// A specific type of completion queue used by the processing of notifications
+/// by servers. Instantiated by \a ServerBuilder.
+class ServerCompletionQueue : public CompletionQueue {
+ public:
+  bool IsFrequentlyPolled() { return polling_type_ != GRPC_CQ_NON_LISTENING; }
+
+ private:
+  grpc_cq_polling_type polling_type_;
+  friend class ServerBuilder;
+  /// \param is_frequently_polled Informs the GRPC library about whether the
+  /// server completion queue would be actively polled (by calling Next() or
+  /// AsyncNext()). By default all server completion queues are assumed to be
+  /// frequently polled.
+  ServerCompletionQueue(grpc_cq_polling_type polling_type)
+      : CompletionQueue(grpc_completion_queue_attributes{
+            GRPC_CQ_CURRENT_VERSION, GRPC_CQ_NEXT, polling_type}),
+        polling_type_(polling_type) {}
+};
+
+}  // namespace grpc
+
+#endif  // GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
diff --git a/patches/grpc/fix_too_early_destory.patch b/patches/grpc/fix_too_early_destory.patch
deleted file mode 100644
index d7790d56b07551b8daae9b9a41be5432e5b8b9cc..0000000000000000000000000000000000000000
--- a/patches/grpc/fix_too_early_destory.patch
+++ /dev/null
@@ -1,47 +0,0 @@
-diff --git a/include/grpcpp/impl/codegen/completion_queue.h b/include/grpcpp/impl/codegen/completion_queue.h
-index 80c7c41982..3f7d8a7714 100644
---- a/include/grpcpp/impl/codegen/completion_queue.h
-+++ b/include/grpcpp/impl/codegen/completion_queue.h
-@@ -32,6 +32,8 @@
- #ifndef GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
- #define GRPCPP_IMPL_CODEGEN_COMPLETION_QUEUE_H
- 
-+#include <typeinfo>
-+
- #include <grpc/impl/codegen/atm.h>
- #include <grpcpp/impl/codegen/completion_queue_tag.h>
- #include <grpcpp/impl/codegen/core_codegen_interface.h>
-@@ -106,7 +108,9 @@ class CompletionQueue : private GrpcLibraryCodegen {
- 
-   /// Destructor. Destroys the owned wrapped completion queue / instance.
-   ~CompletionQueue() {
--    g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
-+	if (typeid(*g_core_codegen_interface).hash_code() != typeid(CoreCodegenInterface).hash_code()) {
-+      g_core_codegen_interface->grpc_completion_queue_destroy(cq_);
-+	}
-   }
- 
-   /// Tri-state return for AsyncNext: SHUTDOWN, GOT_EVENT, TIMEOUT.
-diff --git a/include/grpcpp/impl/codegen/grpc_library.h b/include/grpcpp/impl/codegen/grpc_library.h
-index 17c904d71a..a092b2204d 100644
---- a/include/grpcpp/impl/codegen/grpc_library.h
-+++ b/include/grpcpp/impl/codegen/grpc_library.h
-@@ -19,6 +19,8 @@
- #ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
- #define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
- 
-+#include <typeinfo>
-+
- #include <grpcpp/impl/codegen/core_codegen_interface.h>
- 
- namespace grpc {
-@@ -47,7 +49,8 @@ class GrpcLibraryCodegen {
-     }
-   }
-   virtual ~GrpcLibraryCodegen() {
--    if (grpc_init_called_) {
-+    if (grpc_init_called_ &&
-+		typeid(*g_glip).hash_code() != typeid(GrpcLibraryInterface).hash_code()) {
-       GPR_CODEGEN_ASSERT(g_glip &&
-                          "gRPC library not initialized. See "
-                          "grpc::internal::GrpcLibraryInitializer.");
diff --git a/patches/grpc/grpc_library.h b/patches/grpc/grpc_library.h
new file mode 100644
index 0000000000000000000000000000000000000000..4870a1cda4b2a6489bc379fe53cf3e9659fffc47
--- /dev/null
+++ b/patches/grpc/grpc_library.h
@@ -0,0 +1,64 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
+#define GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
+
+#include <typeinfo>
+
+#include <grpcpp/impl/codegen/core_codegen_interface.h>
+
+namespace grpc {
+
+class GrpcLibraryInterface {
+ public:
+  virtual ~GrpcLibraryInterface() = default;
+  virtual void init() = 0;
+  virtual void shutdown() = 0;
+};
+
+/// Initialized by \a grpc::GrpcLibraryInitializer from
+/// <grpcpp/impl/grpc_library.h>
+extern GrpcLibraryInterface* g_glip;
+
+/// Classes that require gRPC to be initialized should inherit from this class.
+class GrpcLibraryCodegen {
+ public:
+  GrpcLibraryCodegen(bool call_grpc_init = true) : grpc_init_called_(false) {
+    if (call_grpc_init) {
+      GPR_CODEGEN_ASSERT(g_glip &&
+                         "gRPC library not initialized. See "
+                         "grpc::internal::GrpcLibraryInitializer.");
+      g_glip->init();
+      grpc_init_called_ = true;
+    }
+  }
+  virtual ~GrpcLibraryCodegen() {
+    if (grpc_init_called_ &&
+        typeid(*g_glip).hash_code() !=
+            typeid(GrpcLibraryInterface).hash_code()) {
+      GPR_CODEGEN_ASSERT(g_glip &&
+                         "gRPC library not initialized. See "
+                         "grpc::internal::GrpcLibraryInitializer.");
+      g_glip->shutdown();
+    }
+  }
+
+ private:
+  bool grpc_init_called_;
+};
+
+}  // namespace grpc
+
+#endif  // GRPCPP_IMPL_CODEGEN_GRPC_LIBRARY_H
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 322d76515e76c3d322ac7c4f989bbc95875cb654..43f68ff4592df6757691b06db52cf5e0e2ebc6d7 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -40,7 +40,7 @@ function(py_test_modules TARGET_NAME)
              ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     if (py_test_modules_SERIAL)
-        set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
     endif()
   endif()
 endfunction()
diff --git a/python/paddle/fluid/tests/unittests/dist_se_resnext.py b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
index 72bc1729b0f63b23ad7ecb5ad703b984a4c614ac..bf7816b2466edd7db836c738da90f5f97b631843 100644
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
@@ -278,7 +278,7 @@ class DistSeResneXt2x2:
 
     def run_trainer(self, place, endpoints, trainer_id, trainers, is_dist=True):
         test_program, avg_cost, train_reader, test_reader, batch_acc, predict = get_model(
-            batch_size=20)
+            batch_size=2)
         if is_dist:
             t = get_transpiler(trainer_id,
                                fluid.default_main_program(), endpoints,
@@ -294,11 +294,7 @@ class DistSeResneXt2x2:
         strategy.num_threads = 1
         strategy.allow_op_delay = False
         exe = fluid.ParallelExecutor(
-            True,
-            loss_name=avg_cost.name,
-            exec_strategy=strategy,
-            num_trainers=trainers,
-            trainer_id=trainer_id)
+            True, loss_name=avg_cost.name, exec_strategy=strategy)
 
         feed_var_list = [
             var for var in trainer_prog.global_block().vars.itervalues()
diff --git a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
index e3e7036f08cb88087ae45fe7d7c7565c102dab8a..3b67b3f5ccd67f86f87f292d83a6039ff46260bd 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
@@ -19,6 +19,7 @@ import math
 
 import unittest
 import os
+import sys
 import signal
 import subprocess
 
@@ -56,7 +57,7 @@ class TestDistSeResneXt2x2(unittest.TestCase):
             except os.error:
                 retry_times -= 1
 
-    def non_test_with_place(self):
+    def test_with_place(self):
         # *ATTENTION* THIS TEST NEEDS AT LEAST 2GPUS TO RUN
         required_envs = {
             "PATH": os.getenv("PATH"),
@@ -70,9 +71,15 @@ class TestDistSeResneXt2x2(unittest.TestCase):
         local_cmd = "%s dist_se_resnext.py trainer %s 0 %s %d FLASE" % \
             (self._python_interp, "127.0.0.1:1234", "127.0.0.1:1234", 1)
         local_proc = subprocess.Popen(
-            local_cmd.split(" "), stdout=subprocess.PIPE, env=env_local)
+            local_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env_local)
         local_proc.wait()
-        local_ret = local_proc.stdout.read()
+        out, err = local_proc.communicate()
+        local_ret = out
+        sys.stderr.write('local_loss: %s\n' % local_ret)
+        sys.stderr.write('local_stderr: %s\n' % err)
 
         # Run dist train to compare with local results
         ps0, ps1 = self.start_pserver()
@@ -92,13 +99,22 @@ class TestDistSeResneXt2x2(unittest.TestCase):
         FNULL = open(os.devnull, 'w')
 
         tr0_proc = subprocess.Popen(
-            tr0_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env0)
+            tr0_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env0)
         tr1_proc = subprocess.Popen(
-            tr1_cmd.split(" "), stdout=subprocess.PIPE, stderr=FNULL, env=env1)
+            tr1_cmd.split(" "),
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            env=env1)
 
         tr0_proc.wait()
         tr1_proc.wait()
-        loss_data0 = tr0_proc.stdout.read()
+        out, err = tr0_proc.communicate()
+        sys.stderr.write('dist_stderr: %s\n' % err)
+        loss_data0 = out
+        sys.stderr.write('dist_loss: %s\n' % loss_data0)
         lines = loss_data0.split("\n")
         dist_first_loss = eval(lines[0].replace(" ", ","))[0]
         dist_last_loss = eval(lines[1].replace(" ", ","))[0]
diff --git a/python/paddle/fluid/tests/unittests/test_flatten_op.py b/python/paddle/fluid/tests/unittests/test_flatten_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8692ce2ea66ef61c63bc41e77df050398ac63fd
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py
@@ -0,0 +1,68 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+from op_test import OpTest
+
+
+class TestFlattenOp(OpTest):
+    def setUp(self):
+        self.op_type = "flatten"
+        self.init_test_case()
+        self.inputs = {"X": np.random.random(self.in_shape).astype("float32")}
+        self.init_attrs()
+        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+    def init_test_case(self):
+        self.in_shape = (3, 2, 2, 5)
+        self.axis = 1
+        self.new_shape = (3, 20)
+
+    def init_attrs(self):
+        self.attrs = {"axis": self.axis}
+
+
+class TestFlattenOp(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 2, 3)
+        self.axis = 0
+        self.new_shape = (1, 36)
+
+
+class TestFlattenOpWithDefaultAxis(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 2, 3)
+        self.new_shape = (3, 12)
+
+    def init_attrs(self):
+        self.attrs = {}
+
+
+class TestFlattenOpSixDims(TestFlattenOp):
+    def init_test_case(self):
+        self.in_shape = (3, 2, 3, 2, 4, 4)
+        self.axis = 4
+        self.new_shape = (36, 16)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py
index 4a9ea6af747c36e5817ede5fafbadeea79fb07ac..2d9c089c0b7667c875aae05cb4e6040b007f3d55 100644
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -347,6 +347,7 @@ class DistributeTranspiler(object):
 
         # step1
         pserver_program = Program()
+        pserver_program.random_seed = self.origin_program.random_seed
         # step2: Create vars to receive vars at parameter servers.
         recv_inputs = []
         for v in self.param_grad_ep_mapping[endpoint]["params"]:
@@ -544,6 +545,7 @@ class DistributeTranspiler(object):
         """
         s_prog = Program()
         orig_s_prog = default_startup_program()
+        s_prog.random_seed = orig_s_prog.random_seed
         params = self.param_grad_ep_mapping[endpoint]["params"]
 
         def _get_splited_name_and_shape(varname):
diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook
index 2c65222c8aa7a019f0f8fea68fe02612f70bd41f..aa14d3a2a12208eda11e82d88bc582eb3d2f5893 100755
--- a/tools/codestyle/cpplint_pre_commit.hook
+++ b/tools/codestyle/cpplint_pre_commit.hook
@@ -4,7 +4,7 @@ TOTAL_ERRORS=0
 
 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
 for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do
-    if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*) ]]; then
+    if [[ $file =~ ^(paddle/legacy/api/.*|paddle/legacy/capi/.*|paddle/contrib/.*|paddle/legacy/cuda/.*|paddle/legacy/function/.*|paddle/legacy/gserver/.*|paddle/legacy/math/.*|paddle/legacy/optimizer/.*|paddle/legacy/parameter/.*|paddle/legacy/pserver/.*|paddle/legacy/trainer/.*|paddle/legacy/utils/.*|paddle/testing/TestUtil.*|patches/grpc/.*) ]]; then
         continue;
     else
         cpplint --filter=-readability/fn_size $file;