Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into...

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/add_gather_and_BCast_op_handle

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into...
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/add_gather_and_BCast_op_handle
8b597d9d · chengduoZH · e26c6d78 · 4c55a602 · 8b597d9d · 8b597d9d
91 changed file
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -62,29 +62,33 @@ endif()
 ## Then find the reference-cblas.  www.netlib.org/blas/
 set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
  "Folder contains reference-cblas")
-set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
+if(NOT CMAKE_CROSSCOMPILING)
-  ${REFERENCE_CBLAS_ROOT}/include
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
-  /usr/include
+    ${REFERENCE_CBLAS_ROOT}/include
-  /usr/include/cblas
+    /usr/include
-)
+    /usr/include/cblas
+  )
-set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
-  ${REFERENCE_CBLAS_ROOT}/lib
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
-  /usr/lib
+    ${REFERENCE_CBLAS_ROOT}/lib
-  /usr/lib/blas/reference/
+    /usr/lib
-  /usr/lib/reference/
+    /usr/lib/blas/reference/
-)
+    /usr/lib/reference/
+  )
+else()
+  # Diable the finding of reference cblas under host's system path
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
+endif()
 find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
        ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
 find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
        ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
-if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
  set(CBLAS_FOUND ON)
  set(CBLAS_PROVIDER REFERENCE)
  set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})

--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -24,16 +24,16 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
 IF(APPLE)
-  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
 ELSE()
-  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
 ENDIF()
 ExternalProject_Add(
    extern_grpc
    DEPENDS protobuf zlib
    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.8.x"
+    GIT_TAG "v1.11.x"
    PREFIX          ${GRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""

--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -11,19 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
-IF(MOBILE_INFERENCE)
+if(MOBILE_INFERENCE OR RPI)
    return()
-ENDIF()
+endif()
 include (ExternalProject)
 # NOTE: snappy is needed when linking with recordio
-SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
-SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
-SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
+set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
+set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
 ExternalProject_Add(
    extern_snappy
@@ -51,8 +52,7 @@ ExternalProject_Add(
 )
 add_library(snappy STATIC IMPORTED GLOBAL)
-set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
+set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
-             "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
 include_directories(${SNAPPY_INCLUDE_DIR})
 add_dependencies(snappy extern_snappy)
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -11,9 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
-IF(MOBILE_INFERENCE)
+IF(MOBILE_INFERENCE OR RPI)
    return()
 ENDIF()
@@ -21,9 +20,11 @@ include (ExternalProject)
 # NOTE: snappy is needed when linking with recordio
-SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
+set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
-SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
+set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
-SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
+set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
+set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
 ExternalProject_Add(
        extern_snappystream
@@ -51,8 +52,7 @@ ExternalProject_Add(
 )
 add_library(snappystream STATIC IMPORTED GLOBAL)
-set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
+set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
-        "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
 include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappysteam to include its own headers.
 include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -195,14 +195,7 @@ function(cc_library TARGET_NAME)
        list(REMOVE_ITEM cc_library_DEPS warpctc)
        add_dependencies(${TARGET_NAME} warpctc)
      endif()
-      if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
+      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
-        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
-        target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-        list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
-      else()
-        target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-      endif()
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
    endif()
@@ -243,11 +236,7 @@ function(cc_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS ARGS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
-      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
-    endif()
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 set_property(GLOBAL PROPERTY FLUID_MODULES "")
 # find all fluid modules is used for paddle fluid static library
 function(find_fluid_modules TARGET_NAME)
  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
  string(FIND "${__target_path}" "fluid" pos)
  if(pos GREATER 1)
    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
@@ -77,6 +92,23 @@ elseif (WITH_MKLML)
    )
 endif()
+if(NOT MOBILE_INFERENCE AND NOT RPI)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
+  copy(snappy_lib
+    SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream")
+  copy(snappystream_lib
+    SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib")
+  copy(zlib_lib
+    SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+endif()
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")

--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY)
 endif()
 add_subdirectory(testing)
-if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
+if(NOT MOBILE_INFERENCE AND NOT RPI)
  add_subdirectory(fluid)
 endif()
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(pybind)
-add_subdirectory(inference)
 add_subdirectory(string)
 add_subdirectory(recordio)
+# NOTE: please add subdirectory inference at last.
+add_subdirectory(inference)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -79,14 +79,12 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
    COMMENT "Copy generated python proto into directory paddle/fluid/proto."
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-cc_library(backward SRCS backward.cc DEPS net_op)
-cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto backward glog lod_rank_table feed_fetch_method)
+framework_proto glog lod_rank_table feed_fetch_method)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)

--- a/paddle/fluid/framework/backward.cc
+++ b/paddle/fluid/framework/backward.cc
--- a/paddle/fluid/framework/backward.h
+++ b/paddle/fluid/framework/backward.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-namespace paddle {
-namespace framework {
-// Create the backward operator from a forward operator.
-// TODO(yuyang18): Add more API reference comment.
-extern std::unique_ptr<OperatorBase> Backward(
-    const OperatorBase& forwardOp,
-    const std::unordered_set<std::string>& no_grad_vars);
-struct GradVarInfo {
-  GradVarInfo() {}
-  GradVarInfo(const std::string& name, int block_idx, int op_idx)
-      : name_(name), block_idx_(block_idx), op_idx_(op_idx) {}
-  bool operator==(const GradVarInfo& b) const {
-    return name_ == b.name_ && block_idx_ == b.block_idx_ &&
-           op_idx_ == b.op_idx_;
-  }
-  std::string name_;
-  int block_idx_;
-  int op_idx_;
-};
-using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
-                                            GradVarInfo /*grad_var_info*/>;
-ParamGradInfoMap AppendBackward(
-    ProgramDesc& program_desc, const VarDesc& target,
-    const std::unordered_set<std::string>& no_grad_vars);
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/backward_test.cc
+++ b/paddle/fluid/framework/backward_test.cc
--- a/paddle/fluid/framework/block_desc.h
+++ b/paddle/fluid/framework/block_desc.h
@@ -92,7 +92,7 @@ class BlockDesc {
  /*
   * Remove Op and its input/output variables.
-   * Note that for either input or ouput variable, if it is also an input or
+   * Note that for either input or output variable, if it is also an input or
   * output variable of other ops, we should remain it.
   */
  void RemoveOp(size_t s, size_t e);

--- a/paddle/fluid/framework/details/broadcast_op_handle.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -48,7 +48,7 @@ void BroadcastOpHandle::RunImpl() {
    auto out_scope_idx = out_handle->scope_idx_;
    PADDLE_ENFORCE_LT(out_scope_idx, local_scopes_.size(),
-                      "%s is not the the local_scopes ", out_handle->name_);
+                      "%s is not in the local_scopes ", out_handle->name_);
    auto *s = local_scopes_[out_scope_idx];
    auto out_var = s->FindVar(out_handle->name_);
    PADDLE_ENFORCE_EQ(out_p.which(), in_place.which(),

--- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc
+++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -57,15 +57,12 @@ class BroadcastTester : public ::testing::Test {
    }
  }
-  template <class T>
  void BroadcastInitOp(int input_scope_idx) {
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scope_.push_back(&g_scope_.NewScope());
-      auto* out_var = local_scope_[j]->Var("out");
+      local_scope_[j]->Var("out");
-      out_var->GetMutable<T>();
    }
-    auto* in_var = local_scope_[input_scope_idx]->Var("input");
+    local_scope_[input_scope_idx]->Var("input");
-    in_var->GetMutable<T>();
    bc_op_handle_ = new f::details::BroadcastOpHandle(local_scope_, gpu_list_);
@@ -84,7 +81,6 @@ class BroadcastTester : public ::testing::Test {
      out_var_handle->name_ = "out";
      out_var_handle->version_ = 2;
      out_var_handle->scope_idx_ = j;
-      out_var_handle->generated_op_ = bc_op_handle_;
      bc_op_handle_->AddOutput(out_var_handle);
    }
  }
@@ -109,7 +105,7 @@ class BroadcastTester : public ::testing::Test {
  void TestBroadcastLodTensor() {
    int input_scope_idx = 0;
-    BroadcastInitOp<f::LoDTensor>(input_scope_idx);
+    BroadcastInitOp(input_scope_idx);
    auto in_var = local_scope_[input_scope_idx]->Var("input");
    auto in_lod_tensor = in_var->GetMutable<f::LoDTensor>();
@@ -148,7 +144,7 @@ class BroadcastTester : public ::testing::Test {
  void TestBroadcastSelectedRows() {
    int input_scope_idx = 0;
-    BroadcastInitOp<f::SelectedRows>(input_scope_idx);
+    BroadcastInitOp(input_scope_idx);
    auto in_var = local_scope_[input_scope_idx]->Var("input");
    auto in_selected_rows = in_var->GetMutable<f::SelectedRows>();

--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -14,6 +14,8 @@
 #include "paddle/fluid/framework/details/computation_op_handle.h"
+#include <string>
 namespace paddle {
 namespace framework {
 namespace details {
@@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() {
    }
  }
-  op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
+  op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
 }
 std::string ComputationOpHandle::Name() const { return op_->Type(); }

--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -14,6 +14,9 @@
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include <string>
+#include <vector>
 namespace paddle {
 namespace framework {
 namespace details {
@@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() {
  for (size_t i = 0; i < scopes.size(); ++i) {
    auto &scope = scopes[i];
-    auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
+    auto &t = scope->FindVar(kLocalExecScopeName)
+                  ->Get<Scope *>()
+                  ->FindVar(var_name)
+                  ->Get<framework::LoDTensor>();
    if (platform::is_gpu_place(var->place_)) {
 #ifdef PADDLE_WITH_CUDA
      TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);

--- a/paddle/fluid/framework/details/gather_op_handle.cc
+++ b/paddle/fluid/framework/details/gather_op_handle.cc
@@ -28,6 +28,12 @@ void GatherOpHandle::RunImpl() {
      "The number of inputs should be equal to the number of place.");
  PADDLE_ENFORCE_EQ(this->outputs_.size(), 1,
                    "The number of output should be one.");
+  auto in_0_handle = static_cast<VarHandle *>(inputs_[0]);
+  auto pre_in_var =
+      local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
+  PADDLE_ENFORCE(pre_in_var->IsType<framework::SelectedRows>(),
+                 "Currently, gather_op only can gather SelectedRows.");
+  auto pre_place = in_0_handle->place_;
  // Wait input done, this Wait is asynchronous operation
  for (auto *in : inputs_) {
@@ -36,10 +42,6 @@ void GatherOpHandle::RunImpl() {
      in->generated_op_->Wait(dev_ctxes_[p]);
    }
  }
-  auto in_0_handle = static_cast<VarHandle *>(inputs_[0]);
-  auto pre_in_var =
-      local_scopes_[in_0_handle->scope_idx_]->FindVar(in_0_handle->name_);
-  auto pre_place = in_0_handle->place_;
  std::vector<int64_t> out_rows;
  std::vector<Tensor *> in_tensors;
@@ -110,9 +112,9 @@ void GatherOpHandle::RunImpl() {
      s = e;
    }
  } else if (pre_in_var->IsType<framework::LoDTensor>()) {
-    // gather LoDTensor ???
+    PADDLE_THROW("Currently, Var only can be SelectedRows.");
  } else {
-    PADDLE_THROW("Var should be LoDTensor or SelectedRows.");
+    PADDLE_THROW("Var should be SelectedRows.");
  }
 }

--- a/paddle/fluid/framework/details/gather_op_handle_test.cc
+++ b/paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -56,15 +56,12 @@ class GatherTester : public ::testing::Test {
    }
  }
-  template <class T>
  void InitGatherOp(int input_scope_idx) {
    for (size_t j = 0; j < gpu_list_.size(); ++j) {
      local_scope_.push_back(&g_scope_.NewScope());
-      auto* out_var = local_scope_[j]->Var("input");
+      local_scope_[j]->Var("input");
-      out_var->GetMutable<T>();
    }
-    auto* in_var = local_scope_[input_scope_idx]->Var("out");
+    local_scope_[input_scope_idx]->Var("out");
-    in_var->GetMutable<T>();
    gather_op_handle_ = new f::details::GatherOpHandle(local_scope_, gpu_list_);
@@ -106,11 +103,9 @@ class GatherTester : public ::testing::Test {
    }
  }
-  void TestGatherLodTensor() {}
  void TestGatherSelectedRows() {
    int output_scope_idx = 0;
-    InitGatherOp<f::SelectedRows>(output_scope_idx);
+    InitGatherOp(output_scope_idx);
    int height = kDims[0] * 2;
    std::vector<int64_t> rows{0, 1, 2, 3, 3, 0, 14, 7, 3, 1,
@@ -169,21 +164,12 @@ class GatherTester : public ::testing::Test {
  f::details::GatherOpHandle* gather_op_handle_;
 };
-// TEST_F(GatherTester, TestCPUGatherTestLodTensor) {
-//  InitCtxOnGpu(false);
-//  TestGatherLodTensor();
-//}
 TEST_F(GatherTester, TestCPUGatherTestSelectedRows) {
  InitCtxOnGpu(false);
  TestGatherSelectedRows();
 }
 #ifdef PADDLE_WITH_CUDA
-// TEST_F(GatherTester, TestGPUGatherTestLodTensor) {
-//  InitCtxOnGpu(true);
-//  TestGatherLodTensor();
-//}
 TEST_F(GatherTester, TestGPUGatherTestSelectedRows) {
  InitCtxOnGpu(true);

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -32,6 +32,8 @@ namespace details {
 // temporarily place it in op_handle.
 Tensor *GetTensorFromVar(Variable *in_var);
+constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
 class OpHandleBase {
 private:
  DISABLE_COPY_AND_ASSIGN(OpHandleBase);

--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -15,13 +15,15 @@
 #pragma once
 #include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 namespace paddle {
 namespace framework {
 namespace details {
 class SSAGraphExecutor {
  DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
    ready_ops.clear();
  };
-  // Create local scopes.
-  for (auto &scope : local_scopes_) {
-    auto &local_scope = scope->NewScope();
-    *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>() = &local_scope;
-  }
  // Step 3. Execution
  while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
    // 1. Run All Ready ops
@@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
  PADDLE_ENFORCE(ready_ops.empty());
  PADDLE_ENFORCE(delayed_ops.empty());
  PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
-  ++computation_count_;
-  auto sync_computation = [&] {
-    computation_count_ = 0;
-    // Wait All computational streams
-    for (auto p : this->places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
-    }
-    for (auto &scope : local_scopes_) {
-      scope->DropKids();
-    }
-  };
  // Wait FetchOps.
  if (!fetch_ops.empty()) {
    fetch_ops.clear();
-    sync_computation();
-  }
-  if (computation_count_ == max_async_computation) {
-    sync_computation();
-  }
-  // NOTE: the temp scope can be dropped lazily if needed.
-  // Drop tmp scopes;
-  for (auto &scope : local_scopes_) {
-    auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>();
-    kid = nullptr;
  }
  return fetch_data;

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  std::unique_ptr<platform::EnforceNotMet> exception_;
  std::atomic<int> running_ops_;
  bool allow_op_delay_;
-  size_t computation_count_{0};
-  size_t max_async_computation{100};
 };
 }  // namespace details

--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -46,7 +46,8 @@ proto::VarType::Type GetDataTypeOfVar(const Variable* var) {
  }
 }
-static DDim GetDims(const Scope& scope, const std::string& name) {
+static DDim GetDims(const Scope& scope, const std::string& name,
+                    bool get_actual_dim = false) {
  Variable* var = scope.FindVar(name);
  if (var == nullptr) {
    return DDim({-1});
@@ -55,7 +56,11 @@ static DDim GetDims(const Scope& scope, const std::string& name) {
  if (var->IsType<LoDTensor>()) {
    return var->Get<LoDTensor>().dims();
  } else if (var->IsType<SelectedRows>()) {
-    return var->Get<SelectedRows>().GetCompleteDims();
+    if (get_actual_dim) {
+      return var->Get<SelectedRows>().value().dims();
+    } else {
+      return var->Get<SelectedRows>().GetCompleteDims();
+    }
  } else {
    return DDim({-1});
  }
@@ -129,7 +134,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    for (size_t i = 0; i < input.second.size(); ++i) {
      ss << input.second[i];
      if (scope) {
-        ss << "[" << GetDims(*scope, input.second[i]) << "]";
+        ss << "[" << GetDims(*scope, input.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, input.second[i]) << ")";
      }
      if (i != input.second.size() - 1) {
@@ -149,7 +154,7 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
    for (size_t i = 0; i < output.second.size(); ++i) {
      ss << output.second[i];
      if (scope) {
-        ss << "[" << GetDims(*scope, output.second[i]) << "]";
+        ss << "[" << GetDims(*scope, output.second[i], true) << "]";
        ss << "(" << GetLoD(*scope, output.second[i]) << ")";
      }
      if (i != output.second.size() - 1) {

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/parallel_executor.h"
 #include <string>
+#include <tuple>
 #include <vector>
 #ifdef PADDLE_WITH_CUDA
@@ -41,6 +42,8 @@ class ParallelExecutorPrivate {
 #ifdef PADDLE_WITH_CUDA
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
+  std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
 };
 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor(
      allow_op_delay));
  // Step 3. Create vars in each scope;
-  for (auto *scope : member_->local_scopes_) {
+  for (auto *var : main_program.Block(0).AllVars()) {
-    for (auto *var : main_program.Block(0).AllVars()) {
+    member_->var_types_.emplace_back(var->Name(), var->GetType(),
-      if (scope->FindVar(var->Name()) != nullptr) {
+                                     var->Persistable());
-        continue;
-      }
-      InitializeVariable(scope->Var(var->Name()), var->GetType());
-    }
  }
 }
@@ -163,9 +161,42 @@ void ParallelExecutor::Run(
    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
  platform::RecordBlock b(0);
  SplitTensorToPlaces(feed_tensors);
+  // Create local scopes.
+  for (auto &scope : member_->local_scopes_) {
+    Scope &local_scope = scope->NewScope();
+    *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+        &local_scope;
+    for (auto &name_type_pair : member_->var_types_) {
+      if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
+        continue;
+      }
+      if (std::get<2>(name_type_pair)) {  // Persistable
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      } else {
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      }
+    }
+  }
  auto fetch_data = member_->executor_->Run(fetch_tensors);
  *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
      fetch_data;
+  // Wait All computational streams
+  for (auto p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
+  for (auto &scope : member_->local_scopes_) {
+    auto &local_scope =
+        *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+    scope->DeleteScope(local_scope);
+    local_scope = nullptr;
+  }
 }
 void ParallelExecutor::SplitTensorToPlaces(

--- a/paddle/fluid/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
@@ -14,18 +14,17 @@ limitations under the License. */
 #include "paddle/fluid/framework/prune.h"
+#include <gtest/gtest.h>
+#include <string>
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/net_op.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include <gtest/gtest.h>
 namespace f = paddle::framework;
-namespace ops = paddle::operators;
 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
           const f::VariableNameMap &outputs, f::AttributeMap attrs,

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
-set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init)
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
 cc_library(paddle_fluid_api
    SRCS io.cc
@@ -11,7 +11,7 @@ cc_library(paddle_fluid DEPS ${fluid_modules})
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
    SRCS io.cc
-    DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
+    DEPS ${fluid_modules})
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
  # TODO(liuyiqun): Temporarily disable the link flag because it is not support on Mac.

--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -17,10 +17,16 @@ limitations under the License. */
 #include <fstream>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/pybind/pybind.h"
 namespace paddle {
 namespace inference {
+// Temporarilly add this function for exposing framework::InitDevices() when
+// linking the inference shared library.
+void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
 void ReadBinaryFile(const std::string& filename, std::string& contents) {
  std::ifstream fin(filename, std::ios::in | std::ios::binary);
  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);

--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -18,12 +18,15 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 namespace paddle {
 namespace inference {
+void Init(bool init_p2p);
 void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
                      const framework::ProgramDesc& main_program,
                      const std::string& dirname,

--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -17,7 +17,7 @@ function(inference_test TARGET_NAME)
    string(REGEX REPLACE "^_$" "" arg "${arg}")
    cc_test(test_inference_${TARGET_NAME}${arg}
        SRCS test_inference_${TARGET_NAME}.cc
-        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+        DEPS paddle_fluid
        ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
    set_tests_properties(test_inference_${TARGET_NAME}${arg}
        PROPERTIES DEPENDS test_${TARGET_NAME})

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -100,7 +100,7 @@ function(op_library TARGET)
    endif()
    # Define operators that don't need pybind here.
-    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+    foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
            set(pybind_flag 1)
        endif()
@@ -199,7 +199,6 @@ else()
    set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
 endif()
-op_library(cond_op DEPS framework_proto tensor net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
@@ -259,7 +258,6 @@ endforeach()
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)

--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/concat_op.h"
 #include <string>
 #include <vector>
@@ -34,7 +35,10 @@ class ConcatOp : public framework::OperatorWithKernel {
    size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
    const size_t n = ins.size();
-    PADDLE_ENFORCE_GT(n, 1, "Input tensors count should > 1.");
+    PADDLE_ENFORCE_GT(n, 0, "Input tensors count should > 0.");
+    if (n == 1) {
+      VLOG(3) << "Warning: concat op have only one input, may waste memory";
+    }
    auto out_dims = ins[0];
    size_t in_zero_dims_size = out_dims.size();

--- a/paddle/fluid/operators/cond_op.cc
+++ b/paddle/fluid/operators/cond_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/cond_op.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
-#include "paddle/fluid/platform/device_context.h"
-namespace paddle {
-namespace operators {
-using Scope = framework::Scope;
-using Variable = framework::Variable;
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DDim = framework::DDim;
-framework::Scope& CondOp::AddSubScope(const Scope& scope) const {
-  auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
-  auto& sub_scope = scope.NewScope();
-  sub_scopes->push_back(&sub_scope);
-  return sub_scope;
-}
-std::vector<framework::Scope*>& CondOp::GetSubScopes(
-    const framework::Scope& scope) const {
-  auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  return *sub_scopes_var->GetMutable<std::vector<framework::Scope*>>();
-}
-LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const {
-  auto index_tensors_var = scope.FindVar("IndexTensors");
-  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
-                          "Output(IndexTensors) of CondOp should not be null.");
-  auto& index_tensors =
-      *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
-  index_tensors.push_back(LoDTensor());
-  return index_tensors.back();
-}
-std::vector<framework::LoDTensor>& CondOp::GetIndexTensors(
-    const framework::Scope& scope) const {
-  auto* index_tensors_var = scope.FindVar("IndexTensors");
-  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
-                          "Output(IndexTensors) of CondOp should not be null.");
-  return *index_tensors_var->GetMutable<std::vector<framework::LoDTensor>>();
-}
-void CondOp::PrepareDataForSubnet(
-    const framework::Scope& scope,
-    const platform::DeviceContext& dev_ctx) const {
-  PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty.");
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    // Create two sub scopes for true and false branches
-    //   sub_scopes[0] for the true branch
-    //   sub_scopes[1] for the false branch
-    AddSubScope(scope);
-    // Create two tensors for true and false indices:
-    //   index_tensors[0] for the true branch
-    //   index_tensors[1] for the false branch
-    AddIndexTensor(scope);
-  }
-  Variable* cond_var = scope.FindVar(Input("Cond"));
-  PADDLE_ENFORCE_NOT_NULL(cond_var,
-                          "Input(Cond) of CondOp should not be null.");
-  const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
-  // get the true/false index at runtime according to cond tensor
-  // index_vectors[0]: vector<int>, contains all index for cond[i] == true
-  // index_vectors[1]: vector<int>, contains all index for cond[i] == false
-  std::vector<std::vector<int>> index_vectors;
-  index_vectors.resize(BRANCH_NUM);
-  const int* cond_data = cond->data<int>();
-  for (int i = 0; i < cond->dims()[0]; ++i) {
-    if (cond_data[i])
-      index_vectors[TRUE_BRANCH].push_back(i);
-    else
-      index_vectors[FALSE_BRANCH].push_back(i);
-  }
-  // put index_vectors[0] and index_vectors[1] into two tensors:
-  // index_tensors[0] and index_tensors[1]
-  std::vector<framework::LoDTensor>& index_tensors = GetIndexTensors(scope);
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    DDim dim = {static_cast<int64_t>(index_vectors[i].size())};
-    int* index_tensor_data_ptr =
-        index_tensors[i].mutable_data<int>(dim, platform::CPUPlace());
-    memcpy(index_tensor_data_ptr, index_vectors[i].data(),
-           dim[0] * sizeof(int));
-  }
-  // create input in subscopes according to index_vectors
-  for (auto& input : Inputs("Xs")) {
-    Variable* var_parent = scope.FindVar(input);
-    PADDLE_ENFORCE_NOT_NULL(var_parent);
-    const auto* tensor_parent = &var_parent->Get<LoDTensor>();
-    for (int i = 0; i < BRANCH_NUM; ++i) {
-      Variable* var_child = sub_scopes[i]->FindVar(input);
-      PADDLE_ENFORCE_NOT_NULL(var_child);
-      auto* tensor_child = var_child->GetMutable<LoDTensor>();
-      // Resize child
-      DDim dim = tensor_parent->dims();
-      dim[0] = index_tensors[i].dims()[0];
-      tensor_child->mutable_data<float>(dim, platform::CPUPlace());
-      CPUGather<float>(dev_ctx, *tensor_parent, index_tensors[i], tensor_child);
-    }
-  }
-  // create output_tensors in subscope for sub_net
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    for (auto& output : (*sub_net_op_[i]).Outputs()) {
-      for (auto& var_name : output.second) {
-        sub_scopes[i]->Var(var_name);
-      }
-    }
-  }
-}
-void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
-                                 const platform::DeviceContext& dev_ctx) const {
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-  const std::vector<framework::LoDTensor>& index_tensors =
-      GetIndexTensors(scope);
-  // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0]
-  PADDLE_ENFORCE(!Outputs("Outs").empty(),
-                 "Outputs(Outs) of CondOp can't be empty.");
-  for (auto& output : Outputs("Outs")) {
-    const LoDTensor* tensor_t_out =
-        &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
-    const LoDTensor* tensor_f_out =
-        &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
-    auto* var_out = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found");
-    LoDTensor* tensor_out = var_out->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out,
-                            "True output tensor should not be NULL");
-    DDim true_dim = tensor_t_out->dims();
-    DDim false_dim = tensor_f_out->dims();
-    true_dim[0] = 0;
-    false_dim[0] = 0;
-    PADDLE_ENFORCE_EQ(true_dim, false_dim,
-                      "Outputs not of the same shape except the first dim");
-    DDim out_dim = tensor_t_out->dims();
-    out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0];
-    tensor_out->Resize(out_dim);
-    tensor_out->mutable_data<float>(platform::CPUPlace());
-  }
-  // merge output results:
-  // output_tensor = true_output_tensor + false_output_tensor
-  for (auto& output : Outputs("Outs")) {
-    Variable* var_parent = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(var_parent);
-    auto* tensor_parent = var_parent->GetMutable<LoDTensor>();
-    for (int i = 0; i < BRANCH_NUM; ++i) {
-      Variable* var_child = sub_scopes[i]->FindVar(output);
-      PADDLE_ENFORCE_NOT_NULL(var_child);
-      auto* tensor_child = &var_child->Get<LoDTensor>();
-      ScatterAssign<float>(dev_ctx, *tensor_child, index_tensors[i],
-                           tensor_parent);
-    }
-  }
-}
-void CondOp::RunImpl(const Scope& scope, const platform::Place& place) const {
-  // get device context from pool
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& dev_ctx = *pool.Get(place);
-  PrepareDataForSubnet(scope, dev_ctx);
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    sub_net_op_[i]->Run(*sub_scopes[i], place);
-  }
-  MergeDataFromSubnet(scope, dev_ctx);
-}
-class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Cond", "The condition, which is a bool vector");
-    AddInput("Xs", "Inputs of Subnets").AsDuplicable();
-    AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable();
-    AddOutput("SubScopes", "sub scopes for true and false branches");
-    AddOutput("IndexTensors", "Index Tensors contains indices for true/false");
-    AddComment(R"DOC(
-Sample Dependent Conditional Operator.
-Given Cond[i] as a 1/0 vector to indicate true/false:
-Out[i] = subnet_true[i], if Cond[i] == true
-Out[i] = subnet_false[i], if Cond[i] == false
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp,
-                             paddle::operators::CondOpProtoAndCheckerMaker);
--- a/paddle/fluid/operators/cond_op.h
+++ b/paddle/fluid/operators/cond_op.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <string>
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/net_op.h"
-namespace paddle {
-namespace operators {
-/*
- * @brief CondOp is a dynamic if-else Operator
- *
- * It has a input tensor named cond indicating which netop each instance will
- * run.
- *
- * if cond == 1, it will run true_net, which is a NetOp.
- *
- * if cond == 0, it will run false_net, which is another NetOp.
- */
-class CondOp : public framework::OperatorBase {
- public:
-  CondOp(const std::string& type, const framework::VariableNameMap& inputs,
-         const framework::VariableNameMap& outputs,
-         const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {
-    sub_net_op_.resize(BRANCH_NUM);
-  }
-  CondOp(const CondOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    // TODO(yuyang18): Implement copy ctor well.
-    PADDLE_THROW("Not implemented");
-  }
-  framework::Scope& AddSubScope(const framework::Scope& scope) const;
-  std::vector<framework::Scope*>& GetSubScopes(
-      const framework::Scope& scope) const;
-  framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const;
-  std::vector<framework::LoDTensor>& GetIndexTensors(
-      const framework::Scope& scope) const;
-  void PrepareDataForSubnet(const framework::Scope& scope,
-                            const platform::DeviceContext& dev_ctx) const;
-  void MergeDataFromSubnet(const framework::Scope& scope,
-                           const platform::DeviceContext& dev_ctx) const;
-  /*
-   * Set True Block
-   */
-  void set_truenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[TRUE_BRANCH] = std::move(net);
-  }
-  /*
-   * Set False Block
-   */
-  void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[FALSE_BRANCH] = std::move(net);
-  }
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override;
- private:
-  const int TRUE_BRANCH = 0;
-  const int FALSE_BRANCH = 1;
-  const int BRANCH_NUM = 2;
-  // sub_net_op_[0]: subnet_t
-  // sub_net_op_[1]: subnet_f
-  std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
-};
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/ctc_align_op.cu
+++ b/paddle/fluid/operators/ctc_align_op.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <stdio.h>
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+#include <vector>
 #include "paddle/fluid/operators/ctc_align_op.h"
 namespace paddle {

--- a/paddle/fluid/operators/ctc_align_op.h
+++ b/paddle/fluid/operators/ctc_align_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string.h>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"

--- a/paddle/fluid/operators/detail/grpc_server.cc
+++ b/paddle/fluid/operators/detail/grpc_server.cc
@@ -161,6 +161,7 @@ class RequestPrefetch final : public RequestBase {
    ::grpc::ByteBuffer reply;
    std::string var_name = request_->OutVarname();
+    VLOG(3) << "prefetch var " << var_name;
    auto var_desc = program_->Block(0).FindVar(var_name);
    framework::Scope* local_scope = &scope_->NewScope();
    auto* var = local_scope->FindVar(var_name);

--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -106,18 +107,18 @@ information. However, the output only shares the LoD information with input $X$.
 protected:
  std::string comment_;
-  void Replace(std::string& src, std::string from, std::string to) {
+  void Replace(std::string* src, std::string from, std::string to) {
    std::size_t len_from = std::strlen(from.c_str());
    std::size_t len_to = std::strlen(to.c_str());
-    for (std::size_t pos = src.find(from); pos != std::string::npos;
+    for (std::size_t pos = src->find(from); pos != std::string::npos;
-         pos = src.find(from, pos + len_to)) {
+         pos = src->find(from, pos + len_to)) {
-      src.replace(pos, len_from, to);
+      src->replace(pos, len_from, to);
    }
  }
  void SetComment(std::string name, std::string equation) {
-    Replace(comment_, "{name}", name);
+    Replace(&comment_, "{name}", name);
-    Replace(comment_, "{equation}", equation);
+    Replace(&comment_, "{equation}", equation);
  }
 };

--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/gru_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
@@ -13,15 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/gru_compute.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/im2sequence_op.h"
+#include <vector>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -13,7 +13,7 @@
   limitations under the License. */
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"

--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/label_smooth_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -100,7 +100,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
    auto x_row_max = EigenMatrix<T>::From(emission_row_max);
    x_row_max.device(place) =
        x.maximum(Eigen::DSizes<int, 1>(1))
-            .reshape(Eigen::DSizes<int, 2>(int(batch_size), 1));
+            .reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
    auto x_exps = EigenMatrix<T>::From(*emission_exps);
    x_exps.device(place) =

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <ostream>
-#include <thread>
+#include <thread>  // NOLINT
+#include <vector>
 #include "paddle/fluid/operators/listen_and_serv_op.h"
@@ -88,8 +89,9 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
  auto ins = Inputs("X");
  auto fan_in = Attr<int>("Fanin");
-  auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
+  auto *optimize_block = Attr<framework::BlockDesc *>(kOptimizeBlock);
-  auto *program = block->Program();
+  auto *prefetch_block = Attr<framework::BlockDesc *>(kPrefetchBlock);
+  auto *program = optimize_block->Program();
  size_t num_blocks = program->Size();
  PADDLE_ENFORCE_GE(num_blocks, 2,
                    "server program should have at least 2 blocks");
@@ -97,18 +99,25 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
  framework::Executor executor(dev_place);
  std::vector<int> block_list;
  for (size_t blkid = 1; blkid < num_blocks; ++blkid) {
-    block_list.push_back(blkid);
+    if (blkid != prefetch_block->ID()) {
+      block_list.push_back(blkid);
+    }
  }
-  auto prepared = executor.Prepare(*program, block_list);
+  auto optimize_prepared = executor.Prepare(*program, block_list);
  // Insert placeholder for block0 which holds current op itself.
-  prepared.insert(prepared.begin(),
+  optimize_prepared.insert(
-                  std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
+      optimize_prepared.begin(),
+      std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
  rpc_service_->SetScope(&recv_scope);
  rpc_service_->SetDevCtx(&dev_ctx);
  // TODO(qiao) set proper fields for table lookup and update
  rpc_service_->SetExecutor(&executor);
-  rpc_service_->SetPrefetchBlkdId(0);
+  VLOG(3) << "prefetch block id is " << prefetch_block->ID();
+  auto prefetch_prepared = executor.Prepare(*program, prefetch_block->ID());
+  rpc_service_->SetPrefetchBlkdId(prefetch_block->ID());
+  rpc_service_->SetPrefetchPreparedCtx(prefetch_prepared.get());
+  prefetch_prepared.release();
  rpc_service_->SetProgram(program);
  // start the server listening after all member initialized.
  server_thread_.reset(new std::thread(RunServer, rpc_service_));
@@ -166,16 +175,18 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope,
    parallel_blkids.push_back(1);
    double ts = detail::GetTimestamp();
    for (size_t blkid = 2; blkid < num_blocks; ++blkid) {
-      if (program->Block(blkid).Parent() != last_parent_blkid) {
+      if (blkid != prefetch_block->ID()) {
-        ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
+        if (program->Block(blkid).Parent() != last_parent_blkid) {
-                              &recv_scope);
+          ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
-        parallel_blkids.clear();
+                                program, &recv_scope);
-        last_parent_blkid = program->Block(blkid).Parent();
+          parallel_blkids.clear();
+          last_parent_blkid = program->Block(blkid).Parent();
+        }
+        parallel_blkids.push_back(blkid);
      }
-      parallel_blkids.push_back(blkid);
    }
-    ParallelExecuteBlocks(parallel_blkids, &executor, prepared, program,
+    ParallelExecuteBlocks(parallel_blkids, &executor, optimize_prepared,
-                          &recv_scope);
+                          program, &recv_scope);
    VLOG(2) << "run all blocks spent " << detail::GetTimestamp() - ts << "(ms)";
    // Reset the received sparse variables, the sum operator would not
@@ -211,6 +222,8 @@ from send_op and send back variables to recv_op.
        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
    AddAttr<framework::BlockDesc *>(kOptimizeBlock,
                                    "BlockID to run on server side.");
+    AddAttr<framework::BlockDesc *>(kPrefetchBlock,
+                                    "prefetch block to run on server side.");
    AddAttr<int>("Fanin", "How many clients send to this server.")
        .SetDefault(1);
  }

--- a/paddle/fluid/operators/listen_and_serv_op.h
+++ b/paddle/fluid/operators/listen_and_serv_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <stdint.h>
 #include <ostream>
+#include <string>
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -27,6 +28,7 @@ namespace paddle {
 namespace operators {
 constexpr char kOptimizeBlock[] = "OptimizeBlock";
+constexpr char kPrefetchBlock[] = "PrefetchBlock";
 void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service);

--- a/paddle/fluid/operators/logical_op.cc
+++ b/paddle/fluid/operators/logical_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/logical_op.h"
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 namespace paddle {

--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -78,6 +78,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
                  "(boolean, default false) "
                  "Sparse update.")
        .SetDefault(false);
+    AddAttr<bool>("is_distributed",
+                  "(boolean, default false) distributed lookup table.")
+        .SetDefault(false);
    AddAttr<int64_t>("padding_idx",
                     "(int64, default -1) "
                     "If the value is -1, it makes no effect to lookup. "

--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/lrn_op.h"
+#include <string>
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif

--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/lstm_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"

--- a/paddle/fluid/operators/lstm_unit_op.cu
+++ b/paddle/fluid/operators/lstm_unit_op.cu
@@ -18,6 +18,7 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include "paddle/fluid/operators/lstm_unit_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/hostdevice.h"

--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/lstmp_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"

--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/matmul_op.h"
+#include <algorithm>
+#include <vector>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/matmul_op.h
+++ b/paddle/fluid/operators/matmul_op.h
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <algorithm>
+#include <functional>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/matmul.h"

--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
@@ -13,6 +13,8 @@
 *     limitations under the License. */
 #include "paddle/fluid/operators/maxout_op.h"
+#include <vector>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/minus_op.h"
-#include "paddle/fluid/operators/net_op.h"
+#include <string>
+#include <vector>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/momentum_op.h"
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/mul_op.h"
+#include <vector>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/net_op.cc
+++ b/paddle/fluid/operators/net_op.cc
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/net_op.h"
-#include <set>
-#include "paddle/fluid/framework/op_registry.h"
-namespace paddle {
-namespace operators {
-const char NetOp::kAll[] = "all";
-void NetOp::CompleteAddOp(bool calc) {
-  add_op_done_ = true;
-  if (!calc) return;
-  std::set<std::string> input_set;
-  std::set<std::string> output_set;
-  for (auto& op : ops_) {
-    for (auto& ipt : op->Inputs()) {
-      for (auto& var_name : ipt.second) {
-        // If input variable has been in output set, then it will be
-        // added into intermediate_outputs_. Otherwise, it will be
-        // added into input set.
-        if (Contains(output_set, var_name)) {
-          intermediate_outputs_.insert(var_name);
-        } else {
-          input_set.insert(var_name);
-        }
-      }
-    }
-    for (auto& opt : op->Outputs()) {
-      for (auto& var_name : opt.second) {
-        output_set.insert(var_name);
-      }
-    }
-  }
-  auto& inputs = inputs_[kAll];
-  inputs.reserve(input_set.size());
-  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs));
-  auto& outputs = outputs_[kAll];
-  outputs.reserve(output_set.size());
-  std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs));
-}
-std::string NetOp::DebugStringEx(const framework::Scope* scope) const {
-  std::ostringstream os;
-  os << OperatorBase::DebugStringEx(scope) << std::endl;
-  for (auto& op : ops_) {
-    std::istringstream is(op->DebugStringEx(scope));
-    for (std::string line; std::getline(is, line);) {
-      os << "    " << line << std::endl;
-    }
-  }
-  return os.str();
-}
-bool NetOp::IsNetOp() const { return true; }
-std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
-  std::vector<std::string> all;
-  for (auto& pair : this->outputs_) {
-    for (auto& var_name : pair.second) {
-      all.push_back(var_name);
-    }
-  }
-  if (has_intermediate) {
-    return all;
-  }
-  std::vector<std::string> ret_val;
-  for (auto& each : all) {
-    if (!Contains(intermediate_outputs_, each)) {
-      ret_val.push_back(each);
-    }
-  }
-  return ret_val;
-}
-NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs,
-             const framework::VariableNameMap& outputs,
-             const framework::AttributeMap& attrs)
-    : framework::OperatorBase(type, inputs, outputs, attrs) {}
-std::unique_ptr<framework::OperatorBase> NetOp::Clone() const {
-  PADDLE_ENFORCE(
-      add_op_done_,
-      "Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone");
-  return std::unique_ptr<OperatorBase>(new NetOp(*this));
-}
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/net_op.h
+++ b/paddle/fluid/operators/net_op.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <set>
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/op_registry.h"
-namespace paddle {
-namespace operators {
-/**
- * @brief Network is also a type of Operator
- *
- * It will manage the operators it has.
- *
- * Network is the container and controller of a set of operators.
- * A network object knows all Operators belonging to this network. Variables,
- * which are inputs and outputs of these operators, are created and managed by a
- * hierarchy of Scope objects.
- *
- * This is the base class of network, all the networks should implement the APIs
- * it defines.
- */
-class NetOp : public framework::OperatorBase {
- public:
-  static const char kAll[];
-  NetOp()
-      : framework::OperatorBase("plain_net", framework::VariableNameMap{},
-                                framework::VariableNameMap{},
-                                framework::AttributeMap{}) {}
-  NetOp(const std::string& type, const framework::VariableNameMap& inputs,
-        const framework::VariableNameMap& outputs,
-        const framework::AttributeMap& attrs);
-  NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) {
-    this->ops_.reserve(o.ops_.size());
-    std::transform(
-        o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_),
-        [](const std::unique_ptr<framework::OperatorBase>& op) {
-          return std::unique_ptr<framework::OperatorBase>(op->Clone());
-        });
-    this->CompleteAddOp();
-  }
-  bool SupportGPU() const override {
-    for (auto& op : ops_) {
-      if (!op->SupportGPU()) {
-        return false;
-      }
-    }
-    return true;
-  }
-  void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); }
-  /**
-   * @brief Add an operator by ptr
-   */
-  void AppendOp(std::unique_ptr<framework::OperatorBase> op) {
-    PADDLE_ENFORCE(!add_op_done_,
-                   "Cannot AppendOp when this network is sealed");
-    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
-    ops_.push_back(std::move(op));
-  }
-  void InsertOp(size_t pos, std::unique_ptr<framework::OperatorBase> op) {
-    PADDLE_ENFORCE(!add_op_done_,
-                   "Cannot InsertOp when this network is sealed");
-    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
-    PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range");
-    ops_.insert(ops_.begin() + pos, std::move(op));
-  }
-  void InsertOp(size_t pos, const framework::OperatorBase& op) {
-    InsertOp(pos, op.Clone());
-  }
-  void CompleteAddOp(bool calculate = true);
-  std::string DebugStringEx(
-      const framework::Scope* scope = nullptr) const override;
-  bool IsNetOp() const override;
-  std::vector<std::string> OutputVars(bool has_intermediate) const override;
-  std::unique_ptr<framework::OperatorBase> Clone() const override;
-  std::vector<std::unique_ptr<framework::OperatorBase>> ops_;
- private:
-  /**
-   * @brief Run the network.
-   *
-   * Run all the operators with the `scope`, if no scope is provided, default
-   * scope will be used instead. If no OpContext is provicded, default context
-   * will be used.
-   */
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    for (auto& op : ops_) {
-      op->Run(scope, place);
-    }
-  }
-  bool add_op_done_{false};
-  std::set<std::string> intermediate_outputs_;
-  template <typename T, typename KeyType>
-  static bool Contains(T container, KeyType key) {
-    return container.find(key) != container.end();
-  }
-};
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/net_op_test.cc
+++ b/paddle/fluid/operators/net_op_test.cc
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/net_op.h"
-#include <gtest/gtest.h>
-namespace paddle {
-namespace operators {
-using Scope = framework::Scope;
-using DeviceContext = platform::DeviceContext;
-static int run_cnt = 0;
-class TestOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-  DEFINE_OP_CLONE_METHOD(TestOp);
- private:
-  void RunImpl(const Scope& scope,
-               const platform::Place& place) const override {
-    ++run_cnt;
-  }
-};
-template <typename T>
-void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
-                                  const std::vector<T>& actual) {
-  ASSERT_EQ(expected.size(), actual.size());
-  std::unordered_set<T> expected_set;
-  for (auto& tmp : expected) {
-    expected_set.insert(tmp);
-  }
-  for (auto& act : actual) {
-    ASSERT_NE(expected_set.end(), expected_set.find(act));
-  }
-}
-TEST(OpKernel, all) {
-  auto net = std::make_shared<NetOp>();
-  ASSERT_NE(net, nullptr);
-  net->AppendOp(std::unique_ptr<TestOp>(
-      new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                 {{"Out", {"y"}}}, framework::AttributeMap{})));
-  net->AppendOp(std::unique_ptr<TestOp>(
-      new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
-                 {{"Out", {"z"}}}, framework::AttributeMap{})));
-  net->CompleteAddOp();
-  AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"},
-                               net->Inputs(NetOp::kAll));
-  AssertSameVectorWithoutOrder({"y", "z"}, net->Outputs(NetOp::kAll));
-  auto final_outs = net->OutputVars(false);
-  ASSERT_EQ(final_outs.size(), 1UL);
-  ASSERT_EQ(final_outs[0], "z");
-}
-TEST(NetOp, insert_op) {
-  NetOp net;
-  auto op1 = std::unique_ptr<framework::NOP>(
-      new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                         {{"Out", {"y"}}}, framework::AttributeMap{}));
-  net.AppendOp(*op1);
-  net.InsertOp(0, *op1);
-  ASSERT_EQ(2UL, net.ops_.size());
-  net.InsertOp(2, std::move(op1));
-  ASSERT_EQ(3UL, net.ops_.size());
-}
-TEST(NetOp, Clone) {
-  NetOp net;
-  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
-      "empty", framework::VariableNameMap{}, framework::VariableNameMap{},
-      framework::AttributeMap{}}));
-  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
-      "empty2", framework::VariableNameMap{}, framework::VariableNameMap{},
-      framework::AttributeMap{}}));
-  net.CompleteAddOp(true);
-  auto new_net_op = net.Clone();
-  ASSERT_NE(new_net_op, nullptr);
-  ASSERT_TRUE(new_net_op->IsNetOp());
-  auto* new_net = static_cast<NetOp*>(new_net_op.get());
-  ASSERT_EQ(2UL, new_net->ops_.size());
-  ASSERT_EQ(new_net->ops_[0]->Type(), "empty");
-  ASSERT_EQ(new_net->ops_[1]->Type(), "empty2");
-}
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/prefetch_op.cc
+++ b/paddle/fluid/operators/prefetch_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <future>
+#include <future>  // NOLINT
 #include <ostream>
 #include "paddle/fluid/framework/data_type.h"
@@ -50,8 +50,8 @@ class PrefetchOp : public framework::OperatorBase {
    for (size_t i = 0; i < ins.size(); i++) {
      if (NeedSend(scope, ins[i])) {
-        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << "to get "
+        VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << " to get "
-                << outs[i] << "back";
+                << outs[i] << " back";
        rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i],
                                          outs[i]);
      } else {
@@ -71,7 +71,7 @@ class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker {
              "(RPCClient) The RPC client object which will be"
              "initialized at most once.");
    AddOutput("Out",
-              "(SelectedRows) result "
+              "(LoDTensor) result "
              "to be fetched from parameter server")
        .AsDuplicable();
    AddAttr<std::vector<std::string>>(

--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/prelu_op.h"
-#include "paddle/fluid/operators/net_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/scale_op.h"
-#include "paddle/fluid/operators/net_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/send_recv_op_test.cc
+++ b/paddle/fluid/operators/send_recv_op_test.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 #include <unistd.h>
 #include <string>
-#include <thread>
+#include <thread>  // NOLINT
 #include "gtest/gtest.h"
 #include "paddle/fluid/framework/op_registry.h"
@@ -37,11 +37,11 @@ namespace m = paddle::operators::math;
 std::unique_ptr<f::OperatorBase> listen_and_serv_op;
 int selected_port;
-void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
+void InitTensorsInScope(const p::CPUPlace &place, f::Scope *scope) {
  p::CPUDeviceContext ctx(place);
  for (int i = 0; i < 2; ++i) {
    auto var_name = paddle::string::Sprintf("x%d", i);
-    auto var = scope.Var(var_name);
+    auto var = scope->Var(var_name);
    auto tensor = var->GetMutable<f::LoDTensor>();
    tensor->Resize({10, 10});
    float *expect = tensor->mutable_data<float>(place);
@@ -50,20 +50,20 @@ void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
    }
  }
-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
  auto out_tensor = out_var->GetMutable<f::LoDTensor>();
  out_tensor->Resize({10, 10});
  out_tensor->mutable_data<float>(place);  // allocate
 }
-void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
+void InitSelectedRowsInScope(const p::CPUPlace &place, f::Scope *scope) {
  p::CPUDeviceContext ctx(place);
  int64_t height = 10;
  int64_t row_numel = 10;
  m::SetConstant<p::CPUDeviceContext, float> set_one;
  // init x0
  std::vector<int64_t> rows0{0, 4, 7};
-  auto x0_var = scope.Var("x0");
+  auto x0_var = scope->Var("x0");
  auto x0 = x0_var->GetMutable<f::SelectedRows>();
  x0->set_rows(rows0);
  x0->set_height(height);
@@ -74,7 +74,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
  // init x1
  std::vector<int64_t> rows1{2, 9};
-  auto x1_var = scope.Var("x1");
+  auto x1_var = scope->Var("x1");
  auto x1 = x1_var->GetMutable<f::SelectedRows>();
  x1->set_rows(rows1);
  x1->set_height(height);
@@ -83,7 +83,7 @@ void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
      f::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), place);
  set_one(ctx, x1_value, 1.0);
-  auto out_var = scope.Var("Out");
+  auto out_var = scope->Var("Out");
  auto out = out_var->GetMutable<f::SelectedRows>();
  auto out_value = out->mutable_value();
  out->set_height(height);
@@ -117,15 +117,16 @@ void StartServerNet(bool is_sparse) {
  f::Scope scope;
  p::CPUPlace place;
  if (is_sparse) {
-    InitSelectedRowsInScope(scope, place);
+    InitSelectedRowsInScope(place, &scope);
  } else {
-    InitTensorsInScope(scope, place);
+    InitTensorsInScope(place, &scope);
  }
  // sub program run in listen_and_serv_op, for simple test we use sum
  f::ProgramDesc program;
  const auto &root_block = program.Block(0);
  auto *optimize_block = program.AppendBlock(root_block);
+  auto *prefetch_block = program.AppendBlock(root_block);
  // X for server side tensors, RX for received tensers, must be of same shape.
  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block);
@@ -135,6 +136,7 @@ void StartServerNet(bool is_sparse) {
  attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
  attrs.insert({"GradList", std::vector<std::string>({"x1"})});
  attrs.insert({"OptimizeBlock", optimize_block});
+  attrs.insert({"PrefetchBlock", prefetch_block});
  listen_and_serv_op =
      f::OpRegistry::CreateOp("listen_and_serv", {{"X", {"x1"}}}, {}, attrs);
  LOG(INFO) << "selected port before run " << selected_port;
@@ -148,7 +150,7 @@ TEST(SendRecvOp, CPUDense) {
  // local net
  f::Scope scope;
  p::CPUPlace place;
-  InitTensorsInScope(scope, place);
+  InitTensorsInScope(place, &scope);
  // create rpc client var
  scope.Var("RPC_CLIENT_VAR");
@@ -191,7 +193,7 @@ TEST(SendRecvOp, CPUSparse) {
  f::Scope scope;
  p::CPUPlace place;
  p::CPUDeviceContext ctx(place);
-  InitSelectedRowsInScope(scope, place);
+  InitSelectedRowsInScope(place, &scope);
  scope.Var("RPC_CLIENT_VAR");
  f::AttributeMap attrs;
  selected_port = static_cast<paddle::operators::ListenAndServOp *>(

--- a/paddle/fluid/operators/send_vars_op.cc
+++ b/paddle/fluid/operators/send_vars_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <future>
+#include <future>  // NOLINT
 #include <ostream>
 #include "paddle/fluid/framework/data_type.h"
@@ -36,7 +36,7 @@ class SendVarsOp : public framework::OperatorBase {
    auto ins = Inputs("X");
    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-    int sync_send = Attr<int>("sync_sent");
+    int sync_send = Attr<int>("sync_send");
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    auto& ctx = *pool.Get(place);

--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@@ -35,8 +35,8 @@ class SGDOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
                      "Learning rate should have 1 element");
    auto param_dim = ctx->GetInputDim("Param");
-    // TODO(qijun): check dimensions of Param and Grad at complie
+    // TODO(qijun): check dimensions of Param and Grad at compile
-    // and run time.
+    // and runtime.
    ctx->SetOutputDim("ParamOut", param_dim);
  }

--- a/paddle/fluid/operators/split_ids_op.cc
+++ b/paddle/fluid/operators/split_ids_op.cc
@@ -48,11 +48,11 @@ class SplitIdsOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out.");
    auto ids_var_type = ctx->GetInputsVarType("Ids").front();
-    PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR);
    auto ids_dims = ctx->GetInputDim("Ids");
-    PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+    if (ids_var_type == framework::proto::VarType::LOD_TENSOR) {
-    PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+      PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+      PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    }
  }
 };
@@ -60,8 +60,9 @@ class SplitIdsOpInferVarType : public framework::VarTypeInference {
 public:
  void operator()(const framework::OpDesc &op_desc,
                  framework::BlockDesc *block) const override {
+    auto *input_var = block->Var(op_desc.Input("Ids")[0]);
    for (auto &out_var : op_desc.Output("Out")) {
-      block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR);
+      block->Var(out_var)->SetType(input_var->GetType());
    }
  }
 };
@@ -73,4 +74,5 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
                  ops::SplitIdsOpInferVarType);
 REGISTER_OP_CPU_KERNEL(
-    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>);
+    split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>,
+    ops::SplitIdsOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/fluid/operators/split_ids_op.h
+++ b/paddle/fluid/operators/split_ids_op.h
@@ -24,35 +24,63 @@ namespace operators {
 template <typename DeviceContext, typename T>
 class SplitIdsOpKernel : public framework::OpKernel<T> {
 public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext &ctx) const override {
    auto place = ctx.GetPlace();
    if (!platform::is_cpu_place(place)) {
      PADDLE_THROW("SplitIds do not support GPU kernel");
    }
-    auto& ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
+    const auto *ids_var = ctx.InputVar("Ids");
-    const T* ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
+    if (ids_var->IsType<framework::LoDTensor>()) {
-    auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
+      const auto &ids_dims = ctx.Input<framework::LoDTensor>("Ids")->dims();
-    const size_t shard_num = outs.size();
+      const T *ids = ctx.Input<framework::LoDTensor>("Ids")->data<T>();
+      auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
+      const size_t shard_num = outs.size();
-    std::vector<std::vector<T>> out_ids;
+      std::vector<std::vector<T>> out_ids;
-    out_ids.resize(outs.size());
+      out_ids.resize(outs.size());
-    // split id by their shard_num.
+      // split id by their shard_num.
-    for (int i = 0; i < ids_dims[0]; ++i) {
+      for (int i = 0; i < ids_dims[0]; ++i) {
-      T id = ids[i];
+        T id = ids[i];
-      size_t shard_id = static_cast<size_t>(id) % shard_num;
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
-      out_ids[shard_id].push_back(id);
+        out_ids[shard_id].push_back(id);
-    }
+      }
+      // create tensor for each shard and send to parameter server
+      for (size_t i = 0; i < out_ids.size(); ++i) {
+        auto *shard_t = outs[i];
+        std::vector<T> ids = out_ids[i];
+        auto *shard_data = shard_t->mutable_data<T>(
+            framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
+        for (size_t i = 0; i < ids.size(); ++i) {
+          shard_data[i] = ids[i];
+        }
+      }
+    } else if (ids_var->IsType<framework::SelectedRows>()) {
+      const auto *ids_selected_rows = ctx.Input<framework::SelectedRows>("Ids");
+      auto &ids_dims = ids_selected_rows->value().dims();
+      PADDLE_ENFORCE_EQ(ids_dims[0], ids_selected_rows->rows().size(), "");
+      const T *ids = ids_selected_rows->value().data<T>();
+      const auto &ids_rows = ids_selected_rows->rows();
+      auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
+      const size_t shard_num = outs.size();
+      // get rows for outputs
+      for (auto &id : ids_rows) {
+        size_t shard_id = static_cast<size_t>(id) % shard_num;
+        outs[shard_id]->mutable_rows()->push_back(id);
+      }
-    // create tensor for each shard and send to parameter server
+      int64_t row_width = ids_dims[1];
-    for (size_t i = 0; i < out_ids.size(); ++i) {
+      for (auto &out : outs) {
-      auto* shard_t = outs[i];
+        out->set_height(ids_selected_rows->height());
-      std::vector<T> ids = out_ids[i];
+        framework::DDim ddim = framework::make_ddim(
-      auto* shard_data = shard_t->mutable_data<T>(
+            {static_cast<int64_t>(out->rows().size()), row_width});
-          framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
+        T *output = out->mutable_value()->mutable_data<T>(ddim, place);
-      for (size_t i = 0; i < ids.size(); ++i) {
+        for (size_t i = 0; i < ddim[0]; ++i) {
-        shard_data[i] = ids[i];
+          memcpy(output + i * row_width, ids + out->rows()[i] * row_width,
+                 row_width * sizeof(T));
+        }
      }
    }
  }

--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/split_op.h"
-#include "paddle/fluid/operators/net_op.h"
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -10,9 +10,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/sum_op.h"
 #include <algorithm>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/var_type_inference.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
@@ -37,7 +39,10 @@ class SumOp : public framework::OperatorWithKernel {
    auto x_dims = ctx->GetInputsDim("X");
    size_t N = x_dims.size();
-    PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
+    PADDLE_ENFORCE_GT(N, 0, "Input tensors count should > 0.");
+    if (N == 1) {
+      VLOG(3) << "Warning: sum have only one input, may waste memory";
+    }
    framework::DDim in_dim({0});
    for (auto& x_dim : x_dims) {

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -2,13 +2,13 @@ if(WITH_PYTHON)
  if(WITH_AMD_GPU)
    hip_library(paddle_pybind SHARED
      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
+      DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
           parallel_executor
      ${GLOB_OP_LIB})
  else()
    cc_library(paddle_pybind SHARED
      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
+      DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
           parallel_executor
      ${GLOB_OP_LIB})
    if(NOT APPLE AND NOT ANDROID)

--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <string>
 #include <tuple>
-#include "paddle/fluid/framework/backward.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -125,23 +124,6 @@ void BindProgramDesc(pybind11::module *m) {
           })
      .def("append_block", &pd::ProgramDesc::AppendBlock,
           pybind11::return_value_policy::reference)
-      .def("append_backward",
-           [](pd::ProgramDesc &program_desc, const pd::VarDesc &target,
-              const std::unordered_set<std::string> &no_grad_vars) {
-             pd::ParamGradInfoMap param_grad_map =
-                 AppendBackward(program_desc, target, no_grad_vars);
-             std::unordered_map<
-                 std::string, std::tuple<std::string /* grad_var_name */,
-                                         int /* block_idx */, int /* op_idx */>>
-                 retv;
-             for (auto it = param_grad_map.begin(); it != param_grad_map.end();
-                  ++it) {
-               const auto &grad_info = it->second;
-               retv[it->first] = std::make_tuple(
-                   grad_info.name_, grad_info.block_idx_, grad_info.op_idx_);
-             }
-             return retv;
-           })
      .def("block", &pd::ProgramDesc::MutableBlock,
           pybind11::return_value_policy::reference)
      .def("num_blocks", &pd::ProgramDesc::Size)

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -20,9 +20,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>
-#include "paddle/fluid/pybind/protobuf.h"
-#include "paddle/fluid/framework/backward.h"
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
@@ -31,18 +28,18 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/cond_op.h"
-#include "paddle/fluid/operators/net_op.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
-#include "paddle/fluid/pybind/pybind.h"
+#include "paddle/fluid/pybind/protobuf.h"
+#include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/recordio.h"
 #include "paddle/fluid/pybind/tensor_py.h"
@@ -239,11 +236,6 @@ All parameter, weight, gradient are variables in Paddle.
           },
           py::return_value_policy::reference)
 #endif
-      .def("get_net",
-           [](Variable &self) -> operators::NetOp * {
-             return self.GetMutable<operators::NetOp>();
-           },
-           py::return_value_policy::reference)
      .def("get_reader",
           [](Variable &self) -> framework::ReaderHolder * {
             PADDLE_ENFORCE(self.IsType<framework::ReaderHolder>());
@@ -388,11 +380,6 @@ All parameter, weight, gradient are variables in Paddle.
                                   desc.InitializationErrorString());
                    return OpRegistry::CreateOp(desc);
                  })
-      .def("backward",
-           [](const OperatorBase &forwardOp,
-              const std::unordered_set<std::string> &no_grad_vars) {
-             return Backward(forwardOp, no_grad_vars).release();
-           })
      .def("run",
           [](OperatorBase &self, const Scope &scope,
              const platform::CPUPlace &place) { self.Run(scope, place); })
@@ -420,42 +407,6 @@ All parameter, weight, gradient are variables in Paddle.
           [](const OperatorBase &op) { return op.OutputVars(false); })
      .def("support_gpu", &OperatorBase::SupportGPU);
-  py::class_<operators::NetOp, OperatorBase>(m, "Net")
-      .def_static("create",
-                  []() -> operators::NetOp * {
-                    auto *retv = new operators::NetOp;
-                    retv->SetType("plain_net");
-                    return retv;
-                  })
-      .def("append_op", [](operators::NetOp &self,
-                           const OperatorBase &op) { self.AppendOp(op); })
-      .def("complete_add_op", &operators::NetOp::CompleteAddOp)
-      .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
-        self->CompleteAddOp();
-      });
-  // cond_op
-  py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
-      .def_static("create",
-                  [](py::bytes protobin) -> operators::CondOp * {
-                    proto::OpDesc desc;
-                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
-                                   "Cannot parse user input to OpDesc");
-                    PADDLE_ENFORCE(desc.IsInitialized(),
-                                   "User OpDesc is not initialized, reason %s",
-                                   desc.InitializationErrorString());
-                    auto cond_op = OpRegistry::CreateOp(desc);
-                    return static_cast<operators::CondOp *>(cond_op.release());
-                  })
-      .def("set_truenet",
-           [](operators::CondOp &self, const operators::NetOp &net) -> void {
-             self.set_truenet(net.Clone());
-           })
-      .def("set_falsenet",
-           [](operators::CondOp &self, const operators::NetOp &net) -> void {
-             self.set_falsenet(net.Clone());
-           });
  py::class_<framework::Executor>(m, "Executor")
      .def(py::init<const platform::Place &>())
      .def("run",

--- a/paddle/fluid/recordio/chunk.cc
+++ b/paddle/fluid/recordio/chunk.cc
@@ -14,13 +14,13 @@
 #include "paddle/fluid/recordio/chunk.h"
+#include <zlib.h>
 #include <algorithm>
 #include <memory>
 #include <sstream>
 #include "paddle/fluid/platform/enforce.h"
-#include "snappy_stream/include/snappystream.hpp"
+#include "snappystream.hpp"
-#include "zlib/include/zlib.h"
 namespace paddle {
 namespace recordio {

--- a/paddle/fluid/recordio/header.cc
+++ b/paddle/fluid/recordio/header.cc
@@ -13,6 +13,9 @@
 // limitations under the License.
 #include "paddle/fluid/recordio/header.h"
+#include <string>
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -231,7 +231,7 @@ function gen_fluid_inference_lib() {
    Deploying fluid inference library ...
    ========================================
 EOF
-        make inference_lib_dist
+        make -j `nproc` inference_lib_dist
    fi
 }

--- a/python/paddle/fluid/distribute_transpiler.py
+++ b/python/paddle/fluid/distribute_transpiler.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1119,24 +1119,6 @@ class Program(object):
    def current_block(self):
        return self.blocks[self.current_block_idx]
-    def append_backward(self, target, no_grad_set=None):
-        """
-        return map(param_name -> (grad_name, block_index, op_index))
-        """
-        assert isinstance(target, Variable)
-        if no_grad_set is None:
-            no_grad_set = set()
-        try:
-            param_to_grad_info = self.desc.append_backward(target.desc,
-                                                           no_grad_set)
-        except Exception as e:
-            raise core.EnforceNotMet(
-                str(e) + "\nCurrent protobuf is\n{0}".format(
-                    self.to_string(False)))
-        self.sync_with_cpp()
-        return param_to_grad_info
    def create_block(self, parent_idx=None):
        new_block_idx = len(self.blocks)
        parent = self.current_block() if parent_idx is None else self.block(
@@ -1201,6 +1183,8 @@ class Parameter(Variable):
        self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
+        self.do_model_average = kwargs.get('do_model_average', None)
    def __str__(self):
        return self.to_string(True)
@@ -1221,7 +1205,7 @@ class Parameter(Variable):
        if with_details:
            res_str = Variable.to_string(self, throw_on_error, True)
            additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "gradient_clip_attr")
+                               "gradient_clip_attr", "do_model_average")
            for attr_name in additional_attr:
                res_str += "%s: %s\n" % (attr_name,
                                         str(getattr(self, attr_name)))

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -218,6 +218,7 @@ def fc(input,
 def embedding(input,
              size,
              is_sparse=False,
+              is_distributed=False,
              padding_idx=None,
              param_attr=None,
              dtype='float32'):
@@ -268,8 +269,11 @@ def embedding(input,
        inputs={'Ids': input,
                'W': w},
        outputs={'Out': tmp},
-        attrs={'is_sparse': is_sparse,
+        attrs={
-               'padding_idx': padding_idx})
+            'is_sparse': is_sparse,
+            'is_distributed': is_distributed,
+            'padding_idx': padding_idx
+        })
    return tmp
@@ -1516,7 +1520,8 @@ def batch_norm(input,
               in_place=False,
               name=None,
               moving_mean_name=None,
-               moving_variance_name=None):
+               moving_variance_name=None,
+               do_model_average_for_mean_and_var=False):
    """
    This function helps create an operator to implement
    the BatchNorm layer using the configurations from the input parameters.
@@ -1547,7 +1552,10 @@ def batch_norm(input,
    mean = helper.create_parameter(
        attr=ParamAttr(
-            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
+            name=moving_mean_name,
+            initializer=Constant(0.0),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
        shape=param_shape,
        dtype=input.dtype)
    mean.stop_gradient = True
@@ -1556,7 +1564,8 @@ def batch_norm(input,
        attr=ParamAttr(
            name=moving_variance_name,
            initializer=Constant(1.0),
-            trainable=False),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
        shape=param_shape,
        dtype=input.dtype)
    variance.stop_gradient = True
@@ -3374,14 +3383,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
    Here are some examples to explain it.
    1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
-    is [6, 8], the reshape operator will transform x into a 2-D tensor with 
+    is [6, 8], the reshape operator will transform x into a 2-D tensor with
    shape [6, 8] and leaving x's data unchanged.
    2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
    specified is [2, 3, -1, 2], the reshape operator will transform x into a
    4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
-    case, one dimension of the target shape is set to -1, the value of this 
+    case, one dimension of the target shape is set to -1, the value of this
-    dimension is inferred from the total element number of x and remaining 
+    dimension is inferred from the total element number of x and remaining
    dimensions.
    3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
@@ -3615,7 +3624,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
 def pad(x, paddings, pad_value=0., name=None):
    """
    Pads a tensor with a constant value given by :attr:`pad_value`, and the
-    padded width is specified by :attr:`paddings`. 
+    padded width is specified by :attr:`paddings`.
    Specifically, the number of values padded before the contents of :attr:`x`
    in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
@@ -3643,7 +3652,7 @@ def pad(x, paddings, pad_value=0., name=None):
        x (Variable): The input tensor variable.
        paddings (list): A list of integers. Its elements specify the padded
                         width before and after for each dimension in turn.
-                         The length of :attr:paddings must be 
+                         The length of :attr:paddings must be
                         :math:`rank(x) \\times 2`.
        pad_value (float): The constant value used to pad.
        name(str|None): A name for this layer(optional). If set None, the layer

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
 from collections import defaultdict
 from paddle.fluid.framework import Program
 import framework
@@ -818,8 +818,8 @@ class ModelAverage(Optimizer):
    min_average_window, max_average_window and current update times.
    Args:
-        params_grads: A list of parameter-grad variable pairs.
        average_window_rate: The rate of average window.
+        params_grads: A list of parameter-grad variable pairs.
        min_average_window: The minimum size of average window.
        max_average_window: The maximum size of average window.
@@ -840,8 +840,8 @@ class ModelAverage(Optimizer):
    """
    def __init__(self,
-                 params_grads,
                 average_window_rate,
+                 params_grads=None,
                 min_average_window=10000,
                 max_average_window=10000,
                 **kwargs):
@@ -849,24 +849,37 @@ class ModelAverage(Optimizer):
        self.average_window = average_window_rate
        self.min_average_window = min_average_window
        self.max_average_window = max_average_window
-        self.params_grads = params_grads
+        self.params_grads = [] if params_grads is None else params_grads
+        params = {}
+        for param, grad in self.params_grads:
+            if param.do_model_average != False:
+                params[param.name] = (param, grad)
+        for param in framework.default_main_program().global_block(
+        ).all_parameters():
+            if param.name not in params and param.do_model_average != False:
+                grad = param.block.create_var(
+                    name=unique_name.generate(".".join([param.name, 'tmp'])),
+                    dtype=param.dtype,
+                    persistable=False,
+                    stop_gradient=True)
+                params[param.name] = (param, grad)
+        self.params_grads = params.values()
        for param, grad in self.params_grads:
-            if grad is not None:
+            self._append_average_accumulate_op(param)
-                self._append_average_accumulate_op(param)
        self.apply_program = Program()
        block = self.apply_program.global_block()
        with program_guard(main_program=self.apply_program):
            for param_grad in self.params_grads:
-                if param_grad[1] is not None:
+                self._add_average_apply_op(block, param_grad)
-                    self._add_average_apply_op(block, param_grad)
        self.restore_program = Program()
        block = self.restore_program.global_block()
        with program_guard(main_program=self.restore_program):
            for param_grad in self.params_grads:
-                if param_grad[1] is not None:
+                self._add_average_restore_op(block, param_grad)
-                    self._add_average_restore_op(block, param_grad)
    def _add_average_apply_op(self, block, param_grad):
        param = block.clone_variable(param_grad[0])

--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -28,13 +28,15 @@ class ParamAttr(object):
                 learning_rate=1.0,
                 regularizer=None,
                 trainable=True,
-                 gradient_clip=None):
+                 gradient_clip=None,
+                 do_model_average=None):
        self.name = name
        self.initializer = initializer
        self.learning_rate = learning_rate
        self.regularizer = regularizer
        self.trainable = trainable
        self.gradient_clip = gradient_clip
+        self.model_average = do_model_average
    def set_default_initializer(self, initializer):
        if initializer is None:
@@ -80,7 +82,8 @@ class ParamAttr(object):
            },
            'regularizer': self.regularizer,
            'trainable': self.trainable,
-            'gradient_clip_attr': self.gradient_clip
+            'gradient_clip_attr': self.gradient_clip,
+            'model_average': self.model_average
        }
        if with_initializer:
            kwargs['initializer'] = self.initializer
@@ -90,7 +93,7 @@ class ParamAttr(object):
 class WeightNormParamAttr(ParamAttr):
    """
    Used for weight normalization. Any field in ParamAttr can also be set here.
-    Besides, an extra field dim can be set to indicate the dimension except 
+    Besides, an extra field dim can be set to indicate the dimension except
    which to normalize.
    """
    # List to record the parameters reparameterized by weight normalization.

--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -14,23 +14,13 @@
 import unittest
 import numpy as np
-from op_test import OpTest
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest
 from paddle.fluid.framework import grad_var_name
-def get_backward_op(scope, op, no_grad_set):
-    backward_op = core.Operator.backward(op, no_grad_set)
-    for input in backward_op.input_vars():
-        var = scope.var(input)
-        var.get_tensor()
-    for output in backward_op.output_vars():
-        var = scope.var(output)
-        var.get_tensor()
-    return backward_op
 def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
    x_shape = x.shape
    if len(x_shape) == 2:
@@ -64,11 +54,6 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
 def _reference_training(x, scale, offset, epsilon, data_format):
    x_shape = x.shape
-    if len(x_shape) == 2:
-        if data_format == "NCHW":
-            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
-        else:
-            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
    if data_format == "NCHW":
        n, c, h, w = x.shape
@@ -88,8 +73,6 @@ def _reference_training(x, scale, offset, epsilon, data_format):
        offset_tile = np.reshape(offset, (1, c, 1, 1))
        offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
        y = normalized * scale_tile + offset_tile
-        if len(x_shape) == 2:
-            y = np.reshape(y, (y.shape[0], y.shape[1]))
        return y, mean, var
    elif data_format == "NHWC":
        x_square = x * x
@@ -100,59 +83,42 @@ def _reference_training(x, scale, offset, epsilon, data_format):
        var = x_square_sum / element_count - mean * mean
        normalized = (x - mean) / np.sqrt(var + epsilon)
        y = normalized * scale + offset
-        if len(x_shape) == 2:
-            y = np.reshape(y, x_shape)
        return y, mean, var
    else:
        raise ValueError("Unknown data order.")
-def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
+def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
    # Use the following formulas to calculate gradients:
    # grad_scale =
    #   sum(grad_y * (x - mean)) * rsqrt(var + epsilon)
    #
    # grad_offset = sum(output_y)
    #
-    # grad_x =
+    # x_grad =
    #   1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) -
    #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
    # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
-    x_shape = x.shape
-    if len(x_shape) == 2:
-        if data_format == "NCHW":
-            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
-            grad_y = np.reshape(grad_y,
-                                (grad_y.shape[0], grad_y.shape[1], 1, 1))
-        else:
-            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
-            grad_y = np.reshape(grad_y,
-                                (grad_y.shape[0], 1, 1, grad_y.shape[1]))
    if data_format == "NCHW":
        x = np.transpose(x, (0, 2, 3, 1))
-        grad_y = np.transpose(grad_y, (0, 2, 3, 1))
+        y_grad = np.transpose(y_grad, (0, 2, 3, 1))
-        # raise ValueError("data_format must be NHWC, got %s." % data_format)
+    x_grad = scale * (y_grad - np.mean(
-    grad_x = scale * (grad_y - np.mean(
+        y_grad, axis=(0, 1, 2)) - (x - mean) * np.mean(
-        grad_y, axis=(0, 1, 2)) - (x - mean) * np.mean(
+            y_grad * (x - mean), axis=(0, 1, 2)) /
-            grad_y * (x - mean), axis=(0, 1, 2)) /
                      (var + epsilon)) / np.sqrt(var + epsilon)
-    grad_scale = np.sum(grad_y * (x - mean) / np.sqrt(var + epsilon),
+    grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
                        axis=(0, 1, 2))
-    grad_offset = np.sum(grad_y, axis=(0, 1, 2))
+    grad_offset = np.sum(y_grad, axis=(0, 1, 2))
    # transfer back to N, C, H, W
    if data_format == "NCHW":
-        grad_x = np.transpose(grad_x, (0, 3, 1, 2))
+        x_grad = np.transpose(x_grad, (0, 3, 1, 2))
        x = np.transpose(x, (0, 3, 1, 2))
-        grad_y = np.transpose(grad_y, (0, 3, 1, 2))
+        y_grad = np.transpose(y_grad, (0, 3, 1, 2))
-    if len(x_shape) == 2:
+    return x_grad, grad_scale, grad_offset
-        grad_x = np.reshape(grad_x, x_shape)
-    return grad_x, grad_scale, grad_offset
 def create_or_get_tensor(scope, var_name, var, place):
@@ -186,7 +152,7 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
        __set_tensor__(output, data)
-class TestBatchNormOpInference(OpTest):
+class TestBatchNormOpInference(unittest.TestCase):
    def setUp(self):
        self.dtype = np.float32
@@ -304,231 +270,121 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
                self.check_with_place(place, data_format, self.dtype, [2, 3])
-class TestBatchNormOpTraining(OpTest):
+class TestBatchNormOpTraining(unittest.TestCase):
    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        if not np.allclose(np.array(tensor), np_array, atol=atol):
+            import pdb
+            pdb.set_trace()
        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
-    def test_python_testing(self):
-        data_format = "NHWC"
-        epsilon = 0.00001
-        n, h, w, c = 2, 3, 4, 5
-        x_shape = [n, h, w, c]
-        scale_shape = [c]
-        x_val = np.random.random_sample(x_shape).astype(np.float32)
-        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
-        mean = np.zeros(scale_shape).astype(np.float32)
-        variance = np.ones(scale_shape).astype(np.float32)
-        y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
-                                   epsilon, "NHWC")
-        # running N, C, H, W case
-        # should produce the same results
-        x_shape2 = [n, c, h, w]
-        x_val2 = np.transpose(x_val, (0, 3, 1, 2))
-        y_out2 = _reference_testing(x_val2, scale_val, bias_val, mean, variance,
-                                    epsilon, "NCHW")
-        # transfer (N, C, H, W) back to (N, H, W, C)
-        y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
-        self.__assert_close(y_out, y_out2_trans, "inference output")
-        print 'python: NHWC, NCHW, inference checking passed'
-    def test_python_training(self):
-        data_format = "NHWC"
-        epsilon = 0.00001
-        momentum = 0.9
-        # N, H, W, C: 2, 3, 4, 2
-        n, h, w, c = 2, 3, 4, 5
-        x_shape = [n, h, w, c]
-        scale_shape = [c]
-        x_val = np.random.random_sample(x_shape).astype(np.float32)
-        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
-        mean = np.zeros(scale_shape).astype(np.float32)
-        variance = np.ones(scale_shape).astype(np.float32)
-        # run forward
-        y_out, saved_mean, var_ref = _reference_training(
-            x_val, scale_val, bias_val, epsilon, "NHWC")
-        #
-        mean_out = saved_mean * (1. - momentum) + momentum * mean
-        variance_out = var_ref * (1. - momentum) + momentum * variance
-        saved_variance = 1. / np.sqrt(var_ref + epsilon)
-        # running N, C, H, W case
-        # should produce the same results
-        x_shape2 = [n, c, h, w]
-        x_val2 = np.transpose(x_val, (0, 3, 1, 2))
-        y_out2, saved_mean2, var_ref2 = _reference_training(
-            x_val2, scale_val, bias_val, epsilon, "NCHW")
-        self.__assert_close(saved_mean, saved_mean2, "batch mean")
-        self.__assert_close(var_ref, var_ref2, "batch variance")
-        # transfer (N, C, H, W) back to (N, H, W, C)
-        y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
-        self.__assert_close(y_out, y_out2_trans, "batch output")
-        print 'python: NHWC, NCHW, forward checking passed'
-        # test backward now
-        # NHWC
-        self.y_grad = np.random.random_sample(x_shape).astype(np.float32)
-        y_grad = self.y_grad
-        # y_grad = np.ones(x_shape).astype(np.float32)
-        x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
-            x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, "NHWC")
-        # NCHW
-        y_grad2 = np.transpose(y_grad, (0, 3, 1, 2))
-        # y_grad2 = np.ones(x_shape2).astype(np.float32)
-        x_grad_ref2, scale_grad_ref2, bias_grad_ref2 = _reference_grad(
-            x_val2, y_grad2, scale_val, saved_mean2, var_ref2, epsilon, "NCHW")
-        self.__assert_close(scale_grad_ref, scale_grad_ref2, "scale gradient")
-        self.__assert_close(bias_grad_ref, bias_grad_ref2, "bias gradient")
-        x_grad_transpose = np.transpose(x_grad_ref2, (0, 2, 3, 1))
-        self.__assert_close(x_grad_ref, x_grad_transpose, "x gradient")
-        print 'python: NHWC, NCHW, backward checking passed'
    def test_forward_backward(self):
        def test_with_place(place, data_layout, shape):
            # attr
            epsilon = 0.00001
            momentum = 0.9
+            if data_layout == "NCHW":
-            if len(shape) == 2:
+                n, c, h, w = shape[0], shape[1], shape[2], shape[3]
-                x_shape = shape
-                c = shape[1]
            else:
-                # n, h, w, c = 2, 3, 4, 2
                n, h, w, c = shape[0], shape[1], shape[2], shape[3]
-                if data_format == "NHWC":
-                    x_shape = [n, h, w, c]
-                elif data_format == "NCHW":
-                    x_shape = [n, c, h, w]
-                else:
-                    raise ValueError("Unknown data type.")
            scale_shape = [c]
-            x_val = np.random.random_sample(x_shape).astype(np.float32)
+            np.random.seed(123)
-            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+            x = np.random.random_sample(shape).astype(np.float32)
-            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+            scale = np.random.random_sample(scale_shape).astype(np.float32)
+            bias = np.random.random_sample(scale_shape).astype(np.float32)
            mean = np.zeros(scale_shape).astype(np.float32)
            variance = np.ones(scale_shape).astype(np.float32)
            # run forward
-            y_out, saved_mean, var_ref = _reference_training(
+            y, saved_mean, var_ref = _reference_training(x, scale, bias,
-                x_val, scale_val, bias_val, epsilon, data_format)
+                                                         epsilon, data_layout)
-            # update moving mean and variance
            mean_out = saved_mean * (1. - momentum) + momentum * mean
            variance_out = var_ref * (1. - momentum) + momentum * variance
            saved_variance = 1. / np.sqrt(var_ref + epsilon)
-            #  for gradient test
-            # y_grad = np.ones(x_shape).astype(np.float32)
-            y_grad = np.zeros(x_shape).astype(np.float32)
-            if len(y_grad.shape) == 2:
-                y_grad[0, 0] = 1.
-            else:
-                y_grad[0, 0, 0, 0] = 1.
-            # y_grad = np.random.random_sample(x_shape).astype(np.float32)
-            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
-                x_val, y_grad, scale_val, saved_mean, var_ref, epsilon,
-                data_format)
-            scope = core.Scope()
-            # create input
-            x_tensor = create_or_get_tensor(scope, "x_val", x_val, place)
-            scale_tensor = create_or_get_tensor(scope, "scale_val", scale_val,
-                                                place)
-            bias_tensor = create_or_get_tensor(scope, "bias_val", bias_val,
-                                               place)
-            mean_tensor = create_or_get_tensor(scope, "mean", mean, place)
-            variance_tensor = create_or_get_tensor(scope, "variance", variance,
-                                                   place)
-            # create output
-            y_tensor = create_or_get_tensor(scope, "y_out", None, place)
-            saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None,
-                                                     place)
-            saved_variance_tensor = create_or_get_tensor(
-                scope, "saved_variance", None, place)
-            mean_out_tensor = mean_tensor
-            variance_out_tensor = variance_tensor
-            batch_norm_op = Operator(
-                "batch_norm",
-                # inputs
-                X="x_val",
-                Scale="scale_val",
-                Bias="bias_val",
-                Mean="mean",
-                Variance="variance",
-                # outputs
-                Y="y_out",
-                MeanOut="mean",
-                VarianceOut="variance",
-                SavedMean="saved_mean",
-                SavedVariance="saved_variance",
-                # attrs
-                is_test=False,
-                data_layout=data_layout,
-                momentum=momentum,
-                epsilon=epsilon)
-            batch_norm_op.run(scope, place)
-            # check forward result
-            self.__assert_close(y_tensor, y_out, "y_out")
-            self.__assert_close(saved_mean_tensor, saved_mean, "saved_mean")
-            self.__assert_close(saved_variance_tensor, saved_variance,
-                                "saved_variance")
-            self.__assert_close(mean_out_tensor, mean_out, "mean_out")
-            if isinstance(place, core.CUDAPlace):
-                atol = 5e-2
-            else:
-                atol = 1e-4
-            self.__assert_close(variance_out_tensor, variance_out,
-                                "variance_out", atol)
-            print "op test forward passed: ", str(place), data_layout
            # run backward
-            batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set())
+            y_grad = np.random.random_sample(shape).astype(np.float32)
-            set_output_grad(
+            x_grad, scale_grad, bias_grad = _reference_grad(
-                scope,
+                x, y_grad, scale, saved_mean, var_ref, epsilon, data_format)
-                ["y_out", "mean", "variance", "saved_mean", "saved_variance"],
-                place,
+            var_dict = locals()
-                feed_dict={"y_out": y_grad})
+            var_dict['y@GRAD'] = y_grad
-            batch_norm_op_grad.run(scope, place)
+            var_names = [
-            x_grad_tensor = create_or_get_tensor(scope,
+                'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
-                                                 grad_var_name("x_val"), None,
+                'saved_variance'
-                                                 place)
+            ]
-            scale_grad_tensor = create_or_get_tensor(scope,
+            ground_truth = {name: var_dict[name] for name in var_names}
-                                                     grad_var_name("scale_val"),
-                                                     None, place)
+            program = fluid.Program()
-            bias_grad_tensor = create_or_get_tensor(scope,
+            with fluid.program_guard(program):
-                                                    grad_var_name("bias_val"),
+                block = program.global_block()
-                                                    None, place)
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape)
+                bn_op = block.append_op(
+                    type="batch_norm",
+                    inputs={
+                        "X": block.var('x'),
+                        "Scale": block.var('scale'),
+                        "Bias": block.var('bias'),
+                        "Mean": block.var('mean'),
+                        "Variance": block.var('variance')
+                    },
+                    outputs={
+                        "Y": block.var('y'),
+                        "MeanOut": block.var('mean'),  # share the same memory
+                        "VarianceOut":
+                        block.var('variance'),  # share the same memory
+                        "SavedMean": block.var('saved_mean'),
+                        "SavedVariance": block.var('saved_variance')
+                    },
+                    attrs={
+                        "momentum": momentum,
+                        "epsilon": epsilon,
+                        "is_test": False,
+                        "data_layout": data_layout
+                    })
+                block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
+                # generate backward op_desc
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    bn_op.desc, set(), [])
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+                exe = fluid.Executor(place)
+                out = exe.run(
+                    program,
+                    feed={
+                        name: var_dict[name]
+                        for name in
+                        ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
+                    },
+                    fetch_list=[
+                        'y', 'mean', 'variance', 'saved_mean', 'saved_variance',
+                        'x@GRAD', 'scale@GRAD', 'bias@GRAD'
+                    ])
+            self.__assert_close(y, out[0], "y")
+            self.__assert_close(mean_out, out[1], "mean")
+            self.__assert_close(variance_out, out[2], "variance", 1e-3)
+            self.__assert_close(saved_mean, out[3], "saved_mean")
+            self.__assert_close(saved_variance, out[4], "saved_variance", 1e-3)
+            self.__assert_close(x_grad, out[5], "x_grad")
+            self.__assert_close(scale_grad, out[6], "scale_grad")
+            self.__assert_close(bias_grad, out[7], "bias_grad")
-            # check gradient output
+            print "op test forward passed: ", str(place), data_layout
-            self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad")
-            self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad")
-            self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad")
-            print "op test backward passed: ", str(place), data_layout
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
@@ -537,7 +393,6 @@ class TestBatchNormOpTraining(OpTest):
        for place in places:
            for data_format in ["NCHW", "NHWC"]:
                test_with_place(place, data_format, [2, 3, 4, 5])
-                test_with_place(place, data_format, [2, 3])
 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/test_cond_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cond_op.py
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -32,7 +32,6 @@ class TestBook(unittest.TestCase):
            cost = layers.square_error_cost(input=y_predict, label=y)
            avg_cost = layers.mean(cost)
            self.assertIsNotNone(avg_cost)
-            program.append_backward(avg_cost)
        print(str(program))
@@ -94,8 +93,6 @@ class TestBook(unittest.TestCase):
            cost = layers.cross_entropy(input=predict, label=label)
            avg_cost = layers.mean(cost)
-            program.append_backward(avg_cost)
        print(str(program))
    def test_word_embedding(self):

--- a/python/paddle/fluid/tests/unittests/test_net.py
+++ b/python/paddle/fluid/tests/unittests/test_net.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import unittest
-def fc(X, W, Y):
-    ret_v = core.Net.create()
-    ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
-    ret_v.append_op(Operator("sigmoid", X="pre_activation", Out=Y))
-    ret_v.complete_add_op(True)
-    return ret_v
-class TestNet(unittest.TestCase):
-    def test_net_all(self):
-        net = core.Net.create()
-        op1 = Operator("sum", X=["X", "Y"], Out="Out")
-        net.append_op(op1)
-        net2 = core.Net.create()
-        net2.append_op(fc(X="X", W="w", Y="fc.out"))
-        net2.complete_add_op(True)
-        net.append_op(net2)
-        net.complete_add_op(True)
-        expected = '''
-Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
-    Op(sum), inputs:{X[X, Y]}, outputs:{Out[Out]}.
-    Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
-        Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
-            Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
-            Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Out[fc.out]}.
-'''
-        self.assertEqual(expected, "\n" + str(net))
-if __name__ == "__main__":
-    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -473,7 +473,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
            loss = simple_fc_net(True)
            test_program = main.clone(for_test=True)
-            opt = fluid.optimizer.SGD(learning_rate=0.0001)
+            opt = fluid.optimizer.SGD(learning_rate=0.001)
            opt.minimize(loss)
            batch_size = 32
@@ -500,4 +500,8 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
                train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
                train_loss = numpy.array(train_loss)
-                self.assertTrue(numpy.allclose(train_loss, test_loss))
+                self.assertTrue(
+                    numpy.allclose(
+                        train_loss, test_loss, atol=1e-8),
+                    "Train loss: " + str(train_loss) + "\n Test loss:" +
+                    str(test_loss))
--- a/python/paddle/fluid/tests/unittests/test_program.py
+++ b/python/paddle/fluid/tests/unittests/test_program.py