Merge branch 'develop' into core_inference_prepare

2762959f · Liu Yiqun · 339be625 · ad73b331 · 2762959f · 2762959f
72 changed file
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -62,29 +62,33 @@ endif()
 ## Then find the reference-cblas.  www.netlib.org/blas/
 set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
  "Folder contains reference-cblas")
-set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
+if(NOT CMAKE_CROSSCOMPILING)
-  ${REFERENCE_CBLAS_ROOT}/include
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
-  /usr/include
+    ${REFERENCE_CBLAS_ROOT}/include
-  /usr/include/cblas
+    /usr/include
-)
+    /usr/include/cblas
+  )
-set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
-  ${REFERENCE_CBLAS_ROOT}/lib
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS
-  /usr/lib
+    ${REFERENCE_CBLAS_ROOT}/lib
-  /usr/lib/blas/reference/
+    /usr/lib
-  /usr/lib/reference/
+    /usr/lib/blas/reference/
-)
+    /usr/lib/reference/
+  )
+else()
+  # Disable searching for reference cblas under the host's system paths
+  set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/include)
+  set(REFERENCE_CBLAS_LIB_SEARCH_PATHS ${REFERENCE_CBLAS_ROOT}/lib)
+endif()
 find_path(REFERENCE_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS
        ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS})
 find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
        ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
-if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
  set(CBLAS_FOUND ON)
  set(CBLAS_PROVIDER REFERENCE)
  set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})

--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -24,16 +24,16 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
 IF(APPLE)
-  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
 ELSE()
-  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j static grpc_cpp_plugin)
 ENDIF()
 ExternalProject_Add(
    extern_grpc
    DEPENDS protobuf zlib
    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
-    GIT_TAG "v1.8.x"
+    GIT_TAG "v1.11.x"
    PREFIX          ${GRPC_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""

--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -11,19 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
-IF(MOBILE_INFERENCE)
+if(MOBILE_INFERENCE OR RPI)
    return()
-ENDIF()
+endif()
 include (ExternalProject)
 # NOTE: snappy is needed when linking with recordio
-SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
-SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
-SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
+set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
+set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
 ExternalProject_Add(
    extern_snappy
@@ -51,8 +52,7 @@ ExternalProject_Add(
 )
 add_library(snappy STATIC IMPORTED GLOBAL)
-set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
+set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES})
-             "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
 include_directories(${SNAPPY_INCLUDE_DIR})
 add_dependencies(snappy extern_snappy)
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -11,9 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#
-IF(MOBILE_INFERENCE)
+IF(MOBILE_INFERENCE OR RPI)
    return()
 ENDIF()
@@ -21,9 +20,11 @@ include (ExternalProject)
 # NOTE: snappy is needed when linking with recordio
-SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
+set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
-SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
+set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
-SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
+set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
+set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
 ExternalProject_Add(
        extern_snappystream
@@ -51,8 +52,7 @@ ExternalProject_Add(
 )
 add_library(snappystream STATIC IMPORTED GLOBAL)
-set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
+set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
-        "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
 include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) # For snappystream to include its own headers.
 include_directories(${THIRD_PARTY_PATH}/install) # For Paddle to include snappy stream headers.

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -195,14 +195,7 @@ function(cc_library TARGET_NAME)
        list(REMOVE_ITEM cc_library_DEPS warpctc)
        add_dependencies(${TARGET_NAME} warpctc)
      endif()
-      if("${cc_library_DEPS}" MATCHES "ARCHIVE_START")
+      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-        # Support linking flags: --whole-archive (Linux) / -force_load (MacOS).
-        # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries.
-        target_circle_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-        list(REMOVE_ITEM cc_library_DEPS ARCHIVE_START ARCHIVE_END)
-      else()
-        target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
-      endif()
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
    endif()
@@ -243,11 +236,7 @@ function(cc_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS ARGS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
-    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
-      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
-    endif()
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main memory gtest gflags glog)
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}

--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 set_property(GLOBAL PROPERTY FLUID_MODULES "")
 # Find all fluid modules, which are used for the paddle fluid static library
 function(find_fluid_modules TARGET_NAME)
  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
+  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
  string(FIND "${__target_path}" "fluid" pos)
  if(pos GREATER 1)
    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
@@ -77,6 +92,23 @@ elseif (WITH_MKLML)
    )
 endif()
+if(NOT MOBILE_INFERENCE AND NOT RPI)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
+  copy(snappy_lib
+    SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream")
+  copy(snappystream_lib
+    SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib")
+  copy(zlib_lib
+    SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
+    DSTS ${dst_dir} ${dst_dir}/lib)
+endif()
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
 set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")

--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -24,6 +24,6 @@ if(NOT WITH_FLUID_ONLY)
 endif()
 add_subdirectory(testing)
-if(NOT MOBILE_INFERENCE AND NOT ANDROID AND NOT IOS)
+if(NOT MOBILE_INFERENCE AND NOT RPI)
  add_subdirectory(fluid)
 endif()
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -3,6 +3,7 @@ add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(pybind)
-add_subdirectory(inference)
 add_subdirectory(string)
 add_subdirectory(recordio)
+# NOTE: please keep the inference subdirectory as the last one added.
+add_subdirectory(inference)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -79,14 +79,12 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
    COMMENT "Copy generated python proto into directory paddle/fluid/proto."
    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
-cc_library(backward SRCS backward.cc DEPS net_op)
-cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
 cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto backward glog lod_rank_table feed_fetch_method)
+framework_proto glog lod_rank_table feed_fetch_method)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)

--- a/paddle/fluid/framework/backward.cc
+++ b/paddle/fluid/framework/backward.cc
--- a/paddle/fluid/framework/backward.h
+++ b/paddle/fluid/framework/backward.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/program_desc.h"
-namespace paddle {
-namespace framework {
-// Create the backward operator from a forward operator.
-// TODO(yuyang18): Add more API reference comment.
-extern std::unique_ptr<OperatorBase> Backward(
-    const OperatorBase& forwardOp,
-    const std::unordered_set<std::string>& no_grad_vars);
-struct GradVarInfo {
-  GradVarInfo() {}
-  GradVarInfo(const std::string& name, int block_idx, int op_idx)
-      : name_(name), block_idx_(block_idx), op_idx_(op_idx) {}
-  bool operator==(const GradVarInfo& b) const {
-    return name_ == b.name_ && block_idx_ == b.block_idx_ &&
-           op_idx_ == b.op_idx_;
-  }
-  std::string name_;
-  int block_idx_;
-  int op_idx_;
-};
-using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
-                                            GradVarInfo /*grad_var_info*/>;
-ParamGradInfoMap AppendBackward(
-    ProgramDesc& program_desc, const VarDesc& target,
-    const std::unordered_set<std::string>& no_grad_vars);
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/backward_test.cc
+++ b/paddle/fluid/framework/backward_test.cc
--- a/paddle/fluid/framework/details/computation_op_handle.cc
+++ b/paddle/fluid/framework/details/computation_op_handle.cc
@@ -14,6 +14,8 @@
 #include "paddle/fluid/framework/details/computation_op_handle.h"
+#include <string>
 namespace paddle {
 namespace framework {
 namespace details {
@@ -33,7 +35,7 @@ void ComputationOpHandle::RunImpl() {
    }
  }
-  op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
+  op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
 }
 std::string ComputationOpHandle::Name() const { return op_->Type(); }

--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -14,6 +14,9 @@
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include <string>
+#include <vector>
 namespace paddle {
 namespace framework {
 namespace details {
@@ -57,7 +60,10 @@ void FetchOpHandle::RunImpl() {
  for (size_t i = 0; i < scopes.size(); ++i) {
    auto &scope = scopes[i];
-    auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
+    auto &t = scope->FindVar(kLocalExecScopeName)
+                  ->Get<Scope *>()
+                  ->FindVar(var_name)
+                  ->Get<framework::LoDTensor>();
    if (platform::is_gpu_place(var->place_)) {
 #ifdef PADDLE_WITH_CUDA
      TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);

--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -24,6 +24,8 @@ namespace paddle {
 namespace framework {
 namespace details {
+constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@";
 class OpHandleBase {
 private:
  DISABLE_COPY_AND_ASSIGN(OpHandleBase);

--- a/paddle/fluid/framework/details/ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/ssa_graph_executor.h
@@ -15,13 +15,15 @@
 #pragma once
 #include <memory>
+#include <string>
+#include <vector>
 #include "paddle/fluid/framework/details/ssa_graph.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 namespace paddle {
 namespace framework {
 namespace details {
 class SSAGraphExecutor {
  DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor);

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -136,12 +136,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
    ready_ops.clear();
  };
-  // Create local scopes.
-  for (auto &scope : local_scopes_) {
-    auto &local_scope = scope->NewScope();
-    *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>() = &local_scope;
-  }
  // Step 3. Execution
  while (!pending_vars.empty() || !ready_ops.empty() || !delayed_ops.empty()) {
    // 1. Run All Ready ops
@@ -189,34 +183,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
  PADDLE_ENFORCE(ready_ops.empty());
  PADDLE_ENFORCE(delayed_ops.empty());
  PADDLE_ENFORCE(blocked_by_delayed_ops.empty());
-  ++computation_count_;
-  auto sync_computation = [&] {
-    computation_count_ = 0;
-    // Wait All computational streams
-    for (auto p : this->places_) {
-      platform::DeviceContextPool::Instance().Get(p)->Wait();
-    }
-    for (auto &scope : local_scopes_) {
-      scope->DropKids();
-    }
-  };
  // Wait FetchOps.
  if (!fetch_ops.empty()) {
    fetch_ops.clear();
-    sync_computation();
-  }
-  if (computation_count_ == max_async_computation) {
-    sync_computation();
-  }
-  // NOTE: the temp scope can be dropped lazily if needed.
-  // Drop tmp scopes;
-  for (auto &scope : local_scopes_) {
-    auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable<Scope *>();
-    kid = nullptr;
  }
  return fetch_data;

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -99,9 +99,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  std::unique_ptr<platform::EnforceNotMet> exception_;
  std::atomic<int> running_ops_;
  bool allow_op_delay_;
-  size_t computation_count_{0};
-  size_t max_async_computation{100};
 };
 }  // namespace details

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/parallel_executor.h"
 #include <string>
+#include <tuple>
 #include <vector>
 #ifdef PADDLE_WITH_CUDA
@@ -41,6 +42,8 @@ class ParallelExecutorPrivate {
 #ifdef PADDLE_WITH_CUDA
  std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
+  std::vector<std::tuple<std::string, proto::VarType::Type, bool>> var_types_;
 };
 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -97,14 +100,9 @@ ParallelExecutor::ParallelExecutor(
      allow_op_delay));
  // Step 3. Create vars in each scope;
-  for (auto *scope : member_->local_scopes_) {
+  for (auto *var : main_program.Block(0).AllVars()) {
-    for (auto *var : main_program.Block(0).AllVars()) {
+    member_->var_types_.emplace_back(var->Name(), var->GetType(),
-      if (scope->FindVar(var->Name()) != nullptr) {
+                                     var->Persistable());
-        continue;
-      }
-      InitializeVariable(scope->Var(var->Name()), var->GetType());
-    }
  }
 }
@@ -163,9 +161,42 @@ void ParallelExecutor::Run(
    const std::unordered_map<std::string, LoDTensor> &feed_tensors) {
  platform::RecordBlock b(0);
  SplitTensorToPlaces(feed_tensors);
+  // Create local scopes.
+  for (auto &scope : member_->local_scopes_) {
+    Scope &local_scope = scope->NewScope();
+    *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>() =
+        &local_scope;
+    for (auto &name_type_pair : member_->var_types_) {
+      if (scope->FindVar(std::get<0>(name_type_pair)) != nullptr) {
+        continue;
+      }
+      if (std::get<2>(name_type_pair)) {  // Persistable
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      } else {
+        InitializeVariable(scope->Var(std::get<0>(name_type_pair)),
+                           std::get<1>(name_type_pair));
+      }
+    }
+  }
  auto fetch_data = member_->executor_->Run(fetch_tensors);
  *member_->global_scope_->Var(fetched_var_name)->GetMutable<FeedFetchList>() =
      fetch_data;
+  // Wait All computational streams
+  for (auto p : member_->places_) {
+    platform::DeviceContextPool::Instance().Get(p)->Wait();
+  }
+  for (auto &scope : member_->local_scopes_) {
+    auto &local_scope =
+        *scope->Var(details::kLocalExecScopeName)->GetMutable<Scope *>();
+    scope->DeleteScope(local_scope);
+    local_scope = nullptr;
+  }
 }
 void ParallelExecutor::SplitTensorToPlaces(

--- a/paddle/fluid/framework/prune_test.cc
+++ b/paddle/fluid/framework/prune_test.cc
@@ -14,18 +14,17 @@ limitations under the License. */
 #include "paddle/fluid/framework/prune.h"
+#include <gtest/gtest.h>
+#include <string>
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/net_op.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include <gtest/gtest.h>
 namespace f = paddle::framework;
-namespace ops = paddle::operators;
 void AddOp(const std::string &type, const f::VariableNameMap &inputs,
           const f::VariableNameMap &outputs, f::AttributeMap attrs,

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
-set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor prune init)
+set(FLUID_CORE_MODULES proto_desc memory lod_tensor executor init)
 cc_library(paddle_fluid_api
    SRCS io.cc
@@ -11,7 +11,7 @@ cc_library(paddle_fluid DEPS ${fluid_modules})
 # Create shared library
 cc_library(paddle_fluid_shared SHARED
    SRCS io.cc
-    DEPS ARCHIVE_START ${GLOB_OP_LIB} ${FLUID_CORE_MODULES} ARCHIVE_END)
+    DEPS ${fluid_modules})
 set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
 if(NOT APPLE)
  # TODO(liuyiqun): Temporarily disable the link flag because it is not supported on Mac.

--- a/paddle/fluid/inference/io.cc
+++ b/paddle/fluid/inference/io.cc
@@ -17,10 +17,16 @@ limitations under the License. */
 #include <fstream>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/pybind/pybind.h"
 namespace paddle {
 namespace inference {
+// Temporarily add this function to expose framework::InitDevices() when
+// linking the inference shared library.
+void Init(bool init_p2p) { framework::InitDevices(init_p2p); }
 void ReadBinaryFile(const std::string& filename, std::string& contents) {
  std::ifstream fin(filename, std::ios::in | std::ios::binary);
  PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);

--- a/paddle/fluid/inference/io.h
+++ b/paddle/fluid/inference/io.h
@@ -18,12 +18,15 @@ limitations under the License. */
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/framework/init.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 namespace paddle {
 namespace inference {
+void Init(bool init_p2p);
 void LoadPersistables(framework::Executor& executor, framework::Scope& scope,
                      const framework::ProgramDesc& main_program,
                      const std::string& dirname,

--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
@@ -17,7 +17,7 @@ function(inference_test TARGET_NAME)
    string(REGEX REPLACE "^_$" "" arg "${arg}")
    cc_test(test_inference_${TARGET_NAME}${arg}
        SRCS test_inference_${TARGET_NAME}.cc
-        DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+        DEPS paddle_fluid
        ARGS --dirname=${PYTHON_TESTS_DIR}/book/${TARGET_NAME}${arg}.inference.model)
    set_tests_properties(test_inference_${TARGET_NAME}${arg}
        PROPERTIES DEPENDS test_${TARGET_NAME})

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -100,7 +100,7 @@ function(op_library TARGET)
    endif()
    # Define operators that don't need pybind here.
-    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+    foreach(manual_pybind_op "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
            set(pybind_flag 1)
        endif()
@@ -199,7 +199,6 @@ else()
    set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
 endif()
-op_library(cond_op DEPS framework_proto tensor net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
 op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
 op_library(softmax_op DEPS softmax)
@@ -259,7 +258,6 @@ endforeach()
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
 cc_test(gather_test SRCS gather_test.cc DEPS tensor)
-cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
 cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)

--- a/paddle/fluid/operators/cond_op.cc
+++ b/paddle/fluid/operators/cond_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "paddle/fluid/operators/cond_op.h"
-#include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/scatter.h"
-#include "paddle/fluid/platform/device_context.h"
-namespace paddle {
-namespace operators {
-using Scope = framework::Scope;
-using Variable = framework::Variable;
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-using DDim = framework::DDim;
-framework::Scope& CondOp::AddSubScope(const Scope& scope) const {
-  auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
-  auto& sub_scope = scope.NewScope();
-  sub_scopes->push_back(&sub_scope);
-  return sub_scope;
-}
-std::vector<framework::Scope*>& CondOp::GetSubScopes(
-    const framework::Scope& scope) const {
-  auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  return *sub_scopes_var->GetMutable<std::vector<framework::Scope*>>();
-}
-LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const {
-  auto index_tensors_var = scope.FindVar("IndexTensors");
-  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
-                          "Output(IndexTensors) of CondOp should not be null.");
-  auto& index_tensors =
-      *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
-  index_tensors.push_back(LoDTensor());
-  return index_tensors.back();
-}
-std::vector<framework::LoDTensor>& CondOp::GetIndexTensors(
-    const framework::Scope& scope) const {
-  auto* index_tensors_var = scope.FindVar("IndexTensors");
-  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
-                          "Output(IndexTensors) of CondOp should not be null.");
-  return *index_tensors_var->GetMutable<std::vector<framework::LoDTensor>>();
-}
-void CondOp::PrepareDataForSubnet(
-    const framework::Scope& scope,
-    const platform::DeviceContext& dev_ctx) const {
-  PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty.");
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    // Create two sub scopes for true and false branches
-    //   sub_scopes[0] for the true branch
-    //   sub_scopes[1] for the false branch
-    AddSubScope(scope);
-    // Create two tensors for true and false indices:
-    //   index_tensors[0] for the true branch
-    //   index_tensors[1] for the false branch
-    AddIndexTensor(scope);
-  }
-  Variable* cond_var = scope.FindVar(Input("Cond"));
-  PADDLE_ENFORCE_NOT_NULL(cond_var,
-                          "Input(Cond) of CondOp should not be null.");
-  const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
-  // get the true/false index at runtime according to cond tensor
-  // index_vectors[0]: vector<int>, contains all index for cond[i] == true
-  // index_vectors[1]: vector<int>, contains all index for cond[i] == false
-  std::vector<std::vector<int>> index_vectors;
-  index_vectors.resize(BRANCH_NUM);
-  const int* cond_data = cond->data<int>();
-  for (int i = 0; i < cond->dims()[0]; ++i) {
-    if (cond_data[i])
-      index_vectors[TRUE_BRANCH].push_back(i);
-    else
-      index_vectors[FALSE_BRANCH].push_back(i);
-  }
-  // put index_vectors[0] and index_vectors[1] into two tensors:
-  // index_tensors[0] and index_tensors[1]
-  std::vector<framework::LoDTensor>& index_tensors = GetIndexTensors(scope);
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    DDim dim = {static_cast<int64_t>(index_vectors[i].size())};
-    int* index_tensor_data_ptr =
-        index_tensors[i].mutable_data<int>(dim, platform::CPUPlace());
-    memcpy(index_tensor_data_ptr, index_vectors[i].data(),
-           dim[0] * sizeof(int));
-  }
-  // create input in subscopes according to index_vectors
-  for (auto& input : Inputs("Xs")) {
-    Variable* var_parent = scope.FindVar(input);
-    PADDLE_ENFORCE_NOT_NULL(var_parent);
-    const auto* tensor_parent = &var_parent->Get<LoDTensor>();
-    for (int i = 0; i < BRANCH_NUM; ++i) {
-      Variable* var_child = sub_scopes[i]->FindVar(input);
-      PADDLE_ENFORCE_NOT_NULL(var_child);
-      auto* tensor_child = var_child->GetMutable<LoDTensor>();
-      // Resize child
-      DDim dim = tensor_parent->dims();
-      dim[0] = index_tensors[i].dims()[0];
-      tensor_child->mutable_data<float>(dim, platform::CPUPlace());
-      CPUGather<float>(dev_ctx, *tensor_parent, index_tensors[i], tensor_child);
-    }
-  }
-  // create output_tensors in subscope for sub_net
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    for (auto& output : (*sub_net_op_[i]).Outputs()) {
-      for (auto& var_name : output.second) {
-        sub_scopes[i]->Var(var_name);
-      }
-    }
-  }
-}
-void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
-                                 const platform::DeviceContext& dev_ctx) const {
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-  const std::vector<framework::LoDTensor>& index_tensors =
-      GetIndexTensors(scope);
-  // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0]
-  PADDLE_ENFORCE(!Outputs("Outs").empty(),
-                 "Outputs(Outs) of CondOp can't be empty.");
-  for (auto& output : Outputs("Outs")) {
-    const LoDTensor* tensor_t_out =
-        &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
-    const LoDTensor* tensor_f_out =
-        &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
-    auto* var_out = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found");
-    LoDTensor* tensor_out = var_out->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out,
-                            "True output tensor should not be NULL");
-    DDim true_dim = tensor_t_out->dims();
-    DDim false_dim = tensor_f_out->dims();
-    true_dim[0] = 0;
-    false_dim[0] = 0;
-    PADDLE_ENFORCE_EQ(true_dim, false_dim,
-                      "Outputs not of the same shape except the first dim");
-    DDim out_dim = tensor_t_out->dims();
-    out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0];
-    tensor_out->Resize(out_dim);
-    tensor_out->mutable_data<float>(platform::CPUPlace());
-  }
-  // merge output results:
-  // output_tensor = true_output_tensor + false_output_tensor
-  for (auto& output : Outputs("Outs")) {
-    Variable* var_parent = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(var_parent);
-    auto* tensor_parent = var_parent->GetMutable<LoDTensor>();
-    for (int i = 0; i < BRANCH_NUM; ++i) {
-      Variable* var_child = sub_scopes[i]->FindVar(output);
-      PADDLE_ENFORCE_NOT_NULL(var_child);
-      auto* tensor_child = &var_child->Get<LoDTensor>();
-      ScatterAssign<float>(dev_ctx, *tensor_child, index_tensors[i],
-                           tensor_parent);
-    }
-  }
-}
-void CondOp::RunImpl(const Scope& scope, const platform::Place& place) const {
-  // get device context from pool
-  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-  auto& dev_ctx = *pool.Get(place);
-  PrepareDataForSubnet(scope, dev_ctx);
-  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
-  for (int i = 0; i < BRANCH_NUM; ++i) {
-    sub_net_op_[i]->Run(*sub_scopes[i], place);
-  }
-  MergeDataFromSubnet(scope, dev_ctx);
-}
-class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Cond", "The condition, which is a bool vector");
-    AddInput("Xs", "Inputs of Subnets").AsDuplicable();
-    AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable();
-    AddOutput("SubScopes", "sub scopes for true and false branches");
-    AddOutput("IndexTensors", "Index Tensors contains indices for true/false");
-    AddComment(R"DOC(
-Sample Dependent Conditional Operator.
-Given Cond[i] as a 1/0 vector to indicate true/false:
-Out[i] = subnet_true[i], if Cond[i] == true
-Out[i] = subnet_false[i], if Cond[i] == false
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp,
-                             paddle::operators::CondOpProtoAndCheckerMaker);
--- a/paddle/fluid/operators/cond_op.h
+++ b/paddle/fluid/operators/cond_op.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <string>
-#include <vector>
-#include "glog/logging.h"
-#include "paddle/fluid/framework/ddim.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/net_op.h"
-namespace paddle {
-namespace operators {
-/*
- * @brief CondOp is a dynamic if-else Operator
- *
- * It has a input tensor named cond indicating which netop each instance will
- * run.
- *
- * if cond == 1, it will run true_net, which is a NetOp.
- *
- * if cond == 0, it will run false_net, which is another NetOp.
- */
-class CondOp : public framework::OperatorBase {
- public:
-  CondOp(const std::string& type, const framework::VariableNameMap& inputs,
-         const framework::VariableNameMap& outputs,
-         const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {
-    sub_net_op_.resize(BRANCH_NUM);
-  }
-  CondOp(const CondOp& o)
-      : framework::OperatorBase(
-            static_cast<const framework::OperatorBase&>(o)) {
-    // TODO(yuyang18): Implement copy ctor well.
-    PADDLE_THROW("Not implemented");
-  }
-  framework::Scope& AddSubScope(const framework::Scope& scope) const;
-  std::vector<framework::Scope*>& GetSubScopes(
-      const framework::Scope& scope) const;
-  framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const;
-  std::vector<framework::LoDTensor>& GetIndexTensors(
-      const framework::Scope& scope) const;
-  void PrepareDataForSubnet(const framework::Scope& scope,
-                            const platform::DeviceContext& dev_ctx) const;
-  void MergeDataFromSubnet(const framework::Scope& scope,
-                           const platform::DeviceContext& dev_ctx) const;
-  /*
-   * Set True Block
-   */
-  void set_truenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[TRUE_BRANCH] = std::move(net);
-  }
-  /*
-   * Set False Block
-   */
-  void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[FALSE_BRANCH] = std::move(net);
-  }
- private:
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override;
- private:
-  const int TRUE_BRANCH = 0;
-  const int FALSE_BRANCH = 1;
-  const int BRANCH_NUM = 2;
-  // sub_net_op_[0]: subnet_t
-  // sub_net_op_[1]: subnet_f
-  std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
-};
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/ctc_align_op.cu
+++ b/paddle/fluid/operators/ctc_align_op.cu
@@ -15,6 +15,7 @@ limitations under the License. */
 #include <stdio.h>
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
+#include <vector>
 #include "paddle/fluid/operators/ctc_align_op.h"
 namespace paddle {

--- a/paddle/fluid/operators/ctc_align_op.h
+++ b/paddle/fluid/operators/ctc_align_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string.h>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"

--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
@@ -106,18 +107,18 @@ information. However, the output only shares the LoD information with input $X$.
 protected:
  std::string comment_;
-  void Replace(std::string& src, std::string from, std::string to) {
+  void Replace(std::string* src, std::string from, std::string to) {
    std::size_t len_from = std::strlen(from.c_str());
    std::size_t len_to = std::strlen(to.c_str());
-    for (std::size_t pos = src.find(from); pos != std::string::npos;
+    for (std::size_t pos = src->find(from); pos != std::string::npos;
-         pos = src.find(from, pos + len_to)) {
+         pos = src->find(from, pos + len_to)) {
-      src.replace(pos, len_from, to);
+      src->replace(pos, len_from, to);
    }
  }
  void SetComment(std::string name, std::string equation) {
-    Replace(comment_, "{name}", name);
+    Replace(&comment_, "{name}", name);
-    Replace(comment_, "{equation}", equation);
+    Replace(&comment_, "{equation}", equation);
  }
 };

--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/gru_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
@@ -13,15 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/gru_compute.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/sequence2batch.h"
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/im2sequence_op.h"
+#include <vector>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -13,7 +13,7 @@
   limitations under the License. */
 #pragma once
+#include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"

--- a/paddle/fluid/operators/label_smooth_op.cc
+++ b/paddle/fluid/operators/label_smooth_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/label_smooth_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/linear_chain_crf_op.h
+++ b/paddle/fluid/operators/linear_chain_crf_op.h
@@ -100,7 +100,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
    auto x_row_max = EigenMatrix<T>::From(emission_row_max);
    x_row_max.device(place) =
        x.maximum(Eigen::DSizes<int, 1>(1))
-            .reshape(Eigen::DSizes<int, 2>(int(batch_size), 1));
+            .reshape(Eigen::DSizes<int, 2>(static_cast<int>(batch_size), 1));
    auto x_exps = EigenMatrix<T>::From(*emission_exps);
    x_exps.device(place) =

--- a/paddle/fluid/operators/logical_op.cc
+++ b/paddle/fluid/operators/logical_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/logical_op.h"
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 namespace paddle {

--- a/paddle/fluid/operators/lrn_op.cc
+++ b/paddle/fluid/operators/lrn_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/lrn_op.h"
+#include <string>
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif

--- a/paddle/fluid/operators/lstm_op.cc
+++ b/paddle/fluid/operators/lstm_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/lstm_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/lstm_op.h
+++ b/paddle/fluid/operators/lstm_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"

--- a/paddle/fluid/operators/lstm_unit_op.cu
+++ b/paddle/fluid/operators/lstm_unit_op.cu
@@ -18,6 +18,7 @@ https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.c
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/cross_entropy_op.h"
+#include "paddle/fluid/operators/lstm_unit_op.h"
 #include "paddle/fluid/platform/assert.h"
 #include "paddle/fluid/platform/hostdevice.h"

--- a/paddle/fluid/operators/lstmp_op.cc
+++ b/paddle/fluid/operators/lstmp_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/lstmp_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/lstmp_op.h
+++ b/paddle/fluid/operators/lstmp_op.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/math/detail/activation_functions.h"
 #include "paddle/fluid/operators/math/lstm_compute.h"

--- a/paddle/fluid/operators/matmul_op.cc
+++ b/paddle/fluid/operators/matmul_op.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/matmul_op.h"
+#include <algorithm>
+#include <vector>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/matmul_op.h
+++ b/paddle/fluid/operators/matmul_op.h
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <algorithm>
+#include <functional>
+#include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/matmul.h"

--- a/paddle/fluid/operators/maxout_op.cc
+++ b/paddle/fluid/operators/maxout_op.cc
@@ -13,6 +13,8 @@
 *     limitations under the License. */
 #include "paddle/fluid/operators/maxout_op.h"
+#include <vector>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/minus_op.cc
+++ b/paddle/fluid/operators/minus_op.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/minus_op.h"
-#include "paddle/fluid/operators/net_op.h"
+#include <string>
+#include <vector>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/momentum_op.h"
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/mul_op.cc
+++ b/paddle/fluid/operators/mul_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/mul_op.h"
+#include <vector>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/net_op.cc
+++ b/paddle/fluid/operators/net_op.cc
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/net_op.h"
-#include <set>
-#include "paddle/fluid/framework/op_registry.h"
-namespace paddle {
-namespace operators {
-const char NetOp::kAll[] = "all";
-void NetOp::CompleteAddOp(bool calc) {
-  add_op_done_ = true;
-  if (!calc) return;
-  std::set<std::string> input_set;
-  std::set<std::string> output_set;
-  for (auto& op : ops_) {
-    for (auto& ipt : op->Inputs()) {
-      for (auto& var_name : ipt.second) {
-        // If input variable has been in output set, then it will be
-        // added into intermediate_outputs_. Otherwise, it will be
-        // added into input set.
-        if (Contains(output_set, var_name)) {
-          intermediate_outputs_.insert(var_name);
-        } else {
-          input_set.insert(var_name);
-        }
-      }
-    }
-    for (auto& opt : op->Outputs()) {
-      for (auto& var_name : opt.second) {
-        output_set.insert(var_name);
-      }
-    }
-  }
-  auto& inputs = inputs_[kAll];
-  inputs.reserve(input_set.size());
-  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs));
-  auto& outputs = outputs_[kAll];
-  outputs.reserve(output_set.size());
-  std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs));
-}
-std::string NetOp::DebugStringEx(const framework::Scope* scope) const {
-  std::ostringstream os;
-  os << OperatorBase::DebugStringEx(scope) << std::endl;
-  for (auto& op : ops_) {
-    std::istringstream is(op->DebugStringEx(scope));
-    for (std::string line; std::getline(is, line);) {
-      os << "    " << line << std::endl;
-    }
-  }
-  return os.str();
-}
-bool NetOp::IsNetOp() const { return true; }
-std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
-  std::vector<std::string> all;
-  for (auto& pair : this->outputs_) {
-    for (auto& var_name : pair.second) {
-      all.push_back(var_name);
-    }
-  }
-  if (has_intermediate) {
-    return all;
-  }
-  std::vector<std::string> ret_val;
-  for (auto& each : all) {
-    if (!Contains(intermediate_outputs_, each)) {
-      ret_val.push_back(each);
-    }
-  }
-  return ret_val;
-}
-NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs,
-             const framework::VariableNameMap& outputs,
-             const framework::AttributeMap& attrs)
-    : framework::OperatorBase(type, inputs, outputs, attrs) {}
-std::unique_ptr<framework::OperatorBase> NetOp::Clone() const {
-  PADDLE_ENFORCE(
-      add_op_done_,
-      "Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone");
-  return std::unique_ptr<OperatorBase>(new NetOp(*this));
-}
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/net_op.h
+++ b/paddle/fluid/operators/net_op.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <set>
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/op_registry.h"
-namespace paddle {
-namespace operators {
-/**
- * @brief Network is also a type of Operator
- *
- * It will manage the operators it has.
- *
- * Network is the container and controller of a set of operators.
- * A network object knows all Operators belonging to this network. Variables,
- * which are inputs and outputs of these operators, are created and managed by a
- * hierarchy of Scope objects.
- *
- * This is the base class of network, all the networks should implement the APIs
- * it defines.
- */
-class NetOp : public framework::OperatorBase {
- public:
-  static const char kAll[];
-  NetOp()
-      : framework::OperatorBase("plain_net", framework::VariableNameMap{},
-                                framework::VariableNameMap{},
-                                framework::AttributeMap{}) {}
-  NetOp(const std::string& type, const framework::VariableNameMap& inputs,
-        const framework::VariableNameMap& outputs,
-        const framework::AttributeMap& attrs);
-  NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) {
-    this->ops_.reserve(o.ops_.size());
-    std::transform(
-        o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_),
-        [](const std::unique_ptr<framework::OperatorBase>& op) {
-          return std::unique_ptr<framework::OperatorBase>(op->Clone());
-        });
-    this->CompleteAddOp();
-  }
-  bool SupportGPU() const override {
-    for (auto& op : ops_) {
-      if (!op->SupportGPU()) {
-        return false;
-      }
-    }
-    return true;
-  }
-  void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); }
-  /**
-   * @brief Add an operator by ptr
-   */
-  void AppendOp(std::unique_ptr<framework::OperatorBase> op) {
-    PADDLE_ENFORCE(!add_op_done_,
-                   "Cannot AppendOp when this network is sealed");
-    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
-    ops_.push_back(std::move(op));
-  }
-  void InsertOp(size_t pos, std::unique_ptr<framework::OperatorBase> op) {
-    PADDLE_ENFORCE(!add_op_done_,
-                   "Cannot InsertOp when this network is sealed");
-    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
-    PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range");
-    ops_.insert(ops_.begin() + pos, std::move(op));
-  }
-  void InsertOp(size_t pos, const framework::OperatorBase& op) {
-    InsertOp(pos, op.Clone());
-  }
-  void CompleteAddOp(bool calculate = true);
-  std::string DebugStringEx(
-      const framework::Scope* scope = nullptr) const override;
-  bool IsNetOp() const override;
-  std::vector<std::string> OutputVars(bool has_intermediate) const override;
-  std::unique_ptr<framework::OperatorBase> Clone() const override;
-  std::vector<std::unique_ptr<framework::OperatorBase>> ops_;
- private:
-  /**
-   * @brief Run the network.
-   *
-   * Run all the operators with the `scope`, if no scope is provided, default
-   * scope will be used instead. If no OpContext is provicded, default context
-   * will be used.
-   */
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& place) const override {
-    for (auto& op : ops_) {
-      op->Run(scope, place);
-    }
-  }
-  bool add_op_done_{false};
-  std::set<std::string> intermediate_outputs_;
-  template <typename T, typename KeyType>
-  static bool Contains(T container, KeyType key) {
-    return container.find(key) != container.end();
-  }
-};
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/net_op_test.cc
+++ b/paddle/fluid/operators/net_op_test.cc
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-#include "paddle/fluid/operators/net_op.h"
-#include <gtest/gtest.h>
-namespace paddle {
-namespace operators {
-using Scope = framework::Scope;
-using DeviceContext = platform::DeviceContext;
-static int run_cnt = 0;
-class TestOp : public framework::OperatorBase {
- public:
-  using framework::OperatorBase::OperatorBase;
-  DEFINE_OP_CLONE_METHOD(TestOp);
- private:
-  void RunImpl(const Scope& scope,
-               const platform::Place& place) const override {
-    ++run_cnt;
-  }
-};
-template <typename T>
-void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
-                                  const std::vector<T>& actual) {
-  ASSERT_EQ(expected.size(), actual.size());
-  std::unordered_set<T> expected_set;
-  for (auto& tmp : expected) {
-    expected_set.insert(tmp);
-  }
-  for (auto& act : actual) {
-    ASSERT_NE(expected_set.end(), expected_set.find(act));
-  }
-}
-TEST(OpKernel, all) {
-  auto net = std::make_shared<NetOp>();
-  ASSERT_NE(net, nullptr);
-  net->AppendOp(std::unique_ptr<TestOp>(
-      new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                 {{"Out", {"y"}}}, framework::AttributeMap{})));
-  net->AppendOp(std::unique_ptr<TestOp>(
-      new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
-                 {{"Out", {"z"}}}, framework::AttributeMap{})));
-  net->CompleteAddOp();
-  AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"},
-                               net->Inputs(NetOp::kAll));
-  AssertSameVectorWithoutOrder({"y", "z"}, net->Outputs(NetOp::kAll));
-  auto final_outs = net->OutputVars(false);
-  ASSERT_EQ(final_outs.size(), 1UL);
-  ASSERT_EQ(final_outs[0], "z");
-}
-TEST(NetOp, insert_op) {
-  NetOp net;
-  auto op1 = std::unique_ptr<framework::NOP>(
-      new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
-                         {{"Out", {"y"}}}, framework::AttributeMap{}));
-  net.AppendOp(*op1);
-  net.InsertOp(0, *op1);
-  ASSERT_EQ(2UL, net.ops_.size());
-  net.InsertOp(2, std::move(op1));
-  ASSERT_EQ(3UL, net.ops_.size());
-}
-TEST(NetOp, Clone) {
-  NetOp net;
-  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
-      "empty", framework::VariableNameMap{}, framework::VariableNameMap{},
-      framework::AttributeMap{}}));
-  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
-      "empty2", framework::VariableNameMap{}, framework::VariableNameMap{},
-      framework::AttributeMap{}}));
-  net.CompleteAddOp(true);
-  auto new_net_op = net.Clone();
-  ASSERT_NE(new_net_op, nullptr);
-  ASSERT_TRUE(new_net_op->IsNetOp());
-  auto* new_net = static_cast<NetOp*>(new_net_op.get());
-  ASSERT_EQ(2UL, new_net->ops_.size());
-  ASSERT_EQ(new_net->ops_[0]->Type(), "empty");
-  ASSERT_EQ(new_net->ops_[1]->Type(), "empty2");
-}
-}  // namespace operators
-}  // namespace paddle
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/prelu_op.h"
-#include "paddle/fluid/operators/net_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/scale_op.h"
-#include "paddle/fluid/operators/net_op.h"
+#include <string>
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/operators/split_op.cc
+++ b/paddle/fluid/operators/split_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/split_op.h"
-#include "paddle/fluid/operators/net_op.h"
 namespace paddle {
 namespace operators {

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -2,13 +2,13 @@ if(WITH_PYTHON)
  if(WITH_AMD_GPU)
    hip_library(paddle_pybind SHARED
      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
+      DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
           parallel_executor
      ${GLOB_OP_LIB})
  else()
    cc_library(paddle_pybind SHARED
      SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
-      DEPS pybind python backward proto_desc memory executor prune init profiler feed_fetch_method
+      DEPS pybind python proto_desc memory executor prune init profiler feed_fetch_method
           parallel_executor
      ${GLOB_OP_LIB})
    if(NOT APPLE AND NOT ANDROID)

--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <string>
 #include <tuple>
-#include "paddle/fluid/framework/backward.h"
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -125,23 +124,6 @@ void BindProgramDesc(pybind11::module *m) {
           })
      .def("append_block", &pd::ProgramDesc::AppendBlock,
           pybind11::return_value_policy::reference)
-      .def("append_backward",
-           [](pd::ProgramDesc &program_desc, const pd::VarDesc &target,
-              const std::unordered_set<std::string> &no_grad_vars) {
-             pd::ParamGradInfoMap param_grad_map =
-                 AppendBackward(program_desc, target, no_grad_vars);
-             std::unordered_map<
-                 std::string, std::tuple<std::string /* grad_var_name */,
-                                         int /* block_idx */, int /* op_idx */>>
-                 retv;
-             for (auto it = param_grad_map.begin(); it != param_grad_map.end();
-                  ++it) {
-               const auto &grad_info = it->second;
-               retv[it->first] = std::make_tuple(
-                   grad_info.name_, grad_info.block_idx_, grad_info.op_idx_);
-             }
-             return retv;
-           })
      .def("block", &pd::ProgramDesc::MutableBlock,
           pybind11::return_value_policy::reference)
      .def("num_blocks", &pd::ProgramDesc::Size)

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -20,9 +20,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>
-#include "paddle/fluid/pybind/protobuf.h"
-#include "paddle/fluid/framework/backward.h"
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
@@ -31,18 +28,18 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/parallel_executor.h"
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
-#include "paddle/fluid/operators/cond_op.h"
-#include "paddle/fluid/operators/net_op.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/pybind/const_value.h"
 #include "paddle/fluid/pybind/exception.h"
-#include "paddle/fluid/pybind/pybind.h"
+#include "paddle/fluid/pybind/protobuf.h"
+#include "paddle/fluid/pybind/pybind.h"  // NOLINT
 #include "paddle/fluid/pybind/recordio.h"
 #include "paddle/fluid/pybind/tensor_py.h"
@@ -239,11 +236,6 @@ All parameter, weight, gradient are variables in Paddle.
           },
           py::return_value_policy::reference)
 #endif
-      .def("get_net",
-           [](Variable &self) -> operators::NetOp * {
-             return self.GetMutable<operators::NetOp>();
-           },
-           py::return_value_policy::reference)
      .def("get_reader",
           [](Variable &self) -> framework::ReaderHolder * {
             PADDLE_ENFORCE(self.IsType<framework::ReaderHolder>());
@@ -388,11 +380,6 @@ All parameter, weight, gradient are variables in Paddle.
                                   desc.InitializationErrorString());
                    return OpRegistry::CreateOp(desc);
                  })
-      .def("backward",
-           [](const OperatorBase &forwardOp,
-              const std::unordered_set<std::string> &no_grad_vars) {
-             return Backward(forwardOp, no_grad_vars).release();
-           })
      .def("run",
           [](OperatorBase &self, const Scope &scope,
              const platform::CPUPlace &place) { self.Run(scope, place); })
@@ -420,42 +407,6 @@ All parameter, weight, gradient are variables in Paddle.
           [](const OperatorBase &op) { return op.OutputVars(false); })
      .def("support_gpu", &OperatorBase::SupportGPU);
-  py::class_<operators::NetOp, OperatorBase>(m, "Net")
-      .def_static("create",
-                  []() -> operators::NetOp * {
-                    auto *retv = new operators::NetOp;
-                    retv->SetType("plain_net");
-                    return retv;
-                  })
-      .def("append_op", [](operators::NetOp &self,
-                           const OperatorBase &op) { self.AppendOp(op); })
-      .def("complete_add_op", &operators::NetOp::CompleteAddOp)
-      .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
-        self->CompleteAddOp();
-      });
-  // cond_op
-  py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
-      .def_static("create",
-                  [](py::bytes protobin) -> operators::CondOp * {
-                    proto::OpDesc desc;
-                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
-                                   "Cannot parse user input to OpDesc");
-                    PADDLE_ENFORCE(desc.IsInitialized(),
-                                   "User OpDesc is not initialized, reason %s",
-                                   desc.InitializationErrorString());
-                    auto cond_op = OpRegistry::CreateOp(desc);
-                    return static_cast<operators::CondOp *>(cond_op.release());
-                  })
-      .def("set_truenet",
-           [](operators::CondOp &self, const operators::NetOp &net) -> void {
-             self.set_truenet(net.Clone());
-           })
-      .def("set_falsenet",
-           [](operators::CondOp &self, const operators::NetOp &net) -> void {
-             self.set_falsenet(net.Clone());
-           });
  py::class_<framework::Executor>(m, "Executor")
      .def(py::init<const platform::Place &>())
      .def("run",

--- a/paddle/fluid/recordio/chunk.cc
+++ b/paddle/fluid/recordio/chunk.cc
@@ -14,13 +14,13 @@
 #include "paddle/fluid/recordio/chunk.h"
+#include <zlib.h>
 #include <algorithm>
 #include <memory>
 #include <sstream>
 #include "paddle/fluid/platform/enforce.h"
-#include "snappy_stream/include/snappystream.hpp"
+#include "snappystream.hpp"
-#include "zlib/include/zlib.h"
 namespace paddle {
 namespace recordio {

--- a/paddle/fluid/recordio/header.cc
+++ b/paddle/fluid/recordio/header.cc
@@ -13,6 +13,9 @@
 // limitations under the License.
 #include "paddle/fluid/recordio/header.h"
+#include <string>
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -231,7 +231,7 @@ function gen_fluid_inference_lib() {
    Deploying fluid inference library ...
    ========================================
 EOF
-        make inference_lib_dist
+        make -j `nproc` inference_lib_dist
    fi
 }

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1119,24 +1119,6 @@ class Program(object):
    def current_block(self):
        return self.blocks[self.current_block_idx]
-    def append_backward(self, target, no_grad_set=None):
-        """
-        return map(param_name -> (grad_name, block_index, op_index))
-        """
-        assert isinstance(target, Variable)
-        if no_grad_set is None:
-            no_grad_set = set()
-        try:
-            param_to_grad_info = self.desc.append_backward(target.desc,
-                                                           no_grad_set)
-        except Exception as e:
-            raise core.EnforceNotMet(
-                str(e) + "\nCurrent protobuf is\n{0}".format(
-                    self.to_string(False)))
-        self.sync_with_cpp()
-        return param_to_grad_info
    def create_block(self, parent_idx=None):
        new_block_idx = len(self.blocks)
        parent = self.current_block() if parent_idx is None else self.block(
@@ -1201,6 +1183,8 @@ class Parameter(Variable):
        self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
+        self.do_model_average = kwargs.get('do_model_average', None)
    def __str__(self):
        return self.to_string(True)
@@ -1221,7 +1205,7 @@ class Parameter(Variable):
        if with_details:
            res_str = Variable.to_string(self, throw_on_error, True)
            additional_attr = ("trainable", "optimize_attr", "regularizer",
-                               "gradient_clip_attr")
+                               "gradient_clip_attr", "do_model_average")
            for attr_name in additional_attr:
                res_str += "%s: %s\n" % (attr_name,
                                         str(getattr(self, attr_name)))

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -1516,7 +1516,8 @@ def batch_norm(input,
               in_place=False,
               name=None,
               moving_mean_name=None,
-               moving_variance_name=None):
+               moving_variance_name=None,
+               do_model_average_for_mean_and_var=False):
    """
    This function helps create an operator to implement
    the BatchNorm layer using the configurations from the input parameters.
@@ -1547,7 +1548,10 @@ def batch_norm(input,
    mean = helper.create_parameter(
        attr=ParamAttr(
-            name=moving_mean_name, initializer=Constant(0.0), trainable=False),
+            name=moving_mean_name,
+            initializer=Constant(0.0),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
        shape=param_shape,
        dtype=input.dtype)
    mean.stop_gradient = True
@@ -1556,7 +1560,8 @@ def batch_norm(input,
        attr=ParamAttr(
            name=moving_variance_name,
            initializer=Constant(1.0),
-            trainable=False),
+            trainable=False,
+            do_model_average=do_model_average_for_mean_and_var),
        shape=param_shape,
        dtype=input.dtype)
    variance.stop_gradient = True
@@ -3374,14 +3379,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None):
    Here are some examples to explain it.
    1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
-    is [6, 8], the reshape operator will transform x into a 2-D tensor with 
+    is [6, 8], the reshape operator will transform x into a 2-D tensor with
    shape [6, 8] and leaving x's data unchanged.
    2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
    specified is [2, 3, -1, 2], the reshape operator will transform x into a
    4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this
-    case, one dimension of the target shape is set to -1, the value of this 
+    case, one dimension of the target shape is set to -1, the value of this
-    dimension is inferred from the total element number of x and remaining 
+    dimension is inferred from the total element number of x and remaining
    dimensions.
    3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape
@@ -3615,7 +3620,7 @@ def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
 def pad(x, paddings, pad_value=0., name=None):
    """
    Pads a tensor with a constant value given by :attr:`pad_value`, and the
-    padded width is specified by :attr:`paddings`. 
+    padded width is specified by :attr:`paddings`.
    Specifically, the number of values padded before the contents of :attr:`x`
    in dimension :attr:`i` is indicated by :attr:`paddings[i]`, and the number
@@ -3643,7 +3648,7 @@ def pad(x, paddings, pad_value=0., name=None):
        x (Variable): The input tensor variable.
        paddings (list): A list of integers. Its elements specify the padded
                         width before and after for each dimension in turn.
-                         The length of :attr:paddings must be 
+                         The length of :attr:paddings must be
                         :math:`rank(x) \\times 2`.
        pad_value (float): The constant value used to pad.
        name(str|None): A name for this layer(optional). If set None, the layer

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
 from collections import defaultdict
 from paddle.fluid.framework import Program
 import framework
@@ -818,8 +818,8 @@ class ModelAverage(Optimizer):
    min_average_window, max_average_window and current update times.
    Args:
-        params_grads: A list of parameter-grad variable pairs.
        average_window_rate: The rate of average window.
+        params_grads: A list of parameter-grad variable pairs.
        min_average_window: The minimum size of average window.
        max_average_window: The maximum size of average window.
@@ -840,8 +840,8 @@ class ModelAverage(Optimizer):
    """
    def __init__(self,
-                 params_grads,
                 average_window_rate,
+                 params_grads=None,
                 min_average_window=10000,
                 max_average_window=10000,
                 **kwargs):
@@ -849,24 +849,37 @@ class ModelAverage(Optimizer):
        self.average_window = average_window_rate
        self.min_average_window = min_average_window
        self.max_average_window = max_average_window
-        self.params_grads = params_grads
+        self.params_grads = [] if params_grads is None else params_grads
+        params = {}
+        for param, grad in self.params_grads:
+            if param.do_model_average != False:
+                params[param.name] = (param, grad)
+        for param in framework.default_main_program().global_block(
+        ).all_parameters():
+            if param.name not in params and param.do_model_average != False:
+                grad = param.block.create_var(
+                    name=unique_name.generate(".".join([param.name, 'tmp'])),
+                    dtype=param.dtype,
+                    persistable=False,
+                    stop_gradient=True)
+                params[param.name] = (param, grad)
+        self.params_grads = params.values()
        for param, grad in self.params_grads:
-            if grad is not None:
+            self._append_average_accumulate_op(param)
-                self._append_average_accumulate_op(param)
        self.apply_program = Program()
        block = self.apply_program.global_block()
        with program_guard(main_program=self.apply_program):
            for param_grad in self.params_grads:
-                if param_grad[1] is not None:
+                self._add_average_apply_op(block, param_grad)
-                    self._add_average_apply_op(block, param_grad)
        self.restore_program = Program()
        block = self.restore_program.global_block()
        with program_guard(main_program=self.restore_program):
            for param_grad in self.params_grads:
-                if param_grad[1] is not None:
+                self._add_average_restore_op(block, param_grad)
-                    self._add_average_restore_op(block, param_grad)
    def _add_average_apply_op(self, block, param_grad):
        param = block.clone_variable(param_grad[0])

--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
@@ -28,13 +28,15 @@ class ParamAttr(object):
                 learning_rate=1.0,
                 regularizer=None,
                 trainable=True,
-                 gradient_clip=None):
+                 gradient_clip=None,
+                 do_model_average=None):
        self.name = name
        self.initializer = initializer
        self.learning_rate = learning_rate
        self.regularizer = regularizer
        self.trainable = trainable
        self.gradient_clip = gradient_clip
+        self.model_average = do_model_average
    def set_default_initializer(self, initializer):
        if initializer is None:
@@ -80,7 +82,8 @@ class ParamAttr(object):
            },
            'regularizer': self.regularizer,
            'trainable': self.trainable,
-            'gradient_clip_attr': self.gradient_clip
+            'gradient_clip_attr': self.gradient_clip,
+            'model_average': self.model_average
        }
        if with_initializer:
            kwargs['initializer'] = self.initializer
@@ -90,7 +93,7 @@ class ParamAttr(object):
 class WeightNormParamAttr(ParamAttr):
    """
    Used for weight normalization. Any field in ParamAttr can also be set here.
-    Besides, an extra field dim can be set to indicate the dimension except 
+    Besides, an extra field dim can be set to indicate the dimension except
    which to normalize.
    """
    # List to record the parameters reparameterized by weight normalization.

--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
@@ -14,23 +14,13 @@
 import unittest
 import numpy as np
-from op_test import OpTest
 import paddle.fluid.core as core
 from paddle.fluid.op import Operator
+import paddle.fluid as fluid
+from op_test import OpTest
 from paddle.fluid.framework import grad_var_name
-def get_backward_op(scope, op, no_grad_set):
-    backward_op = core.Operator.backward(op, no_grad_set)
-    for input in backward_op.input_vars():
-        var = scope.var(input)
-        var.get_tensor()
-    for output in backward_op.output_vars():
-        var = scope.var(output)
-        var.get_tensor()
-    return backward_op
 def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
    x_shape = x.shape
    if len(x_shape) == 2:
@@ -64,11 +54,6 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format):
 def _reference_training(x, scale, offset, epsilon, data_format):
    x_shape = x.shape
-    if len(x_shape) == 2:
-        if data_format == "NCHW":
-            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
-        else:
-            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
    if data_format == "NCHW":
        n, c, h, w = x.shape
@@ -88,8 +73,6 @@ def _reference_training(x, scale, offset, epsilon, data_format):
        offset_tile = np.reshape(offset, (1, c, 1, 1))
        offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
        y = normalized * scale_tile + offset_tile
-        if len(x_shape) == 2:
-            y = np.reshape(y, (y.shape[0], y.shape[1]))
        return y, mean, var
    elif data_format == "NHWC":
        x_square = x * x
@@ -100,59 +83,42 @@ def _reference_training(x, scale, offset, epsilon, data_format):
        var = x_square_sum / element_count - mean * mean
        normalized = (x - mean) / np.sqrt(var + epsilon)
        y = normalized * scale + offset
-        if len(x_shape) == 2:
-            y = np.reshape(y, x_shape)
        return y, mean, var
    else:
        raise ValueError("Unknown data order.")
-def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
+def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format):
    # Use the following formulas to calculate gradients:
    # grad_scale =
    #   sum(grad_y * (x - mean)) * rsqrt(var + epsilon)
    #
    # grad_offset = sum(output_y)
    #
-    # grad_x =
+    # x_grad =
    #   1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) -
    #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
    # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
-    x_shape = x.shape
-    if len(x_shape) == 2:
-        if data_format == "NCHW":
-            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
-            grad_y = np.reshape(grad_y,
-                                (grad_y.shape[0], grad_y.shape[1], 1, 1))
-        else:
-            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
-            grad_y = np.reshape(grad_y,
-                                (grad_y.shape[0], 1, 1, grad_y.shape[1]))
    if data_format == "NCHW":
        x = np.transpose(x, (0, 2, 3, 1))
-        grad_y = np.transpose(grad_y, (0, 2, 3, 1))
+        y_grad = np.transpose(y_grad, (0, 2, 3, 1))
-        # raise ValueError("data_format must be NHWC, got %s." % data_format)
+    x_grad = scale * (y_grad - np.mean(
-    grad_x = scale * (grad_y - np.mean(
+        y_grad, axis=(0, 1, 2)) - (x - mean) * np.mean(
-        grad_y, axis=(0, 1, 2)) - (x - mean) * np.mean(
+            y_grad * (x - mean), axis=(0, 1, 2)) /
-            grad_y * (x - mean), axis=(0, 1, 2)) /
                      (var + epsilon)) / np.sqrt(var + epsilon)
-    grad_scale = np.sum(grad_y * (x - mean) / np.sqrt(var + epsilon),
+    grad_scale = np.sum(y_grad * (x - mean) / np.sqrt(var + epsilon),
                        axis=(0, 1, 2))
-    grad_offset = np.sum(grad_y, axis=(0, 1, 2))
+    grad_offset = np.sum(y_grad, axis=(0, 1, 2))
    # transfer back to N, C, H, W
    if data_format == "NCHW":
-        grad_x = np.transpose(grad_x, (0, 3, 1, 2))
+        x_grad = np.transpose(x_grad, (0, 3, 1, 2))
        x = np.transpose(x, (0, 3, 1, 2))
-        grad_y = np.transpose(grad_y, (0, 3, 1, 2))
+        y_grad = np.transpose(y_grad, (0, 3, 1, 2))
-    if len(x_shape) == 2:
+    return x_grad, grad_scale, grad_offset
-        grad_x = np.reshape(grad_x, x_shape)
-    return grad_x, grad_scale, grad_offset
 def create_or_get_tensor(scope, var_name, var, place):
@@ -186,7 +152,7 @@ def set_output_grad(scope, outputs, place, feed_dict=None):
        __set_tensor__(output, data)
-class TestBatchNormOpInference(OpTest):
+class TestBatchNormOpInference(unittest.TestCase):
    def setUp(self):
        self.dtype = np.float32
@@ -304,231 +270,121 @@ class TestFP16BatchNormOpInference(TestBatchNormOpInference):
                self.check_with_place(place, data_format, self.dtype, [2, 3])
-class TestBatchNormOpTraining(OpTest):
+class TestBatchNormOpTraining(unittest.TestCase):
    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        if not np.allclose(np.array(tensor), np_array, atol=atol):
+            import pdb
+            pdb.set_trace()
        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
-    def test_python_testing(self):
-        data_format = "NHWC"
-        epsilon = 0.00001
-        n, h, w, c = 2, 3, 4, 5
-        x_shape = [n, h, w, c]
-        scale_shape = [c]
-        x_val = np.random.random_sample(x_shape).astype(np.float32)
-        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
-        mean = np.zeros(scale_shape).astype(np.float32)
-        variance = np.ones(scale_shape).astype(np.float32)
-        y_out = _reference_testing(x_val, scale_val, bias_val, mean, variance,
-                                   epsilon, "NHWC")
-        # running N, C, H, W case
-        # should produce the same results
-        x_shape2 = [n, c, h, w]
-        x_val2 = np.transpose(x_val, (0, 3, 1, 2))
-        y_out2 = _reference_testing(x_val2, scale_val, bias_val, mean, variance,
-                                    epsilon, "NCHW")
-        # transfer (N, C, H, W) back to (N, H, W, C)
-        y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
-        self.__assert_close(y_out, y_out2_trans, "inference output")
-        print 'python: NHWC, NCHW, inference checking passed'
-    def test_python_training(self):
-        data_format = "NHWC"
-        epsilon = 0.00001
-        momentum = 0.9
-        # N, H, W, C: 2, 3, 4, 2
-        n, h, w, c = 2, 3, 4, 5
-        x_shape = [n, h, w, c]
-        scale_shape = [c]
-        x_val = np.random.random_sample(x_shape).astype(np.float32)
-        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
-        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
-        mean = np.zeros(scale_shape).astype(np.float32)
-        variance = np.ones(scale_shape).astype(np.float32)
-        # run forward
-        y_out, saved_mean, var_ref = _reference_training(
-            x_val, scale_val, bias_val, epsilon, "NHWC")
-        #
-        mean_out = saved_mean * (1. - momentum) + momentum * mean
-        variance_out = var_ref * (1. - momentum) + momentum * variance
-        saved_variance = 1. / np.sqrt(var_ref + epsilon)
-        # running N, C, H, W case
-        # should produce the same results
-        x_shape2 = [n, c, h, w]
-        x_val2 = np.transpose(x_val, (0, 3, 1, 2))
-        y_out2, saved_mean2, var_ref2 = _reference_training(
-            x_val2, scale_val, bias_val, epsilon, "NCHW")
-        self.__assert_close(saved_mean, saved_mean2, "batch mean")
-        self.__assert_close(var_ref, var_ref2, "batch variance")
-        # transfer (N, C, H, W) back to (N, H, W, C)
-        y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
-        self.__assert_close(y_out, y_out2_trans, "batch output")
-        print 'python: NHWC, NCHW, forward checking passed'
-        # test backward now
-        # NHWC
-        self.y_grad = np.random.random_sample(x_shape).astype(np.float32)
-        y_grad = self.y_grad
-        # y_grad = np.ones(x_shape).astype(np.float32)
-        x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
-            x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, "NHWC")
-        # NCHW
-        y_grad2 = np.transpose(y_grad, (0, 3, 1, 2))
-        # y_grad2 = np.ones(x_shape2).astype(np.float32)
-        x_grad_ref2, scale_grad_ref2, bias_grad_ref2 = _reference_grad(
-            x_val2, y_grad2, scale_val, saved_mean2, var_ref2, epsilon, "NCHW")
-        self.__assert_close(scale_grad_ref, scale_grad_ref2, "scale gradient")
-        self.__assert_close(bias_grad_ref, bias_grad_ref2, "bias gradient")
-        x_grad_transpose = np.transpose(x_grad_ref2, (0, 2, 3, 1))
-        self.__assert_close(x_grad_ref, x_grad_transpose, "x gradient")
-        print 'python: NHWC, NCHW, backward checking passed'
    def test_forward_backward(self):
        def test_with_place(place, data_layout, shape):
            # attr
            epsilon = 0.00001
            momentum = 0.9
+            if data_layout == "NCHW":
-            if len(shape) == 2:
+                n, c, h, w = shape[0], shape[1], shape[2], shape[3]
-                x_shape = shape
-                c = shape[1]
            else:
-                # n, h, w, c = 2, 3, 4, 2
                n, h, w, c = shape[0], shape[1], shape[2], shape[3]
-                if data_format == "NHWC":
-                    x_shape = [n, h, w, c]
-                elif data_format == "NCHW":
-                    x_shape = [n, c, h, w]
-                else:
-                    raise ValueError("Unknown data type.")
            scale_shape = [c]
-            x_val = np.random.random_sample(x_shape).astype(np.float32)
+            np.random.seed(123)
-            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+            x = np.random.random_sample(shape).astype(np.float32)
-            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+            scale = np.random.random_sample(scale_shape).astype(np.float32)
+            bias = np.random.random_sample(scale_shape).astype(np.float32)
            mean = np.zeros(scale_shape).astype(np.float32)
            variance = np.ones(scale_shape).astype(np.float32)
            # run forward
-            y_out, saved_mean, var_ref = _reference_training(
+            y, saved_mean, var_ref = _reference_training(x, scale, bias,
-                x_val, scale_val, bias_val, epsilon, data_format)
+                                                         epsilon, data_layout)
-            # update moving mean and variance
            mean_out = saved_mean * (1. - momentum) + momentum * mean
            variance_out = var_ref * (1. - momentum) + momentum * variance
            saved_variance = 1. / np.sqrt(var_ref + epsilon)
-            #  for gradient test
-            # y_grad = np.ones(x_shape).astype(np.float32)
-            y_grad = np.zeros(x_shape).astype(np.float32)
-            if len(y_grad.shape) == 2:
-                y_grad[0, 0] = 1.
-            else:
-                y_grad[0, 0, 0, 0] = 1.
-            # y_grad = np.random.random_sample(x_shape).astype(np.float32)
-            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
-                x_val, y_grad, scale_val, saved_mean, var_ref, epsilon,
-                data_format)
-            scope = core.Scope()
-            # create input
-            x_tensor = create_or_get_tensor(scope, "x_val", x_val, place)
-            scale_tensor = create_or_get_tensor(scope, "scale_val", scale_val,
-                                                place)
-            bias_tensor = create_or_get_tensor(scope, "bias_val", bias_val,
-                                               place)
-            mean_tensor = create_or_get_tensor(scope, "mean", mean, place)
-            variance_tensor = create_or_get_tensor(scope, "variance", variance,
-                                                   place)
-            # create output
-            y_tensor = create_or_get_tensor(scope, "y_out", None, place)
-            saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None,
-                                                     place)
-            saved_variance_tensor = create_or_get_tensor(
-                scope, "saved_variance", None, place)
-            mean_out_tensor = mean_tensor
-            variance_out_tensor = variance_tensor
-            batch_norm_op = Operator(
-                "batch_norm",
-                # inputs
-                X="x_val",
-                Scale="scale_val",
-                Bias="bias_val",
-                Mean="mean",
-                Variance="variance",
-                # outputs
-                Y="y_out",
-                MeanOut="mean",
-                VarianceOut="variance",
-                SavedMean="saved_mean",
-                SavedVariance="saved_variance",
-                # attrs
-                is_test=False,
-                data_layout=data_layout,
-                momentum=momentum,
-                epsilon=epsilon)
-            batch_norm_op.run(scope, place)
-            # check forward result
-            self.__assert_close(y_tensor, y_out, "y_out")
-            self.__assert_close(saved_mean_tensor, saved_mean, "saved_mean")
-            self.__assert_close(saved_variance_tensor, saved_variance,
-                                "saved_variance")
-            self.__assert_close(mean_out_tensor, mean_out, "mean_out")
-            if isinstance(place, core.CUDAPlace):
-                atol = 5e-2
-            else:
-                atol = 1e-4
-            self.__assert_close(variance_out_tensor, variance_out,
-                                "variance_out", atol)
-            print "op test forward passed: ", str(place), data_layout
            # run backward
-            batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set())
+            y_grad = np.random.random_sample(shape).astype(np.float32)
-            set_output_grad(
+            x_grad, scale_grad, bias_grad = _reference_grad(
-                scope,
+                x, y_grad, scale, saved_mean, var_ref, epsilon, data_format)
-                ["y_out", "mean", "variance", "saved_mean", "saved_variance"],
-                place,
+            var_dict = locals()
-                feed_dict={"y_out": y_grad})
+            var_dict['y@GRAD'] = y_grad
-            batch_norm_op_grad.run(scope, place)
+            var_names = [
-            x_grad_tensor = create_or_get_tensor(scope,
+                'x', 'scale', 'bias', 'mean', 'variance', 'y', 'saved_mean',
-                                                 grad_var_name("x_val"), None,
+                'saved_variance'
-                                                 place)
+            ]
-            scale_grad_tensor = create_or_get_tensor(scope,
+            ground_truth = {name: var_dict[name] for name in var_names}
-                                                     grad_var_name("scale_val"),
-                                                     None, place)
+            program = fluid.Program()
-            bias_grad_tensor = create_or_get_tensor(scope,
+            with fluid.program_guard(program):
-                                                    grad_var_name("bias_val"),
+                block = program.global_block()
-                                                    None, place)
+                for name in ground_truth:
+                    block.create_var(
+                        name=name,
+                        dtype='float32',
+                        shape=ground_truth[name].shape)
+                bn_op = block.append_op(
+                    type="batch_norm",
+                    inputs={
+                        "X": block.var('x'),
+                        "Scale": block.var('scale'),
+                        "Bias": block.var('bias'),
+                        "Mean": block.var('mean'),
+                        "Variance": block.var('variance')
+                    },
+                    outputs={
+                        "Y": block.var('y'),
+                        "MeanOut": block.var('mean'),  # share the same memory
+                        "VarianceOut":
+                        block.var('variance'),  # share the same memory
+                        "SavedMean": block.var('saved_mean'),
+                        "SavedVariance": block.var('saved_variance')
+                    },
+                    attrs={
+                        "momentum": momentum,
+                        "epsilon": epsilon,
+                        "is_test": False,
+                        "data_layout": data_layout
+                    })
+                block.create_var(name='y@GRAD', dtype='float32', shape=y.shape)
+                # generate backward op_desc
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    bn_op.desc, set(), [])
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
+                new_op_desc.copy_from(grad_op_desc)
+                for var_name in grad_op_desc.output_arg_names():
+                    block.desc.var(var_name.encode("ascii"))
+                grad_op_desc.infer_var_type(block.desc)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
+                exe = fluid.Executor(place)
+                out = exe.run(
+                    program,
+                    feed={
+                        name: var_dict[name]
+                        for name in
+                        ['x', 'scale', 'bias', 'mean', 'variance', 'y@GRAD']
+                    },
+                    fetch_list=[
+                        'y', 'mean', 'variance', 'saved_mean', 'saved_variance',
+                        'x@GRAD', 'scale@GRAD', 'bias@GRAD'
+                    ])
+            self.__assert_close(y, out[0], "y")
+            self.__assert_close(mean_out, out[1], "mean")
+            self.__assert_close(variance_out, out[2], "variance", 1e-3)
+            self.__assert_close(saved_mean, out[3], "saved_mean")
+            self.__assert_close(saved_variance, out[4], "saved_variance", 1e-3)
+            self.__assert_close(x_grad, out[5], "x_grad")
+            self.__assert_close(scale_grad, out[6], "scale_grad")
+            self.__assert_close(bias_grad, out[7], "bias_grad")
-            # check gradient output
+            print "op test forward passed: ", str(place), data_layout
-            self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad")
-            self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad")
-            self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad")
-            print "op test backward passed: ", str(place), data_layout
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
@@ -537,7 +393,6 @@ class TestBatchNormOpTraining(OpTest):
        for place in places:
            for data_format in ["NCHW", "NHWC"]:
                test_with_place(place, data_format, [2, 3, 4, 5])
-                test_with_place(place, data_format, [2, 3])
 if __name__ == '__main__':

--- a/python/paddle/fluid/tests/unittests/test_cond_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cond_op.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import logging
-import paddle.fluid.core as core
-import unittest
-import numpy as np
-from paddle.fluid.op import Operator, CondOp
-class PySimpleCond(object):
-    '''
-    A simple implementation of dynamic if-else based on numpy
-    '''
-    def __init__(self):
-        array = [1] * 10
-        for i in range(1, 10, 2):
-            array[i] = 0
-        self.cond = np.array(array)
-        self.x = np.ones(shape=(10, 1)).astype("float32")
-    def forward(self):
-        self.index_t = np.where(self.cond == 1)
-        self.index_f = np.where(self.cond == 0)
-        y_t = self.x[self.index_t]
-        y_f = self.x[self.index_f]
-        y_t = y_t * 2.
-        y_f = y_f * (-2.)
-        output = np.zeros(shape=(10, 1))
-        output[self.index_t] = y_t
-        output[self.index_f] = y_f
-        return output
-class PySimpleCondTest(unittest.TestCase):
-    def setUp(self):
-        self.condnn = PySimpleCond()
-    def test_forward(self):
-        output = self.condnn.forward()
-def create_tensor(scope, name, shape, np_data):
-    tensor = scope.var(name).get_tensor()
-    tensor.set_dims(shape)
-    tensor.set(np_data, core.CPUPlace())
-    return tensor
-class TestCondOp(unittest.TestCase):
-    '''
-    Test CondOp
-    equation:
-        cond = [True, False, True, False, ...]
-        y[index_t] = x[index_t] * 2.
-        y[index_f] = x[index_f] * -2.
-    outputs:
-        y
-    '''
-    def setUp(self):
-        self.py_cond = PySimpleCond()
-    def forward(self):
-        self.scope = core.Scope()
-        self.create_global_variables()
-        self.create_cond_op()
-        self.create_sub_net()
-        self.condop.run(self.scope, core.CPUPlace())
-        return np.array(self.scope.find_var("Out").get_tensor())
-    def create_global_variables(self):
-        x_np_data = self.py_cond.x
-        create_tensor(self.scope, "X", [10, 1], x_np_data)
-        cond_np_data = self.py_cond.cond.astype("int32")
-        create_tensor(self.scope, "cond", [10, 1], cond_np_data)
-        self.scope.var("SubScopes")
-        self.scope.var("IndexTensors")
-        self.scope.var("Out")
-    def create_cond_op(self):
-        self.condop = CondOp(
-            Cond="cond",
-            Xs=["X"],
-            Outs=["Out"],
-            SubScopes="SubScopes",
-            IndexTensors="IndexTensors")
-    def create_sub_net(self):
-        truenet = core.Net.create()
-        scale_op_t = Operator("scale", X='X', Out='Out', scale=2.)
-        truenet.append_op(scale_op_t)
-        truenet.complete_add_op(True)
-        self.condop.set_truenet(truenet)
-        falsenet = core.Net.create()
-        scale_op_t = Operator("scale", X='X', Out='Out', scale=-2.)
-        falsenet.append_op(scale_op_t)
-        falsenet.complete_add_op(True)
-        self.condop.set_falsenet(falsenet)
-    def test_forward(self):
-        print 'test cond op forward'
-        pd_output = self.forward()
-        py_output = self.py_cond.forward()
-        print 'pd_output', pd_output
-        print
-        print 'py_output', py_output
-        self.assertEqual(pd_output.shape, py_output.shape)
-        print 'test passed'
-        return 0
-if __name__ == "__main__":
-    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
@@ -15,10 +15,8 @@ import unittest
 import numpy as np
 from operator import mul
-from op_test import OpTest
 import paddle.fluid.core as core
-from paddle.fluid.op import Operator
+import paddle.fluid as fluid
-from paddle.fluid.framework import grad_var_name
 np.random.random(123)
@@ -70,161 +68,93 @@ def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
    return grad_x, d_scale, d_bias
-def get_backward_op(scope, op, no_grad_set):
+class TestLayerNormdOp(unittest.TestCase):
-    backward_op = core.Operator.backward(op, no_grad_set)
-    for input in backward_op.input_vars():
-        var = scope.var(input)
-        var.get_tensor()
-    for output in backward_op.output_vars():
-        var = scope.var(output)
-        var.get_tensor()
-    return backward_op
-def create_or_get_tensor(scope, var_name, var, place):
-    tensor = scope.var(var_name).get_tensor()
-    if var is not None:
-        assert isinstance(var, np.ndarray)
-        tensor.set_lod([[]])
-        tensor.set_dims(var.shape)
-        tensor.set(var, place)
-    return tensor
-def set_output_grad(scope, outputs, place, feed_dict=None):
-    def __set_tensor__(name, data=None):
-        out_tensor = scope.find_var(name).get_tensor()
-        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
-        out_dtype = out_tensor.dtype()
-        if data is None:
-            if out_dtype == core.VarDesc.VarType.FP64:
-                data = np.ones(out_tensor.shape(), dtype=np.float64)
-            elif out_dtype == core.VarDesc.VarType.FP32:
-                data = np.ones(out_tensor.shape(), dtype=np.float32)
-            else:
-                raise ValueError("Not supported data type " + str(out_dtype))
-        grad_tensor.set(data, place)
-    for output in outputs:
-        data = None
-        if output in feed_dict:
-            data = feed_dict[output]
-        __set_tensor__(output, data)
-class TestLayerNormdOp(OpTest):
    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
-    def __assert_grad_close(self,
-                            tensor,
-                            np_array,
-                            name,
-                            place,
-                            max_relative_error=0.02):
-        a = np.array(tensor)
-        b = np_array
-        abs_a = np.abs(a)
-        abs_a[abs_a < 1e-5] = 1
-        diff_mat = np.abs(a - b) / abs_a
-        max_diff = np.max(diff_mat)
-        def err_msg():
-            offset = np.argmax(diff_mat > max_relative_error)
-            return ("%s Variable %s max gradient diff %f over limit %f, "
-                    "the first error element is %d, %f, %f") % (
-                        "Gradient Check On %s" % str(place), name, max_diff,
-                        max_relative_error, offset, a.flatten()[offset],
-                        b.flatten()[offset])
-        self.assertLessEqual(max_diff, max_relative_error, err_msg())
    def check_forward_backward(self, shape, begin_norm_axis):
-        def test_with_place(place, shape, begin_norm_axis=1):
+        def test_with_place(place, shape, begin_norm_axis):
-            # setUp
-            assert begin_norm_axis > 0 and begin_norm_axis < len(
-                shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
            # attr
            epsilon = 0.00001
            x_shape = shape
            D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
            scale_shape = [D]
-            x_val = np.random.random_sample(x_shape).astype(np.float32)
+            np.random.seed(123)
-            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+            x = np.random.random_sample(x_shape).astype(np.float32)
-            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+            scale = np.random.random_sample(scale_shape).astype(np.float32)
+            bias = np.random.random_sample(scale_shape).astype(np.float32)
            y_grad = np.random.random_sample(x_shape).astype(np.float32)
-            # run forward
+            # reference forward & backward
-            y_out, saved_mean, var_ref = _reference_layer_norm_naive(
+            y, mean, variance = _reference_layer_norm_naive(
-                x_val, scale_val, bias_val, epsilon, begin_norm_axis)
+                x, scale, bias, epsilon, begin_norm_axis)
-            naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}
+            x_grad, scale_grad, bias_grad = _reference_layer_norm_grad(
+                x, y_grad, scale, mean, variance, begin_norm_axis)
-            # get gradient
-            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
+            var_dict = locals()
-                x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
+            var_dict['y@GRAD'] = y_grad
-            naive_grad = {
+            var_names = [
-                "X": x_grad_ref,
+                'x', 'scale', 'bias', 'mean', 'variance', 'y', 'y@GRAD'
-                "Scale": scale_grad_ref,
+            ]
-                "Bias": bias_grad_ref
+            ground_truth = {name: var_dict[name] for name in var_names}
-            }
+            program = fluid.Program()
-            scope = core.Scope()
+            with fluid.program_guard(program):
+                block = program.global_block()
-            # create input
+                for name in ground_truth:
-            input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
+                    block.create_var(
-            for i_name in input_map:
+                        name=name,
-                create_or_get_tensor(scope, i_name, input_map[i_name], place)
+                        dtype='float32',
+                        shape=ground_truth[name].shape)
-            # create output
+                layer_norm_op = block.append_op(
-            output_map = {"Y": None, "Mean": None, "Variance": None}
+                    type="layer_norm",
-            output_tensor = {}
+                    inputs={
-            for o_name in output_map:
+                        "X": block.var('x'),
-                output_tensor[o_name] = create_or_get_tensor(
+                        "Scale": block.var('scale'),
-                    scope, o_name, output_map[o_name], place)
+                        "Bias": block.var('bias'),
+                    },
-            layer_norm_op = Operator(
+                    outputs={
-                "layer_norm",
+                        "Y": block.var('y'),
-                # inputs
+                        "Mean": block.var('mean'),  # share the same memory
-                X="X",
+                        "Variance":
-                Scale="Scale",
+                        block.var('variance'),  # share the same memory
-                Bias="Bias",
+                    },
-                # outputs
+                    attrs={
-                Y="Y",
+                        "epsilon": epsilon,
-                Mean="Mean",
+                        "begin_norm_axis": begin_norm_axis
-                Variance="Variance",
+                    })
-                # attrs
-                epsilon=epsilon,
+                # generate backward op_desc
-                begin_norm_axis=begin_norm_axis)
+                grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
+                    layer_norm_op.desc, set(), [])
-            layer_norm_op.run(scope, place)
+                grad_op_desc = grad_op_desc_list[0]
+                new_op_desc = block.desc.append_op()
-            # check forward result
+                new_op_desc.copy_from(grad_op_desc)
-            atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
+                for var_name in grad_op_desc.output_arg_names():
-            for o_tensor in output_tensor:
+                    block.desc.var(var_name.encode("ascii"))
-                self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
+                grad_op_desc.infer_var_type(block.desc)
-                                    o_tensor, atol)
+                grad_op_desc.infer_shape(block.desc)
+                for arg in grad_op_desc.output_arg_names():
-            # run backward
+                    grad_var = block.desc.find_var(arg.encode("ascii"))
-            layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
+                    grad_var.set_dtype(core.VarDesc.VarType.FP32)
-            set_output_grad(
-                scope, ["Y", "Mean", "Variance"],
+                exe = fluid.Executor(place)
-                place,
+                out = exe.run(program,
-                feed_dict={"Y": y_grad})
+                              feed={
-            layer_norm_op_grad.run(scope, place)
+                                  name: var_dict[name]
+                                  for name in ['x', 'scale', 'bias', 'y@GRAD']
-            # get output
+                              },
-            grad_tensor = {}
+                              fetch_list=[
-            for o_name in naive_grad:
+                                  'y', 'mean', 'variance', 'x@GRAD',
-                grad_tensor[o_name] = x_ = create_or_get_tensor(
+                                  'scale@GRAD', 'bias@GRAD'
-                    scope, grad_var_name(o_name), None, place)
+                              ])
+                self.__assert_close(y, out[0], "y")
-            # check gradient output
+                self.__assert_close(mean, out[1], "mean")
-            for o_grad in naive_grad:
+                self.__assert_close(variance, out[2], "variance", 1e-3)
-                self.__assert_grad_close(grad_tensor[o_grad],
+                self.__assert_close(x_grad, out[3], "x_grad")
-                                         naive_grad[o_grad], o_grad + "@GRAD",
+                self.__assert_close(scale_grad, out[4], "scale_grad", 1e-3)
-                                         place)
+                self.__assert_close(bias_grad, out[5], "bias_grad")
        places = [core.CPUPlace()]
        if core.is_compiled_with_cuda() and core.op_support_gpu("layer_norm"):
@@ -237,15 +167,6 @@ class TestLayerNormdOp(OpTest):
        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
-    def test_check_forward_backward_with_scale(self):
-        pass  # TODO(zcd)
-    def test_check_forward_backward_with_bias(self):
-        pass  # TODO(zcd)
-    def test_check_forward_backward(self):
-        pass  # TODO(zcd)
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -32,7 +32,6 @@ class TestBook(unittest.TestCase):
            cost = layers.square_error_cost(input=y_predict, label=y)
            avg_cost = layers.mean(cost)
            self.assertIsNotNone(avg_cost)
-            program.append_backward(avg_cost)
        print(str(program))
@@ -94,8 +93,6 @@ class TestBook(unittest.TestCase):
            cost = layers.cross_entropy(input=predict, label=label)
            avg_cost = layers.mean(cost)
-            program.append_backward(avg_cost)
        print(str(program))
    def test_word_embedding(self):

--- a/python/paddle/fluid/tests/unittests/test_net.py
+++ b/python/paddle/fluid/tests/unittests/test_net.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-import unittest
-def fc(X, W, Y):
-    ret_v = core.Net.create()
-    ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
-    ret_v.append_op(Operator("sigmoid", X="pre_activation", Out=Y))
-    ret_v.complete_add_op(True)
-    return ret_v
-class TestNet(unittest.TestCase):
-    def test_net_all(self):
-        net = core.Net.create()
-        op1 = Operator("sum", X=["X", "Y"], Out="Out")
-        net.append_op(op1)
-        net2 = core.Net.create()
-        net2.append_op(fc(X="X", W="w", Y="fc.out"))
-        net2.complete_add_op(True)
-        net.append_op(net2)
-        net.complete_add_op(True)
-        expected = '''
-Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
-    Op(sum), inputs:{X[X, Y]}, outputs:{Out[Out]}.
-    Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
-        Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
-            Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
-            Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Out[fc.out]}.
-'''
-        self.assertEqual(expected, "\n" + str(net))
-if __name__ == "__main__":
-    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -473,7 +473,7 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
            loss = simple_fc_net(True)
            test_program = main.clone(for_test=True)
-            opt = fluid.optimizer.SGD(learning_rate=0.0001)
+            opt = fluid.optimizer.SGD(learning_rate=0.001)
            opt.minimize(loss)
            batch_size = 32
@@ -500,4 +500,8 @@ class ParallelExecutorTestingDuringTraining(unittest.TestCase):
                train_loss, = train_exe.run([loss.name], feed_dict=feed_dict)
                train_loss = numpy.array(train_loss)
-                self.assertTrue(numpy.allclose(train_loss, test_loss))
+                self.assertTrue(
+                    numpy.allclose(
+                        train_loss, test_loss, atol=1e-8),
+                    "Train loss: " + str(train_loss) + "\n Test loss:" +
+                    str(test_loss))
--- a/python/paddle/fluid/tests/unittests/test_program.py
+++ b/python/paddle/fluid/tests/unittests/test_program.py
@@ -87,57 +87,6 @@ class TestProgram(unittest.TestCase):
        print(prog)
        print(prog_restored)
-    def test_append_backward(self):
-        prog = Program()
-        block = prog.global_block()
-        mul_x = block.create_var(
-            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
-        mul_y = block.create_var(
-            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
-        mul_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
-        mul_op = block.append_op(
-            type="mul",
-            inputs={"X": [mul_x],
-                    "Y": mul_y},
-            outputs={"Out": [mul_out]},
-            attrs={"x_num_col_dims": 1})
-        add_y = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="add.y")
-        add_out = block.create_var(
-            dtype="float32", shape=[5, 8], lod_level=0, name="add.out")
-        add_op = block.append_op(
-            type="elementwise_add",
-            inputs={"X": mul_out,
-                    "Y": add_y},
-            outputs={"Out": add_out},
-            attrs={"x_num_col_dims": 1})
-        mean_out = block.create_var(
-            dtype="float32", shape=[1], lod_level=0, name="mean.out")
-        block.append_op(
-            type="mean", inputs={"X": add_out}, outputs={"Out": mean_out})
-        self.assertEqual(mul_op.idx, 0)
-        self.assertEqual(add_op.idx, 1)
-        param_to_grad = prog.append_backward(mean_out, set())
-        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out",
-                         "mean.out"):
-            self.assertEqual(param_to_grad[var_name][0],
-                             grad_var_name(var_name))
-            self.assertEqual(param_to_grad[var_name][1], 0)
-        expect_ops = [
-            "mul", "elementwise_add", "mean", "fill_constant", "mean_grad",
-            "elementwise_add_grad", "mul_grad"
-        ]
-        actual_ops = []
-        for op in block.ops:
-            actual_ops.append(op.type)
-        self.assertEqual(actual_ops, expect_ops)
    def test_program_clone_with_parameter(self):
        main_program = Program()
        startup_program = Program()