merge develop

ff052c0e · nhzlx · c6a5c4b0 · 3ae97aab · ff052c0e · ff052c0e
44 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -204,6 +204,11 @@ include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)

+include(flags)              # set paddle compile flags; also defines LINUX, which cudnn.cmake tests
+include(cudnn)              # set cudnn libraries; must be included before configure
+include(cupti)              # NOTE(review): presumably locates the CUPTI library -- confirm in cmake/cupti.cmake
+include(configure)          # add paddle env configuration
+
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
@@ -212,15 +217,11 @@ elseif()
    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
 endif()

-include(cudnn)              # set cudnn libraries, must before configure
-include(cupti)
-include(configure)          # add paddle env configuration
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
 include(rdma)               # set rdma libraries
-include(flags)              # set paddle compile flags
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
 include(inference_lib)      # add paddle fluid inference libraries

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -50,16 +50,16 @@ if(NOT WITH_PROFILER)
 endif(NOT WITH_PROFILER)

 if(NOT CMAKE_CROSSCOMPILING)
-    if(WITH_AVX AND AVX_FOUND)
+    if(WITH_AVX AND AVX512F_FOUND)  # pick the widest detected SIMD extension: AVX512F > AVX2 > AVX > SSE3
+        set(SIMD_FLAG ${AVX512F_FLAG})
+    elseif(WITH_AVX AND AVX2_FOUND)
+        set(SIMD_FLAG ${AVX2_FLAG})
+    elseif(WITH_AVX AND AVX_FOUND)
        set(SIMD_FLAG ${AVX_FLAG})
    elseif(SSE3_FOUND)
        set(SIMD_FLAG ${SSE3_FLAG})
    endif()
 endif()
-if(UNIX AND NOT APPLE)
-  # except apple from nix*Os family
-  set(LINUX TRUE)
-endif(UNIX AND NOT APPLE)

 if(NOT WITH_GOLANG)
    add_definitions(-DPADDLE_WITHOUT_GOLANG)
@@ -103,15 +103,20 @@ if(WITH_GPU)
    endif()
    if(WITH_ANAKIN)
        if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
-            message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile")
+            message(WARNING "Anakin needs CUDA >= 8.0 to compile. Force WITH_ANAKIN=OFF")
+            set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDA >= 8.0." FORCE)
        endif()
        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-            message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
+            message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF")
+            set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
        endif()
-        set(ENV{CUDNN_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
-        set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY})
-        message(STATUS "cudnn include header is ${CUDNN_INCLUDE_DIR}/cudnn.h")
-        message(STATUS "cudnn library is ${CUDNN_LIBRARY}")
+    endif()
+    if(WITH_ANAKIN)
+        # NOTICE(minqiyang): the trailing slash is important because
+        # $CUDNN_INCLUDE_DIR is a softlink to the real cudnn.h directory
+        set(ENV{CUDNN_INCLUDE_DIR} "${CUDNN_INCLUDE_DIR}/")
+        get_filename_component(CUDNN_LIBRARY_DIR "${CUDNN_LIBRARY}" DIRECTORY)  # quoted: path may be empty or contain spaces
+        set(ENV{CUDNN_LIBRARY} "${CUDNN_LIBRARY_DIR}")  # exports the directory holding the library, not the file
    endif()
 elseif(WITH_AMD_GPU)
    add_definitions(-DPADDLE_WITH_HIP)

--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -25,8 +25,25 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
    $ENV{CUDNN_ROOT}
    $ENV{CUDNN_ROOT}/lib64
    $ENV{CUDNN_ROOT}/lib
-    /usr/lib)
-find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a
+    /usr/lib
+	${CUDA_TOOLKIT_ROOT_DIR}
+	${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+	)
+set(CUDNN_LIB_NAME "")  # candidate cuDNN library file names for the host platform
+if (LINUX)  # LINUX is defined in cmake/flags.cmake, which must be included before this file
+set(CUDNN_LIB_NAME "libcudnn.so")
+endif(LINUX)
+
+if(WIN32)
+# only support cudnn7
+set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
+endif(WIN32)
+
+if(APPLE)  # variable names are case-sensitive: "Apple" is never defined, so macOS got no library name
+set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
+endif(APPLE)
+
+find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
    PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
          NO_DEFAULT_PATH
    DOC "Path to cuDNN library.")

--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -19,17 +19,17 @@ execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-cer
 include_directories(${ANAKIN_INCLUDE})
 include_directories(${ANAKIN_INCLUDE}/saber/)

-set(ANAKIN_COMPILE_EXTRA_FLAGS 
+set(ANAKIN_COMPILE_EXTRA_FLAGS
    -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
-    -Wno-error=unused-variable -Wno-unused-variable 
+    -Wno-error=unused-variable -Wno-unused-variable
    -Wno-error=format-extra-args -Wno-format-extra-args
-    -Wno-error=comment -Wno-comment 
-    -Wno-error=format -Wno-format 
+    -Wno-error=comment -Wno-comment
+    -Wno-error=format -Wno-format
    -Wno-error=switch -Wno-switch
-    -Wno-error=return-type -Wno-return-type 
+    -Wno-error=return-type -Wno-return-type
    -Wno-error=non-virtual-dtor -Wno-non-virtual-dtor
    -Wno-sign-compare
-    -Wno-reorder 
+    -Wno-reorder
    -Wno-error=cpp)

 ExternalProject_Add(
@@ -47,6 +47,7 @@ ExternalProject_Add(
                        -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
                        -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
                        -DCUDNN_ROOT=${CUDNN_ROOT}
+                        -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
                        ${EXTERNAL_OPTIONAL_ARGS}
    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
 )

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -142,6 +142,11 @@ else()
        ${GPU_COMMON_FLAGS})
 endif()

+if(UNIX AND NOT APPLE)
+  # LINUX marks every UNIX-like host except Apple platforms
+  set(LINUX TRUE)
+endif(UNIX AND NOT APPLE)
+

 foreach(flag ${COMMON_FLAGS})
    safe_set_cflag(CMAKE_C_FLAGS ${flag})

--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -10,6 +10,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID
    set(SSE3_FLAG "-msse3")
    set(AVX_FLAG "-mavx")
    set(AVX2_FLAG "-mavx2")
+    set(AVX512F_FLAG "-mavx512f")
 elseif(MSVC)
    set(MMX_FLAG "/arch:MMX")
    set(SSE2_FLAG "/arch:SSE2")
@@ -81,5 +82,16 @@ int main()
    return 0;
 }" AVX2_FOUND)

+# Check AVX512F (the probe executes real AVX-512F instructions so it fails on CPUs lacking them)
+set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+    __m512i a = _mm512_set1_epi32(-3);
+    return _mm512_cmpeq_epi32_mask(_mm512_abs_epi32(a), _mm512_set1_epi32(3)) == 0xFFFF ? 0 : 1;
+}" AVX512F_FOUND)
+
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
-mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
+mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -99,8 +99,13 @@ else()
  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
 endif()

-
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass)
+if (NOT WIN32)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS
+        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
+        graph graph_viz_pass multi_devices_graph_pass
+        multi_devices_graph_print_pass multi_devices_graph_check_pass
+        fast_threaded_ssa_graph_executor)
+endif() # NOT WIN32

 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -42,3 +42,5 @@ cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_b
 cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
 #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
 #        device_context reduce_op_handle )
+cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
+        DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
--- a/paddle/fluid/framework/details/execution_strategy.h
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -19,10 +19,13 @@ namespace framework {
 namespace details {

 struct ExecutionStrategy {
+  enum ExecutorType { kDefault = 0, kExperimental = 1 };  // which SSA graph executor ParallelExecutor builds
+
  size_t num_threads_{0};
  bool use_cuda_{true};
  bool allow_op_delay_{false};
  size_t num_iteration_per_drop_scope_{100};
+  ExecutorType type_{kDefault};  // kDefault: ThreadedSSAGraphExecutor; otherwise FastThreadedSSAGraphExecutor
 };

 }  //  namespace details

--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Takes ownership of the graph and precomputes the scheduling state: for every
+// op stored under the graph's "ops" attribute it records the number of inputs
+// that are not ready yet (op_deps_), remembers the ops that have none as
+// bootstrap_ops_, and then starts building the atomic dependency counters for
+// the first Run() on the thread pool.
+FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
+    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    std::unique_ptr<ir::Graph> &&graph)
+    : strategy_(strategy),
+      local_scopes_(local_scopes),
+      places_(places),
+      graph_(std::move(graph)),
+      pool_(strategy.num_threads_ +
+            1),  // add one more thread for generate op_deps
+      fetch_ctxs_(places) {
+  auto &ops = graph_->Get<details::GraphOps>("ops");
+
+  for (auto &op : ops) {
+    int dep = static_cast<int>(op->NotReadyInputSize());
+    op_deps_.emplace(op.get(), dep);
+    if (dep == 0) {
+      // Nothing to wait for: this op can start as soon as Run() is called.
+      bootstrap_ops_.emplace_back(op.get());
+    }
+  }
+
+  PrepareAtomicOpDeps();
+}
+
+// Runs every op of the graph once and returns the tensors named in
+// fetch_tensors.
+//
+// A temporary FetchOpHandle is created for each requested tensor and added to
+// the dependency counters, the bootstrap ops are scheduled on the thread pool,
+// and the calling thread then waits on a completion queue until the number of
+// finished ops equals the number of tracked ops. If an op throws, the pending
+// tasks are drained and the stored exception is rethrown here.
+FeedFetchList FastThreadedSSAGraphExecutor::Run(
+    const std::vector<std::string> &fetch_tensors) {
+  // Take the counters prepared earlier (waits for the future if needed) and
+  // immediately start preparing a fresh copy for the next call.
+  std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>
+      op_deps = atomic_op_deps_.get();
+  PrepareAtomicOpDeps();
+
+  paddle::framework::FeedFetchList fetches;
+  fetches.resize(fetch_tensors.size());
+  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
+  std::vector<std::unique_ptr<ir::Node>> fetch_nodes;
+  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
+
+  // For each requested name, collect the last handle stored for it in every
+  // var map of the graph's "vars" attribute.
+  for (auto &fetch_var_name : fetch_tensors) {
+    for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
+      auto it = var_map.find(fetch_var_name);
+      if (it != var_map.end()) {
+        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
+      }
+    }
+  }
+
+  // Create one fetch op per requested tensor; it writes into fetches[i].
+  for (size_t i = 0; i < fetch_tensors.size(); ++i) {
+    auto &var_name = fetch_tensors[i];
+    auto fetched_var_it = fetched_vars.find(var_name);
+    PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
+                   "Cannot find fetched variable.(Perhaps the main_program "
+                   "is not set to ParallelExecutor)");
+
+    auto &vars = fetched_var_it->second;
+
+    fetch_nodes.emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
+    auto *op = new FetchOpHandle(fetch_nodes.back().get(), &fetches, i,
+                                 &local_scopes_);
+    fetch_ops.emplace_back(op);
+
+    for (auto &p : places_) {
+      op->SetDeviceContext(p, fetch_ctxs_.Get(p));
+    }
+
+    for (auto *var : vars) {
+      op->AddInput(var);
+    }
+
+    // Track the fetch op like any other op so the completion count below
+    // includes it.
+    (*op_deps)[op] = static_cast<int>(op->NotReadyInputSize());
+  }
+
+  size_t num_complete = 0;
+  remaining_ = 0;
+  BlockingQueue<size_t> complete_q;
+  for (auto op : bootstrap_ops_) {
+    RunOpAsync(op_deps.get(), op, &complete_q);
+  }
+
+  // Each finished task pushes the number of ops it ran; a task that caught an
+  // exception pushes -1UL instead.
+  while (num_complete != op_deps->size()) {
+    size_t num_comp = complete_q.Pop();
+    if (num_comp == -1UL) {
+      // Failure: consume the results of the tasks still counted in remaining_
+      // until none are left, then rethrow the stored exception.
+      int remaining = 0;
+      while (true) {
+        remaining = remaining_;
+        if (remaining == 0) {
+          break;
+        }
+        for (int i = 0; i < remaining; ++i) {
+          complete_q.Pop();
+        }
+      }
+      exception_.ReThrow();
+    }
+    num_complete += num_comp;
+  }
+  // Wait FetchOps.
+  if (!fetch_ops.empty()) {
+    fetch_ops.clear();
+  }
+  return fetches;
+}
+// Enqueues a task on the thread pool that runs `op` and then keeps running
+// ops that become ready as a result.
+//
+// op_deps:    per-op counters of inputs that are still pending for this run.
+// op:         first op the task executes.
+// complete_q: receives the number of ops the task ran, or -1UL if one threw.
+//
+// remaining_ is incremented before the task is enqueued and decremented just
+// before the task pushes its result.
+void FastThreadedSSAGraphExecutor::RunOpAsync(
+    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
+    OpHandleBase *op, BlockingQueue<size_t> *complete_q) {
+  ++remaining_;
+  this->pool_.enqueue([=] {
+    OpHandleBase *op_to_run = op;
+    size_t complete = 0;
+    while (op_to_run != nullptr) {
+      try {
+        op_to_run->Run(strategy_.use_cuda_);
+        ++complete;
+      } catch (...) {
+        // Store the exception for Run() to rethrow and signal the failure.
+        exception_.Catch(std::current_exception());
+        --remaining_;
+        complete_q->Push(-1UL);
+        return;
+      }
+      auto &outputs = op_to_run->Outputs();
+      op_to_run = nullptr;
+      for (auto &output : outputs) {
+        for (auto &pending_op : output->PendingOps()) {
+          std::atomic<int> &deps = op_deps->at(pending_op);
+          if (deps.fetch_sub(1) == 1) {  // pending_op ready
+            // The first op that becomes ready continues in this task; every
+            // further ready op gets a task of its own.
+            if (op_to_run == nullptr) {
+              op_to_run = pending_op;
+            } else {
+              this->RunOpAsync(op_deps, pending_op, complete_q);
+            }
+          }
+        }
+      }
+    }
+    --remaining_;
+    complete_q->Push(complete);
+  });
+}
+// Starts building, on the thread pool, a fresh atomic copy of the static
+// dependency counts in op_deps_. The result is stored as a future in
+// atomic_op_deps_ and is consumed by the next Run().
+void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
+  atomic_op_deps_ = pool_.enqueue([this] {
+    using AtomicOpDeps = std::unordered_map<OpHandleBase *, std::atomic<int>>;
+    // Own the map from the start so it is released if the copy below throws.
+    std::unique_ptr<AtomicOpDeps> op_deps(new AtomicOpDeps);
+    op_deps->reserve(op_deps_.size());
+    for (auto &pair : op_deps_) {
+      (*op_deps)[pair.first] = pair.second;
+    }
+    return op_deps;
+  });
+}
+
+// Returns the graph owned by this executor.
+const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+#include "ThreadPool.h"
+#include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/details/exception_holder.h"
+#include "paddle/fluid/framework/details/execution_strategy.h"
+#include "paddle/fluid/framework/details/ssa_graph_executor.h"
+
+namespace paddle {
+namespace framework {
+class Scope;
+namespace details {
+
+class OpHandleBase;
+// SSAGraphExecutor that schedules ops on a thread pool using per-op atomic
+// counters of pending inputs. An op is started once its counter reaches zero,
+// and a worker task keeps running newly ready ops itself where it can.
+class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
+ public:
+  // Takes ownership of `graph` and precomputes the dependency counts.
+  FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
+                               const std::vector<Scope *> &local_scopes,
+                               const std::vector<platform::Place> &places,
+                               std::unique_ptr<ir::Graph> &&graph);
+  // Runs the graph once and returns the tensors named in fetch_tensors.
+  FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
+  // Returns the graph owned by this executor.
+  const ir::Graph &Graph() const override;
+
+ private:
+  ExecutionStrategy strategy_;
+  std::vector<Scope *> local_scopes_;
+  std::vector<platform::Place> places_;
+  std::unique_ptr<ir::Graph> graph_;
+
+  // Per-op count of inputs that are not ready, computed once in the
+  // constructor.
+  std::unordered_map<OpHandleBase *, int> op_deps_;
+  // Ops whose count is zero; every Run() starts from these.
+  std::vector<OpHandleBase *> bootstrap_ops_;
+
+  ::ThreadPool pool_;
+  platform::DeviceContextPool fetch_ctxs_;
+  // Number of tasks enqueued by RunOpAsync that have not pushed a result yet.
+  std::atomic<int> remaining_;
+
+  // Enqueues a task that runs `op` and the ops that become ready after it.
+  void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
+                  OpHandleBase *op, BlockingQueue<size_t> *complete_q);
+
+  // Starts building the atomic counters for the next Run() on the pool.
+  void PrepareAtomicOpDeps();
+
+  // Result of the last PrepareAtomicOpDeps() call; consumed by Run().
+  std::future<
+      std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
+      atomic_op_deps_;
+  // Holds an exception caught in a worker task until Run() rethrows it.
+  ExceptionHolder exception_;
+};
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/details/op_handle_base.cc
+++ b/paddle/fluid/framework/details/op_handle_base.cc
@@ -158,6 +158,16 @@ void OpHandleBase::RunAndRecordEvent(platform::Place p,
 #endif
 }

+size_t OpHandleBase::NotReadyInputSize() const {  // number of distinct inputs that have a generating op
+  std::unordered_set<VarHandleBase *> res;  // a set, so an input listed more than once is counted once
+  for (auto *var : inputs_) {
+    if (var->GeneratedOp() != nullptr) {  // inputs without a generating op are not counted
+      res.emplace(var);
+    }
+  }
+  return res.size();
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/op_handle_base.h
+++ b/paddle/fluid/framework/details/op_handle_base.h
@@ -81,6 +81,8 @@ class OpHandleBase {
    return res.size();
  }

+  size_t NotReadyInputSize() const;
+
  const std::vector<VarHandleBase *> &Outputs() const { return outputs_; }

  size_t NoDummyInputSize() const;

--- a/paddle/fluid/framework/ir/graph.cc
+++ b/paddle/fluid/framework/ir/graph.cc
@@ -117,7 +117,15 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
    }
    // For output args, always create a new var.
    for (auto &each_var_name : op->OutputArgumentNames()) {
-      ir::Node *var = CreateVarNode(all_vars.at(each_var_name));
+      ir::Node *var = nullptr;
+      if (all_vars.count(each_var_name) != 0) {
+        var = CreateVarNode(all_vars.at(each_var_name));
+      } else {
+        // Operation output vars can be @EMPTY@. For example, while_grad
+        // can have multi @EMPTY@ outputs with no VarDesc.
+        // TODO(panyx0718): Add a test.
+        var = CreateEmptyNode(each_var_name, ir::Node::Type::kVariable);
+      }
      var_nodes[each_var_name].push_back(var);
      node->outputs.push_back(var);
      var->inputs.push_back(node);
@@ -208,7 +216,8 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
      // Add write after write dependence
      ir::Node *upstream_op =
          (*it_old)->inputs.empty() ? nullptr : (*it_old)->inputs[0];
-      if (upstream_op) {
+      // TODO(zcd): Add a test.
+      if (upstream_op && upstream_op != write_op) {
        ir::Node *dep_var = CreateControlDepVar();
        write_op->inputs.push_back(dep_var);
        upstream_op->outputs.push_back(dep_var);

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -25,6 +25,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif

+#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
@@ -193,8 +194,14 @@ ParallelExecutor::ParallelExecutor(
      member_->local_scopes_, member_->use_cuda_, build_strategy);
 #endif

-  member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      exec_strategy, member_->local_scopes_, places, std::move(graph)));
+  if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
+    member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
+        exec_strategy, member_->local_scopes_, places, std::move(graph)));
+  } else {
+    member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
+        exec_strategy, member_->local_scopes_, places, std::move(graph)));
+  }
+
  member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor(
      exec_strategy, member_->local_scopes_, std::move(var_infos),
      member_->places_, std::move(member_->executor_)));

--- a/paddle/fluid/framework/program_desc.cc
+++ b/paddle/fluid/framework/program_desc.cc
@@ -55,11 +55,20 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {
    auto all_ops = blocks_[block_id]->AllOps();
    for (size_t op_id = 0; op_id < all_ops.size(); ++op_id) {
      auto &op = all_ops[op_id];
+
      for (const std::string &attr_name : op->AttrNames()) {
        if (op->GetAttrType(attr_name) == proto::AttrType::BLOCK) {
          int sub_block_id =
              o.Block(block_id).Op(op_id)->GetBlockAttrId(attr_name);
          op->SetBlockAttr(attr_name, MutableBlock(sub_block_id));
+        } else if (op->GetAttrType(attr_name) == proto::AttrType::BLOCKS) {
+          std::vector<int> sub_block_ids =
+              o.Block(block_id).Op(op_id)->GetBlocksAttrIds(attr_name);
+          std::vector<BlockDesc *> block_descs;  // same ids, resolved to this program's blocks
+          for (int id : sub_block_ids) {  // named `id` so it does not shadow the outer block_id
+            block_descs.push_back(MutableBlock(id));
+          }
+          op->SetBlocksAttr(attr_name, block_descs);
        }
      }
    }
@@ -68,24 +77,16 @@ ProgramDesc::ProgramDesc(const ProgramDesc &o) {

 ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
  desc_ = desc;
-  for (auto &block_desc : *desc_.mutable_blocks()) {
-    blocks_.emplace_back(new BlockDesc(this, &block_desc));
-  }
-  for (auto &block : blocks_) {
-    for (auto *op : block->AllOps()) {
-      for (const auto &attr : op->Proto()->attrs()) {
-        if (attr.type() == proto::AttrType::BLOCK) {
-          size_t blk_idx = attr.block_idx();
-          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
-        }
-      }
-    }
-  }
+  InitFromProto();
 }

 ProgramDesc::ProgramDesc(const std::string &binary_str) {
  PADDLE_ENFORCE(desc_.ParseFromString(binary_str),
                 "Fail to parse program_desc from binary string.");
+  InitFromProto();
+}
+
+void ProgramDesc::InitFromProto() {
  for (auto &block_desc : *desc_.mutable_blocks()) {
    blocks_.emplace_back(new BlockDesc(this, &block_desc));
  }
@@ -95,6 +96,13 @@ ProgramDesc::ProgramDesc(const std::string &binary_str) {
        if (attr.type() == proto::AttrType::BLOCK) {
          size_t blk_idx = attr.block_idx();
          op->SetBlockAttr(attr.name(), this->MutableBlock(blk_idx));
+        } else if (attr.type() == proto::AttrType::BLOCKS) {
+          auto blks_idx = attr.blocks_idx();
+          std::vector<BlockDesc *> block_descs;
+          for (int blk_idx : blks_idx) {
+            block_descs.push_back(this->MutableBlock(blk_idx));
+          }
+          op->SetBlocksAttr(attr.name(), block_descs);
        }
      }
    }

--- a/paddle/fluid/framework/program_desc.h
+++ b/paddle/fluid/framework/program_desc.h
@@ -76,6 +76,8 @@ class ProgramDesc {
  void SetFetchHolderName(const std::string &fetch_holder_name);

 private:
+  void InitFromProto();
+
  proto::ProgramDesc desc_;

  std::vector<std::unique_ptr<BlockDesc>> blocks_;

--- a/paddle/fluid/framework/program_desc_test.cc
+++ b/paddle/fluid/framework/program_desc_test.cc
@@ -42,6 +42,19 @@ TEST(ProgramDesc, copy_ctor) {
  out->SetType(proto::VarType::LOD_TENSOR);
  op->SetOutput("Y", {out->Name()});

+  BlockDesc* new_block = program.AppendBlock(*global_block);
+  op = new_block->AppendOp();
+  op->SetType("mul");
+
+  op = global_block->AppendOp();
+  op->SetType("op_with_subblock");
+  op->SetAttr("sub_block", new_block);
+
+  std::vector<BlockDesc*> sub_blocks;
+  sub_blocks.push_back(program.AppendBlock(*global_block));
+  sub_blocks.push_back(program.AppendBlock(*global_block));
+  op->SetAttr("sub_blocks", sub_blocks);
+
  ProgramDesc program_copy(program);

  auto* global_block_copy = program_copy.MutableBlock(0);
@@ -64,6 +77,8 @@ TEST(ProgramDesc, copy_ctor) {
  assert_same_var("Y", y);
  assert_same_var("Out", out);

+  bool found_sub_block = false;
+  bool found_sub_blocks = false;
  for (size_t i = 0; i < global_block->OpSize(); ++i) {
    auto op_origin = global_block->Op(i);
    auto op_copy = global_block_copy->Op(i);
@@ -74,8 +89,17 @@ TEST(ProgramDesc, copy_ctor) {

    ASSERT_EQ(op_copy->Proto()->SerializeAsString(),
              op_origin->Proto()->SerializeAsString());
-  }

+    if (op_copy->Type() == "op_with_subblock") {  // inspect the copied op, not the last op appended above
+      ASSERT_EQ(1, op_copy->GetBlockAttrId("sub_block"));
+      found_sub_block = true;
+
+      ASSERT_EQ(2UL, op_copy->GetBlocksAttrIds("sub_blocks").size());  // 2UL: size() is unsigned
+      found_sub_blocks = true;
+    }
+  }
+  ASSERT_TRUE(found_sub_block);
+  ASSERT_TRUE(found_sub_blocks);
  // Not check block's protostr are same it because the order of vars could be
  // different and it is correct.
 }

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -62,13 +62,13 @@ endif()

 if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
    # compile the libinference_anakin_api.a and anakin.so.
-    nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
-    #nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)
+    cc_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
+    cc_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
    function(anakin_target target_name)
      target_compile_options(${target_name} BEFORE PUBLIC ${ANAKIN_COMPILE_EXTRA_FLAGS})
    endfunction()
    anakin_target(inference_anakin_api)
-    #anakin_target(inference_anakin_api_shared)
+    anakin_target(inference_anakin_api_shared)
    if (WITH_TESTING)
        cc_test(inference_anakin_test SRCS api_anakin_engine_tester.cc
                ARGS --model=${ANAKIN_SOURCE_DIR}/mobilenet_v2.anakin.bin

--- a/paddle/fluid/inference/api/high_level_api_cn.md
+++ b/paddle/fluid/inference/api/high_level_api_cn.md
@@ -65,13 +65,13 @@ config.model_dir = "xxx";
 config.use_gpu = false;
 // 创建一个原生的 PaddlePredictor
 auto predictor =
-      paddle::CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config);
+      paddle::CreatePaddlePredictor<paddle::NativeConfig, paddle::PaddleEngineKind::kNative>(config);
 // 创建输入 tensor
 int64_t data[4] = {1, 2, 3, 4};
 paddle::PaddleTensor tensor{.name = "",
                            .shape = std::vector<int>({4, 1}),
-                            .data = PaddleBuf(data, sizeof(data)),
-                            .dtype = PaddleDType::INT64};
+                            .data = paddle::PaddleBuf(data, sizeof(data)),
+                            .dtype = paddle::PaddleDType::INT64};
 // 创建输出 tensor，输出 tensor 的内存可以复用
 std::vector<paddle::PaddleTensor> outputs;
 // 执行预测

--- a/paddle/fluid/inference/tensorrt/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt
-nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto)
+nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
 add_subdirectory(convert)
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -84,6 +84,15 @@ function(op_library TARGET)
        message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
    endif()

+    # Skip ops that are not supported on Windows: op_library() returns early for them.
+    if (WIN32)
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op")
+        if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
+          return()
+        endif()
+    endforeach()
+    endif(WIN32)
+
    list(LENGTH op_library_DEPS op_library_DEPS_len)
    if (${op_library_DEPS_len} GREATER 0)
        set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
@@ -181,19 +190,19 @@ function(op_library TARGET)
 endfunction()

 add_subdirectory(math)
+if (NOT WIN32)
 add_subdirectory(nccl)
-
 if(WITH_GPU)
    op_library(nccl_op DEPS nccl_common)
    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
 else()
    set(DEPS_OPS ${DEPS_OPS} nccl_op)
 endif()
+endif() # NOT WIN32

 set(DISTRIBUTE_DEPS "")
 if(WITH_DISTRIBUTE)
    add_subdirectory(distributed)
-    
    set(DISTRIBUTE_DEPS "")
    if(WITH_GRPC)
        set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node)
@@ -222,7 +231,7 @@ if(WITH_DISTRIBUTE)
    #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
    #        listen_and_serv_op sum_op executor SERIAL)
-    if(WITH_GPU)
+    if(WITH_GPU AND NOT WIN32)
        set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
        cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS listen_and_serv_op ${DISTRIBUTE_DEPS} executor SERIAL)
        if(WITH_GRPC)
@@ -233,7 +242,7 @@ if(WITH_DISTRIBUTE)
        set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    else()
        set(DEPS_OPS ${DEPS_OPS} gen_nccl_id_op)
-    endif()
+    endif() # WITH_GPU AND NOT WIN32
 else()
    set(DEPS_OPS ${DEPS_OPS}  checkpoint_notify_op prefetch_op recv_op listen_and_serv_op send_op send_barrier_op fetch_barrier_op gen_nccl_id_op)
 endif()
@@ -331,5 +340,7 @@ cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_sea
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
 cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
+if(NOT WIN32)
 nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+endif()
 nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
--- a/paddle/fluid/operators/conditional_block_op.cc
+++ b/paddle/fluid/operators/conditional_block_op.cc
@@ -29,9 +29,9 @@ class ConditionalOp : public framework::OperatorBase {

 protected:
  std::vector<const framework::LoDTensor *> InputTensors(
-      const framework::Scope &scope) const {
+      const framework::Scope &scope, const std::string &in_name) const {
    std::vector<const framework::LoDTensor *> retv;
-    auto xs = Inputs("X");
+    auto xs = Inputs(in_name);
    retv.resize(xs.size(), nullptr);
    std::transform(
        xs.begin(), xs.end(), retv.begin(),
@@ -81,12 +81,18 @@ class ConditionalBlockOp : public ConditionalOp {
 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &dev_place) const override {
-    auto xs = InputTensors(scope);
-
    bool need_run;
    if (Attr<bool>("is_scalar_condition")) {
+      // When is_scalar_condition is True, the conditional variable is a scalar,
+      // whether need to execute the operators in sub-block depends on the
+      // conditional variable (Cond).
+      auto xs = InputTensors(scope, "Cond");
      need_run = ScalarCondition(xs);
    } else {
+      // When is_scalar_condition is False, the conditional variable maybe a
+      // vector or tensor, whether need to execute the operators in sub-block
+      // depends on the input variables (Input).
+      auto xs = InputTensors(scope, "Input");
      need_run = std::all_of(
          xs.begin(), xs.end(),
          [](const framework::LoDTensor *t) { return t->numel() != 0; });
@@ -110,11 +116,11 @@ class ConditionalBlockOp : public ConditionalOp {
 class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
-    AddInput("X",
-             "The conditional variable of this operator. If X is empty, the "
+    AddInput("Cond",
+             "The conditional variable of this operator. If Cond is empty, the "
             "whole sub-block will not be executed.")
        .AsDuplicable();
-    AddInput("Params", "The input variables of the sub-block.").AsDuplicable();
+    AddInput("Input", "The input variables of the sub-block.").AsDuplicable();
    AddOutput("Out", "The output variables of the sub-block.").AsDuplicable();
    AddOutput("Scope",
              "(std::vector<Scope*>) The step scope of conditional block. To "
@@ -123,13 +129,18 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<framework::BlockDesc *>(
        "sub_block", "The step block of conditional block operator");
    AddAttr<bool>("is_scalar_condition",
-                  "the input X is used as scalar "
-                  "condition")
+                  "The conditional variable (Cond) is used as scalar "
+                  "condition.")
        .SetDefault(false);
    AddComment(R"DOC(Conditional block operator

-Run the sub-block if X is not empty. Params is the other inputs and Out is the
-outputs of the sub-block.
+If `is_scalar_condition` is True, the conditional variable (Cond) is a scalar,
+run the operators in sub-block if Cond is True.
+
+If `is_scalar_condition` is False, the conditional variable (Cond) is a vector or
+tensor, run the operators in sub-block if all of input variables are not empty.
+
+
 )DOC");
  }
 };
@@ -145,12 +156,12 @@ class ConditionalBlockGradOp : public ConditionalOp {
 private:
  void RunImpl(const framework::Scope &scope,
               const platform::Place &dev_place) const override {
-    auto xs = this->InputTensors(scope);
-
    bool need_run;
    if (Attr<bool>("is_scalar_condition")) {
+      auto xs = this->InputTensors(scope, "Cond");
      need_run = ScalarCondition(xs);
    } else {
+      auto xs = this->InputTensors(scope, "Input");
      need_run = std::all_of(
          xs.begin(), xs.end(),
          [](const framework::LoDTensor *t) { return t->numel() != 0; });
@@ -166,11 +177,11 @@ class ConditionalBlockGradOp : public ConditionalOp {
      auto *block = Attr<framework::BlockDesc *>("sub_block");
      exec.Run(*block->Program(), &cur_scope, block->ID(), false);

-      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"),
-                                  Outputs(framework::GradVarName("Params")));
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Input"),
+                                  Outputs(framework::GradVarName("Input")));

-      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"),
-                                  Outputs(framework::GradVarName("X")));
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Cond"),
+                                  Outputs(framework::GradVarName("Cond")));
    }
  }

@@ -199,15 +210,15 @@ class ConditionalBlockGradOp : public ConditionalOp {
 class ConditionalBlockGradInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInputs("X"));
-    if (context->HasInputs("Params")) {
-      PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params")));
-      context->SetOutputsDim(framework::GradVarName("Params"),
-                             context->GetInputsDim("Params"));
+    PADDLE_ENFORCE(context->HasInputs("Cond"));
+    if (context->HasInputs("Input")) {
+      PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Input")));
+      context->SetOutputsDim(framework::GradVarName("Input"),
+                             context->GetInputsDim("Input"));
    }
-    if (context->HasOutputs(framework::GradVarName("X"))) {
-      context->SetOutputsDim(framework::GradVarName("X"),
-                             context->GetInputsDim("X"));
+    if (context->HasOutputs(framework::GradVarName("Cond"))) {
+      context->SetOutputsDim(framework::GradVarName("Cond"),
+                             context->GetInputsDim("Cond"));
    }
  }
 };
@@ -220,14 +231,15 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
  std::unique_ptr<framework::OpDesc> Apply() const override {
    auto grad_op = new framework::OpDesc();
    grad_op->SetType("conditional_block_grad");
-    grad_op->SetInput("X", Input("X"));
-    grad_op->SetInput("Params", Input("Params"));
+    grad_op->SetInput("Cond", Input("Cond"));
+    grad_op->SetInput("Input", Input("Input"));
    grad_op->SetInput("Out", Output("Out"));
    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
    grad_op->SetInput("Scope", Output("Scope"));
-    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
-    grad_op->SetOutput(framework::GradVarName("Params"),
-                       InputGrad("Params", false));
+    grad_op->SetOutput(framework::GradVarName("Cond"),
+                       InputGrad("Cond", false));
+    grad_op->SetOutput(framework::GradVarName("Input"),
+                       InputGrad("Input", false));
    grad_op->SetBlockAttr("sub_block", this->grad_block_[0]);
    grad_op->SetAttr("is_scalar_condition", GetAttr("is_scalar_condition"));
    return std::unique_ptr<framework::OpDesc>(grad_op);

--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -85,6 +85,199 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
    int* track_value =
        track.mutable_data<int>(emission_dims, platform::CPUPlace());

+#ifdef __AVX__
+// Use AVX or AVX512 instructions to process the data as vectors of 8 or 16
+// elements per iteration, so that several tags are handled in parallel.
+// Only the float type is optimized.
+#ifdef __AVX512F__
+    size_t step_size = 16;
+#else
+    size_t step_size = 8;
+#endif
+    if (std::is_same<T, float>::value && (tag_num >= step_size)) {
+      size_t steps = tag_num / step_size;
+      size_t remain = tag_num % step_size;
+      int last_offset = static_cast<int>(remain) - static_cast<int>(step_size);
+
+      // Setup the alpha initial value.
+      size_t i_offset = 0;
+      for (size_t i = 0; i <= steps; ++i) {
+#ifdef __AVX512F__
+        // Declare the variable for the content of weights, input and alpha
+        // values.
+        __m512 w_content, x_content, alpha_content;
+
+        // Load the relevant data into the variables from un-aligned address.
+        w_content = _mm512_loadu_ps((const float*)(w + i_offset));
+        x_content = _mm512_loadu_ps((const float*)(x + i_offset));
+        alpha_content = _mm512_add_ps(w_content, x_content);
+
+        // Save the alpha value.
+        _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
+                         alpha_content);
+#else
+        // Declare the variable for the content of weights, input and alpha
+        // values.
+        __m256 w_content, x_content, alpha_content;
+
+        // Load the relevant data into the variables from un-aligned address.
+        w_content = _mm256_loadu_ps((const float*)(w + i_offset));
+        x_content = _mm256_loadu_ps((const float*)(x + i_offset));
+        alpha_content = _mm256_add_ps(w_content, x_content);
+
+        // Save the alpha value.
+        _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + i_offset),
+                         alpha_content);
+#endif
+        i_offset += step_size;
+        if (i == steps - 1) {
+          if (remain > 0) {
+            i_offset += last_offset;
+          } else {
+            break;
+          }
+        }
+      }
+
+      // Use the column-major strategy to get the location of maximum score.
+      size_t seq_offset = 0;
+      for (size_t k = 1; k < seq_len; ++k) {
+        size_t j_offset = 0;
+        for (size_t j = 0; j <= steps; ++j) {
+#ifdef __AVX512F__
+          // Initialize the variables of maximum score and location.
+          __m512 max_score = _mm512_set1_ps(-std::numeric_limits<T>::max());
+          __m512i max_j = _mm512_setzero_si512();
+#else
+          // Initialize the variables of maximum score and location.
+          __m256 max_score = _mm256_set1_ps(-std::numeric_limits<T>::max());
+          __m256i max_j = _mm256_set1_epi32(0);
+#endif
+          // Calculate the offset of transition_weights.
+          size_t trans_offset = state_trans_base_idx * tag_num + j_offset;
+          for (size_t i = 0; i < tag_num; ++i) {
+#ifdef __AVX512F__
+            // Initialize the content of alpha variable with related offset.
+            __m512 alpha_content =
+                _mm512_set1_ps(*(const float*)(alpha_value + seq_offset + i));
+            // Obtain the content of weights from un-aligned address.
+            __m512 w_content =
+                _mm512_loadu_ps((const float*)(w + trans_offset));
+
+            __m512 score_v = _mm512_add_ps(alpha_content, w_content);
+
+            __mmask16 mask = _mm512_cmp_ps_mask(score_v, max_score, _CMP_GT_OS);
+
+            // According to the mask value, update the index of the max_score
+            // location.
+            max_j = _mm512_mask_set1_epi32(max_j, mask, i);
+
+            // Update the max_score value.
+            max_score = _mm512_max_ps(max_score, score_v);
+#else
+            // Initialize the content of alpha variable with related offset.
+            __m256 alpha_content = _mm256_broadcast_ss(
+                (const float*)(alpha_value + seq_offset + i));
+            // Obtain the content of weights from un-aligned address.
+            __m256 w_content =
+                _mm256_loadu_ps((const float*)(w + trans_offset));
+            __m256 score_v = _mm256_add_ps(alpha_content, w_content);
+
+            __m256 mask = _mm256_cmp_ps(score_v, max_score, _CMP_GT_OS);
+
+#ifdef __AVX2__
+            // According to the mask value, update the index of the max_score
+            // location.
+            max_j = _mm256_or_si256(
+                _mm256_andnot_si256((__m256i)mask, max_j),
+                _mm256_and_si256((__m256i)mask, _mm256_set1_epi32(i)));
+#else
+            __m128i lo_max_j = _mm256_extractf128_si256(max_j, 0);
+            __m128i hi_max_j = _mm256_extractf128_si256(max_j, 1);
+            __m128i lo_mask = _mm256_extractf128_si256((__m256i)mask, 0);
+            __m128i hi_mask = _mm256_extractf128_si256((__m256i)mask, 1);
+
+            lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j);
+            hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j);
+            lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i));
+            hi_mask = _mm_and_si128(hi_mask, _mm_set1_epi32(i));
+
+            lo_max_j = _mm_or_si128(lo_mask, lo_max_j);
+            hi_max_j = _mm_or_si128(hi_mask, hi_max_j);
+
+            // According to the mask value, update the index of the max_score
+            // location.
+            max_j = _mm256_insertf128_si256(max_j, lo_max_j, 0);
+            max_j = _mm256_insertf128_si256(max_j, hi_max_j, 1);
+#endif
+
+            // Update the max_score value.
+            max_score = _mm256_max_ps(max_score, score_v);
+#endif
+            trans_offset += tag_num;
+          }
+
+#ifdef __AVX512F__
+          // Update the alpha and track values.
+          __m512 x_content = _mm512_loadu_ps(
+              (const float*)(x + seq_offset + tag_num + j_offset));
+          max_score = _mm512_add_ps(max_score, x_content);
+          _mm512_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
+                                                    tag_num + j_offset),
+                           max_score);
+          _mm512_storeu_si512(
+              reinterpret_cast<__m512i*>(track_value + seq_offset + tag_num +
+                                         j_offset),
+              max_j);
+#else
+          // Update the alpha and track values.
+          __m256 x_content = _mm256_loadu_ps(
+              (const float*)(x + seq_offset + tag_num + j_offset));
+          max_score = _mm256_add_ps(max_score, x_content);
+          _mm256_storeu_ps(reinterpret_cast<float*>(alpha_value + seq_offset +
+                                                    tag_num + j_offset),
+                           max_score);
+          _mm256_storeu_si256(
+              reinterpret_cast<__m256i*>(track_value + seq_offset + tag_num +
+                                         j_offset),
+              max_j);
+#endif
+
+          // Calculate the offset of next step
+          j_offset += step_size;
+          if (j == steps - 1) {
+            if (remain > 0) {
+              j_offset += last_offset;
+            } else {
+              break;
+            }
+          }
+        }
+
+        seq_offset += tag_num;
+      }
+    } else {
+      for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
+
+      for (size_t k = 1; k < seq_len; ++k) {
+        for (size_t i = 0; i < tag_num; ++i) {
+          T max_score = -std::numeric_limits<T>::max();
+          int max_j = 0;
+          for (size_t j = 0; j < tag_num; ++j) {
+            T score = alpha_value[(k - 1) * tag_num + j] +
+                      w[(j + state_trans_base_idx) * tag_num + i];
+            if (score > max_score) {
+              max_score = score;
+              max_j = j;
+            }
+          }
+
+          alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
+          track_value[k * tag_num + i] = max_j;
+        }
+      }
+    }
+#else
    for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];

    for (size_t k = 1; k < seq_len; ++k) {
@@ -105,6 +298,7 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
      }
    }

+#endif
    T max_score = -std::numeric_limits<T>::max();
    int max_i = 0;
    for (size_t i = 0; i < tag_num; ++i) {

--- a/paddle/fluid/operators/elementwise_op_function.h
+++ b/paddle/fluid/operators/elementwise_op_function.h
@@ -80,6 +80,9 @@ inline framework::DDim trim_trailing_singular_dims(
  for (int i = 0; i < actual_dims_size; ++i) {
    trim_dims[i] = dims[i];
  }
+  if (trim_dims.size() == 0) {
+    return framework::DDim(framework::make_dim());
+  }
  framework::DDim actual_dims = framework::make_ddim(trim_dims);
  return actual_dims;
 }

--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -15,8 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/fc_op.h"
 #include <vector>
 #include "paddle/fluid/operators/math/blas.h"
-
-DECLARE_int32(paddle_num_threads);
+#include "paddle/fluid/operators/math/fc_compute.h"

 namespace paddle {
 namespace operators {
@@ -110,13 +109,8 @@ void FCOpMaker::Make() {
  AddComment(R"DOC(
  Fully Connected Operator.

-  The fully connected operation calculates the output based on the input, weights and bias attribute.
+  The fully connected operation calculates the output based on the input, weights and bias.
  The size of each dimension of the parameters checked in the infer-shape.
-  The matrix of bias is generated by the mkldnn framework, when the bias_attr is True.
-  Additional parametrs are use_mkldnn and bias_attr.
-  The input(X) size and output(Out) size may be diffrent.
-
-  The fully connected layer only supports MKLDNN version
 )DOC");
 }

@@ -133,26 +127,15 @@ class FCOpKernel : public framework::OpKernel<T> {
    auto in_dims = input->dims();
    auto w_dims = w->dims();

-    auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(dev_ctx);
    const T* input_data = input->data<T>();
    const T* w_data = w->data<T>();
    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+    math::FCCompute<platform::CPUDeviceContext, T>(
+        blas, in_dims[0], w_dims[1], w_dims[0], input_data, w_data, output_data,
+        bias ? bias->data<T>() : NULL);

-    blas.GEMM(CblasNoTrans, CblasNoTrans, in_dims[0], w_dims[1], w_dims[0],
-              static_cast<T>(1), input_data, w_data, static_cast<T>(0),
-              output_data);
-
-    if (bias) {
-      const T* bias_data = bias->data<T>();
-#ifdef PADDLE_WITH_MKLML
-#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
-#endif
-      for (int bs = 0; bs < in_dims[0]; bs++) {
-        blas.AXPY(w_dims[1], static_cast<T>(1), bias_data,
-                  output_data + bs * w_dims[1]);
-      }
-    }
+    // TODO(TJ): fuse act
  }
 };


--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fusion_lstm_op.h"
+#include <string>
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/detail/activation_functions.h"
+#include "paddle/fluid/operators/math/fc_compute.h"
+#include "paddle/fluid/operators/math/lstm_compute.h"
+#include "paddle/fluid/operators/math/sequence2batch.h"
+
+namespace paddle {
+namespace operators {
+
+// Validates the inputs/outputs of the fusion_lstm operator and infers the
+// output shapes.
+//
+// With X of shape (T x M) and WeightX of shape (M x 4D), it checks that
+// WeightH is (D x 4D) and Bias is (1 x 4D), rejects peephole connections
+// (not supported yet) and sets:
+//   Hidden, Cell, BatchCellPreAct : (T x D)
+//   BatchedGate                   : (T x 4D)
+//   XX                            : (T x min(M, 4D))
+// Hidden, Cell and XX share the LoD of X.
+void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("WeightX"),
+                 "Input(WeightX) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
+                 "Input(WeightH) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                 "Input(Bias) of LSTM should not be null.");
+
+  PADDLE_ENFORCE(ctx->HasOutput("XX"),
+                 "Output(XX) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
+                 "Output(Hidden) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
+                 "Output(Cell) of LSTM should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("BatchedGate"),
+                 "Output(BatchedGate) of LSTM should not be null.");
+  // The message must name the output that is actually being checked.
+  PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
+                 "Output(BatchCellPreAct) of LSTM should not be null.");
+
+  auto x_dims = ctx->GetInputDim("X");
+  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
+
+  // H0 may only be given together with C0, and both must have the same dims.
+  if (ctx->HasInput("H0")) {
+    PADDLE_ENFORCE(ctx->HasInput("C0"),
+                   "Input(Cell) and Input(Hidden) of LSTM should not "
+                   "be null at the same time.");
+    auto h_dims = ctx->GetInputDim("H0");
+    auto c_dims = ctx->GetInputDim("C0");
+    PADDLE_ENFORCE(h_dims == c_dims,
+                   "The dimension of Input(H0) and Input(C0) "
+                   "should be the same.");
+  }
+
+  auto wx_dims = ctx->GetInputDim("WeightX");
+  PADDLE_ENFORCE_EQ(wx_dims.size(), 2,
+                    "The rank of Input(WeightX) should be 2.");
+  PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
+                    "The first dimension of Input(WeightX) "
+                    "should be %d.",
+                    x_dims[1]);
+  // frame_size is derived by integer division below; without this check a
+  // width that is not a multiple of 4 would be truncated silently, leaving
+  // BatchedGate wider than the (1 x 4D) bias that is added to each row.
+  PADDLE_ENFORCE_EQ(wx_dims[1] % 4, 0,
+                    "The second dimension of Input(WeightX) should be a "
+                    "multiple of 4, but received %d.",
+                    wx_dims[1]);
+
+  int frame_size = wx_dims[1] / 4;
+  auto wh_dims = ctx->GetInputDim("WeightH");
+  PADDLE_ENFORCE_EQ(wh_dims.size(), 2,
+                    "The rank of Input(WeightH) should be 2.");
+  PADDLE_ENFORCE_EQ(wh_dims[0], frame_size,
+                    "The first dimension of Input(WeightH) "
+                    "should be %d.",
+                    frame_size);
+  PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size,
+                    "The second dimension of Input(WeightH) "
+                    "should be 4 * %d.",
+                    frame_size);
+
+  auto b_dims = ctx->GetInputDim("Bias");
+  PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
+  PADDLE_ENFORCE_EQ(b_dims[0], 1,
+                    "The first dimension of Input(Bias) should be 1.");
+
+  PADDLE_ENFORCE(!ctx->Attrs().Get<bool>("use_peepholes"),
+                 "Do not support peephole yet.");
+  PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
+                    "The second dimension of Input(Bias) should be "
+                    "4 * %d if disable peepholes connection",
+                    frame_size);
+
+  framework::DDim out_dims({x_dims[0], frame_size});
+  ctx->SetOutputDim("Hidden", out_dims);
+  ctx->SetOutputDim("Cell", out_dims);
+  ctx->SetOutputDim("BatchedGate", {x_dims[0], wx_dims[1]});
+  ctx->SetOutputDim("BatchCellPreAct", out_dims);
+  ctx->ShareLoD("X", "Hidden");
+  ctx->ShareLoD("X", "Cell");
+
+  // XX takes the narrower of the two widths (M or 4D).
+  int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
+  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
+  ctx->ShareLoD("X", "XX");
+}
+
+// Selects the kernel from the data type of Input(X) and the device context
+// the operator is executed with.
+framework::OpKernelType FusionLSTMOp::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  const auto* x_tensor = ctx.Input<framework::LoDTensor>("X");
+  auto x_data_type = framework::ToDataType(x_tensor->type());
+  return framework::OpKernelType(x_data_type, ctx.device_context());
+}
+
+// Declares the inputs, outputs, attributes and documentation of the
+// fusion_lstm operator.
+//
+// Adjacent string literals are concatenated by the compiler without any
+// separator, so every fragment that ends a sentence carries its own trailing
+// space.
+void FusionLSTMOpMaker::Make() {
+  AddInput("X",
+           "(LoDTensor) the input is a LodTensor, which support "
+           "variable-time length input sequence. The underlying tensor in "
+           "this LoDTensor is a matrix with shape (T X M), where T is the "
+           "total time steps in this mini-batch, M is the dim size of x.");
+  AddInput("WeightX",
+           "(Tensor) the learnable weights of X."
+           " - The shape is (M x 4D), where M is the dim size of x, D is the "
+           "hidden size. "
+           " - Weight = {W_cx, W_ix, W_fx, W_ox}");
+  AddInput("WeightH",
+           "(Tensor) same as LSTMOp, the learnable hidden-hidden weights."
+           " - The shape is (D x 4D), where D is the hidden size. "
+           " - Weight = {W_ch, W_ih, W_fh, W_oh}");
+  AddInput("Bias",
+           "(Tensor) the learnable weights. Almost same as LSTMOp. "
+           "Note: we should add the fc bias into this (1x4D) in bias. "
+           "input-hidden bias weight and peephole connections weight if "
+           "setting `use_peepholes` True. "
+           "1. `use_peepholes = False` "
+           " - The shape is (1 x 4D). "
+           " - Bias = {b_c, b_i, b_f, b_o}. "
+           "2. `use_peepholes = True` "
+           " - The shape is (1 x 7D). "
+           " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
+  AddInput("H0",
+           "(Tensor, optional) (same as LSTMOp) the initial hidden state is an "
+           "optional "
+           "input. This is a tensor with shape (N x D), where N is the "
+           "batch size and D is the hidden size.")
+      .AsDispensable();
+  AddInput("C0",
+           "(Tensor, optional) (same as LSTMOp) the initial cell state is an "
+           "optional "
+           "input. This is a tensor with shape (N x D), where N is the "
+           "batch size. `H0` and `C0` can be NULL but only at the same time.")
+      .AsDispensable();
+  AddOutput("Hidden",
+            "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. "
+            "The shape is (T x D), and lod is the same with the `Input`.");
+  AddOutput("Cell",
+            "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. "
+            "The shape is (T x D), and lod is the same with the `Input`.");
+  AddOutput("XX",
+            "(LoDTensor) the result after X * WeightX (size is T x 4D)"
+            " or batched_X (size is T x M), this will be automatically chosen,"
+            " where T is the total time steps in this mini-batch,"
+            " D is the hidden size, M is the dim size of x input.")
+      .AsIntermediate();
+  AddOutput("BatchedGate", "(LoDTensor) (same as LSTMOp).").AsIntermediate();
+  AddOutput("BatchCellPreAct", "(LoDTensor) (same as LSTMOp).")
+      .AsIntermediate();
+  // NOTE(review): the default is true, yet FusionLSTMOp::InferShape enforces
+  // that use_peepholes is false ("Do not support peephole yet."), so callers
+  // currently have to pass use_peepholes=false explicitly. Confirm whether
+  // the default should change until peepholes are supported.
+  AddAttr<bool>("use_peepholes",
+                "(bool, default: True) "
+                "whether to enable diagonal/peephole connections.")
+      .SetDefault(true);
+  AddAttr<bool>("is_reverse",
+                "(bool, default: False) "
+                "whether to compute reversed LSTM.")
+      .SetDefault(false);
+  AddAttr<std::string>("gate_activation",
+                       "(string, default: sigmoid) "
+                       "The activation for input gate, forget gate and output "
+                       "gate, `sigmoid` by default.")
+      .SetDefault("sigmoid")
+      .InEnum({"sigmoid", "tanh", "relu", "identity"});
+  AddAttr<std::string>("cell_activation",
+                       "(string, default: tanh) "
+                       "The activation for cell output, `tanh` by default.")
+      .SetDefault("tanh")
+      .InEnum({"sigmoid", "tanh", "relu", "identity"});
+  AddAttr<std::string>("candidate_activation",
+                       "(string, default: tanh) "
+                       "The activation for candidate hidden state, "
+                       "`tanh` by default.")
+      .SetDefault("tanh")
+      .InEnum({"sigmoid", "tanh", "relu", "identity"});
+  AddComment(R"DOC(
+Fusion Long-Short Term Memory (LSTM) Operator.
+This operator fuse the X into LSTM, more details can refer to LSTM op.
+)DOC");
+}
+
+// Allocates `dst` with the dims of `src` on the place of `ctx`, then lets
+// CopyMatrixRowsFunctor copy the rows of `src` into `dst` following
+// `index_lod`; `indexed_src` is forwarded to the functor unchanged.
+template <typename DeviceContext, typename T>
+inline void ReorderInitState(const DeviceContext& ctx,
+                             const framework::Tensor& src,
+                             framework::Vector<size_t> index_lod,
+                             framework::Tensor* dst, bool indexed_src) {
+  // The destination buffer must exist before the rows are copied into it.
+  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+  // TODO(TJ): check mem copy perf
+  math::CopyMatrixRowsFunctor<DeviceContext, T> copy_rows;
+  copy_rows(ctx, src, index_lod, dst, indexed_src);
+}
+
+// Kernel of the fusion_lstm operator.
+//
+// Compute() first produces BatchedGate = batch-reordered (X * WeightX + Bias)
+// through math::FCCompute and LoDTensor2BatchFunctor, then walks the batch
+// steps, adding the hidden-to-hidden product (previous hidden * WeightH) to
+// the gates and calling math::LstmUnitFunctor, and finally converts the
+// batched hidden/cell results back into the Hidden and Cell outputs.
+// Peephole connections are not used (check_ig/check_fg/check_og are null).
+template <typename DeviceContext, typename T>
+class FuisonLSTMKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* wx = ctx.Input<Tensor>("WeightX");
+    auto* wh = ctx.Input<Tensor>("WeightH");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    // H0 and C0 are dispensable inputs; both pointers are checked before use.
+    auto* hidden_t0 = ctx.Input<Tensor>("H0");
+    auto* cell_t0 = ctx.Input<Tensor>("C0");
+
+    auto* xx = ctx.Output<LoDTensor>("XX");
+    auto* batched_gate = ctx.Output<LoDTensor>("BatchedGate");
+    auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
+    auto* cell_out = ctx.Output<LoDTensor>("Cell");
+    bool is_reverse = ctx.Attr<bool>("is_reverse");
+
+    T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
+    T* batched_gate_data = batched_gate->mutable_data<T>(ctx.GetPlace());
+    hidden_out->mutable_data<T>(ctx.GetPlace());
+    cell_out->mutable_data<T>(ctx.GetPlace());
+
+    const T* x_data = x->data<T>();
+    const T* wx_data = wx->data<T>();
+    auto x_dims = x->dims();
+    auto wx_dims = wx->dims();
+
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    // The batch reordering is applied to whichever tensor has fewer columns:
+    // the projected result when X is wider than WeightX's output, X itself
+    // otherwise. XX holds the intermediate in both cases.
+    if (x_dims[1] > wx_dims[1]) {
+      // Project first (XX = X * WeightX + Bias), then reorder XX into
+      // BatchedGate.
+      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
+                                        x_data, wx_data, xx_data,
+                                        bias->data<T>());
+      to_batch(dev_ctx, *xx, batched_gate, true, is_reverse);
+    } else {
+      // Reorder X into XX first, then project XX into BatchedGate. The LoD
+      // produced by to_batch is copied over because FCCompute only writes
+      // raw data.
+      to_batch(dev_ctx, *x, xx, true, is_reverse);
+      batched_gate->set_lod(xx->lod());
+      math::FCCompute<DeviceContext, T>(blas, x_dims[0], wx_dims[1], x_dims[1],
+                                        xx_data, wx_data, batched_gate_data,
+                                        bias->data<T>());
+    }
+
+    int frame_size = static_cast<int>(wx_dims[1] / 4);
+    framework::DDim out_dims({x_dims[0], frame_size});
+    math::LstmMetaValue<T> lstm_value;
+    // no peephole
+    lstm_value.check_ig = nullptr;
+    lstm_value.check_fg = nullptr;
+    lstm_value.check_og = nullptr;
+    lstm_value.prev_state_value = nullptr;
+    Tensor ordered_c0;
+
+    // Level 2 of the batched LoD is used as the row index for reordering the
+    // initial states (presumably the sequence order chosen by to_batch --
+    // confirm against LoDTensor2BatchFunctor).
+    framework::Vector<size_t> order(batched_gate->lod()[2]);
+
+    if (cell_t0) {
+      // Since the batch computing for LSTM reorders the input sequence
+      // according to their length. The initialized cell state also needs
+      // to reorder.
+      ReorderInitState<DeviceContext, T>(dev_ctx, *cell_t0, order, &ordered_c0,
+                                         true);
+      lstm_value.prev_state_value = ordered_c0.data<T>();
+    }
+
+    // Use the local variable as here.
+    LoDTensor batch_hidden, batch_cell;
+    auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
+    batch_hidden.mutable_data<T>(out_dims, ctx.GetPlace());
+    batch_cell.mutable_data<T>(out_dims, ctx.GetPlace());
+    batch_cell_pre_act->mutable_data<T>(out_dims, ctx.GetPlace());
+
+    // Level 0 of the batched LoD holds the row offsets of every batch step,
+    // so it has one more entry than there are steps.
+    auto batch_starts = batched_gate->lod()[0];
+    size_t max_seq_len = batch_starts.size() - 1;
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
+
+    for (size_t n = 0; n < max_seq_len; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+
+      // Row slices [bstart, bend) of the batched tensors for this step.
+      Tensor gate_t = batched_gate->Slice(bstart, bend);
+      Tensor out_t = batch_hidden.Slice(bstart, bend);
+      Tensor cell_t = batch_cell.Slice(bstart, bend);
+      Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
+
+      int cur_batch_size = bend - bstart;
+
+      if (n > 0) {
+        // Only the first cur_batch_size rows of the previous step's hidden
+        // state are used.
+        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+        int pre_h_end = pre_h_start + cur_batch_size;
+        auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
+        // TODO(TJ): use gemm directly
+        // gate_t += pre_hidden_t * WeightH
+        blas.MatMul(pre_hidden_t, false, *wh, false, static_cast<T>(1.0),
+                    &gate_t, static_cast<T>(1.0));
+      } else if (hidden_t0) {
+        // TODO(TJ): move h0 outside for
+        // If n == 0 and there is no initialized hidden state, that is to say
+        // the H0 is zeros, the calculation W_h * H0 will be skipped.
+        // If n == 0 and there is initialized hidden state, calculate W_h * H0.
+
+        // Since the batch computing for LSTM reorders the input sequence
+        // according to their length. The initialized hidden state also needs
+        // to reorder.
+        Tensor ordered_h0;
+        ReorderInitState<DeviceContext, T>(dev_ctx, *hidden_t0, order,
+                                           &ordered_h0, true);
+        // TODO(TJ): use gemm directly
+        blas.MatMul(ordered_h0, false, *wh, false, static_cast<T>(1.0), &gate_t,
+                    static_cast<T>(1.0));
+      }
+
+      lstm_value.gate_value = gate_t.data<T>();
+      lstm_value.output_value = out_t.data<T>();
+      lstm_value.state_value = cell_t.data<T>();
+      lstm_value.state_active_value = cell_pre_act_t.data<T>();
+      math::LstmUnitFunctor<DeviceContext, T>::compute(
+          dev_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act,
+          cand_act);
+      // The cell state just written becomes the previous state of the next
+      // step.
+      lstm_value.prev_state_value = lstm_value.state_value;
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batch_hidden.set_lod(batched_gate->lod());
+    // restore the output hidden in LoDTensor from the batch hidden
+    to_seq(dev_ctx, batch_hidden, hidden_out);
+
+    batch_cell.set_lod(batched_gate->lod());
+    // restore the output cell state in LoDTensor from the batch cell
+    to_seq(dev_ctx, batch_cell, cell_out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Register the fusion_lstm operator together with its proto maker and the
+// default grad-op-desc maker.
+REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+
+// Register the CPU kernels of fusion_lstm for float and double.
+REGISTER_OP_CPU_KERNEL(
+    fusion_lstm,
+    ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/fusion_lstm_op.h
+++ b/paddle/fluid/operators/fusion_lstm_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+// #include <string>
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+// Operator class of fusion_lstm. Shape inference and kernel selection are
+// declared here and defined in fusion_lstm_op.cc.
+class FusionLSTMOp : public framework::OperatorWithKernel {
+ public:
+  // Inherit the constructors of OperatorWithKernel.
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Checks the inputs/outputs and sets the output dims.
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  // Returns the kernel type this operator should be run with.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+// Proto maker of fusion_lstm; Make() is defined in fusion_lstm_op.cc.
+class FusionLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/fc_compute.h
+++ b/paddle/fluid/operators/math/fc_compute.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/fluid/operators/math/blas.h"
+
+DECLARE_int32(paddle_num_threads);
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Fully-connected computation: Y = X * W, optionally followed by adding the
+// bias B to every row of Y.
+//
+//   blas : BLAS wrapper providing GEMM and AXPY.
+//   M, N, K : sizes passed to GEMM (non-transposed X and W); Y is treated as
+//             M rows of N elements when the bias is added.
+//   X, W : input and weight data.
+//   Y    : output data; overwritten (GEMM is called with beta = 0).
+//   B    : optional bias of N elements; skipped when null.
+template <typename DeviceContext, typename T>
+inline void FCCompute(const BlasT<DeviceContext, T>& blas, const int M,
+                      const int N, const int K, const T* X, const T* W, T* Y,
+                      const T* B = nullptr) {
+  blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1), X, W,
+            static_cast<T>(0), Y);
+  if (B == nullptr) {
+    return;
+  }
+// Each row of Y is updated independently, so the rows may be processed in
+// parallel when more than one thread is configured.
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
+#endif
+  for (int i = 0; i < M; i++) {
+    blas.AXPY(N, static_cast<T>(1), B, Y + i * N);
+  }
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/nccl/CMakeLists.txt
+++ b/paddle/fluid/operators/nccl/CMakeLists.txt
-if(WITH_GPU)
+if(WITH_GPU AND NOT WIN32)
  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator )
 endif()
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -23,9 +23,9 @@ class SqueezeOpInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SqueezeOp should not be null.");
+                   "Input(X) of Squeeze operator should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SqueezeOp should not be null.");
+                   "Output(Out) of Squeeze operator should not be null.");

    const auto &x_dims = ctx->GetInputDim("X");
    // Check input tensor dims (<6) Eigen limit.
@@ -107,7 +107,6 @@ class SqueezeOp : public framework::OperatorBase {

    framework::AttributeMap attrs;
    attrs["shape"] = framework::vectorize2int(out_dims);
-    attrs["inplace"] = Attr<bool>("inplace");
    // Invoke Reshape Op
    auto reshape_op = framework::OpRegistry::CreateOp(
        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
@@ -125,12 +124,6 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
                              "(std::vector<int>). List of integers,"
                              " indicating the dimensions to squeeze.")
        .SetDefault({});
-    AddAttr<bool>("inplace",
-                  "(default: false) Squeeze the source tensor's shape without "
-                  "memory copy. When Attr(inplace) is set true, the output "
-                  "tensor shares memory with Input(X), otherwise, a new output "
-                  "tensor is created, and its data are copied from Input(x).")
-        .SetDefault(false);
    AddComment(R"DOC(
        Squeeze Operator.
        
@@ -180,7 +173,6 @@ class SqueezeGradOp : public framework::OperatorBase {
    auto x_dims = scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
    framework::AttributeMap attrs;
    attrs["shape"] = framework::vectorize2int(x_dims);
-    attrs["inplace"] = Attr<bool>("inplace");

    auto reshape_op = framework::OpRegistry::CreateOp(
        "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},

--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -23,9 +23,9 @@ class UnsqueezeOpInferShape : public framework::InferShapeBase {
 public:
  void operator()(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of UnsqueezeOp should not be null.");
+                   "Input(X) of Unsqueeze operator should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of UnsqueezeOp should not be null.");
+                   "Output(Out) of Unsqueeze operator should not be null.");

    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
    const auto &x_dims = ctx->GetInputDim("X");
@@ -95,7 +95,6 @@ class UnsqueezeOp : public framework::OperatorBase {

    framework::AttributeMap attrs;
    attrs["shape"] = framework::vectorize2int(out_dims);
-    attrs["inplace"] = Attr<bool>("inplace");
    // Invoke Reshape op.
    auto reshape_op = framework::OpRegistry::CreateOp(
        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
@@ -126,13 +125,6 @@ class UnsqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
                           " within [1, 6] dimensions (Eigen limit).");
          }
        });
-    AddAttr<bool>(
-        "inplace",
-        "(default: false) Unsqueeze the source tensor's shape without "
-        "memory copy. When Attr(inplace) is set true, the output "
-        "tensor shares memory with Input(X), otherwise, a new output "
-        "tensor is created, and its data are copied from Input(x).")
-        .SetDefault(false);
    AddComment(R"DOC(
    Unsqueeze Operator.
    
@@ -168,7 +160,6 @@ class UnsqueezeGradOp : public framework::OperatorBase {

    framework::AttributeMap attrs;
    attrs["shape"] = framework::vectorize2int(x_dims);
-    attrs["inplace"] = Attr<bool>("inplace");

    auto reshape_op = framework::OpRegistry::CreateOp(
        "reshape", {{"X", {dout_name}}, {"Shape", {}}}, {{"Out", {dx_name}}},

--- a/paddle/fluid/platform/dynload/CMakeLists.txt
+++ b/paddle/fluid/platform/dynload/CMakeLists.txt
@@ -3,7 +3,7 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
 list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc)

 # There is no macOS version of NCCL.
-if (NOT APPLE)
+if (NOT APPLE AND NOT WIN32)
  list(APPEND CUDA_SRCS nccl.cc)
 endif()


--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -44,7 +44,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/dynload/curand.h"
-#ifndef __APPLE__
+#if !defined(__APPLE__) && !defined(_WIN32)  // NCCL is skipped on macOS/Windows
 #include "paddle/fluid/platform/dynload/nccl.h"
 #endif  // __APPLE__
 #endif  // PADDLE_WITH_CUDA
@@ -205,7 +205,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
 #endif
 }

-#ifndef __APPLE__
+#if !defined(__APPLE__) && !defined(_WIN32)  // NCCL is skipped on macOS/Windows
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
    ncclResult_t stat, const Args&... args) {
@@ -221,7 +221,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
 #endif
  }
 }
-#endif  // __APPLE__
+#endif  // __APPLE__ and windows
 #endif  // PADDLE_WITH_CUDA

 template <typename T>

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -596,8 +596,8 @@ All parameter, weight, gradient are variables in Paddle.

  // -- python binds for parallel executor.
  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
-  py::class_<ExecutionStrategy>(pe, "ExecutionStrategy")
-      .def(py::init())
+  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
+  exec_strategy.def(py::init())
      .def_property(
          "num_threads",
          [](const ExecutionStrategy &self) { return self.num_threads_; },
@@ -624,6 +624,16 @@ All parameter, weight, gradient are variables in Paddle.
          [](ExecutionStrategy &self, size_t num_iteration_per_drop_scope) {
            self.num_iteration_per_drop_scope_ = num_iteration_per_drop_scope;
          });
+  exec_strategy.def_property(
+      "use_experimental_executor",
+      [](const ExecutionStrategy &self) {
+        return self.type_ == ExecutionStrategy::kExperimental;
+      },
+      [](ExecutionStrategy &self, bool experimental) {
+        self.type_ = experimental ? ExecutionStrategy::kExperimental
+                                  : ExecutionStrategy::kDefault;
+      });
+
  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy");

  py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -1272,8 +1272,8 @@ class ConditionalBlock(object):
        parent_block.append_op(
            type='conditional_block',
            inputs={
-                'X': self.inputs,
-                'Params': param_list,
+                'Cond': self.inputs,
+                'Input': param_list,
            },
            outputs={'Out': out_list,
                     'Scope': [step_scope]},

--- a/python/paddle/fluid/tests/test_if_else_op.py
+++ b/python/paddle/fluid/tests/test_if_else_op.py
@@ -30,7 +30,8 @@ import numpy as np


 class TestMNISTIfElseOp(unittest.TestCase):
-    def test_raw_api(self):
+    # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379
+    def not_test_raw_api(self):
        prog = Program()
        startup_prog = Program()
        with program_guard(prog, startup_prog):
@@ -91,7 +92,8 @@ class TestMNISTIfElseOp(unittest.TestCase):
                    return
        self.assertFalse(True)

-    def test_ifelse(self):
+    # FIXME: https://github.com/PaddlePaddle/Paddle/issues/12245#issuecomment-406462379
+    def not_test_ifelse(self):
        prog = Program()
        startup_prog = Program()
        with program_guard(prog, startup_prog):
@@ -153,6 +155,13 @@ class TestIfElse(unittest.TestCase):
        self.cond_value = 0.5
        self.data = np.random.rand(25, 1).astype(np.float32)

+    def numpy_cal(self):
+        s1 = self.data[np.where(self.data < self.cond_value)]
+        res = np.sum(np.exp(s1))
+        s2 = self.data[np.where(self.data >= self.cond_value)]
+        res += np.sum(np.tanh(s2))
+        return res
+
    def compare_ifelse_op_and_numpy(self, place):
        self.set_test_case()

@@ -166,10 +175,12 @@ class TestIfElse(unittest.TestCase):
            ie = layers.IfElse(ifcond)
            with ie.true_block():
                true_target = ie.input(src)
+                true_target = fluid.layers.exp(true_target)
                ie.output(true_target)

            with ie.false_block():
                false_target = ie.input(src)
+                false_target = fluid.layers.tanh(false_target)
                ie.output(false_target)
            if_out = ie()
            out = layers.reduce_sum(if_out)
@@ -180,7 +191,8 @@ class TestIfElse(unittest.TestCase):
            o1, = exe.run(fluid.default_main_program(),
                          feed={'data': self.data},
                          fetch_list=[out])
-            o2 = np.sum(self.data)
+            o2 = self.numpy_cal()
+
            self.assertTrue(
                np.allclose(
                    o1, o2, atol=1e-8),

--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -38,7 +38,8 @@ class TestParallelExecutorBase(unittest.TestCase):
                                  seed=None,
                                  use_parallel_executor=True,
                                  use_reduce=False,
-                                  optimizer=fluid.optimizer.Adam):
+                                  optimizer=fluid.optimizer.Adam,
+                                  use_fast_executor=False):
        def run_executor(exe, feed, fetch_list, program=None):
            if isinstance(exe, fluid.ParallelExecutor):
                res = exe.run(fetch_list=fetch_list, feed=feed)
@@ -71,6 +72,8 @@ class TestParallelExecutorBase(unittest.TestCase):
            startup_exe.run(startup)
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.allow_op_delay = allow_op_delay
+            if use_fast_executor:
+                exec_strategy.use_experimental_executor = True

            build_strategy = fluid.BuildStrategy()
            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \

--- a/python/paddle/fluid/tests/unittests/test_fc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fc_op.py
@@ -64,27 +64,47 @@ class TestFCOp(OpTest):
        self.check_output()


-class TestFCOpBiasBoth(TestFCOp):
+class TestFCOpNoBias(TestFCOp):
    def init_shapes(self, mb, ic, oc, h, w):
-        for with_bias in {True, False}:
-            self.with_bias = with_bias
-            self.matrix = MatrixGenerate(mb, ic, oc, h, w)
+        self.with_bias = False
+        self.matrix = MatrixGenerate(mb, ic, oc, h, w)


-class TestFCOp1(TestFCOpBiasBoth):
+class TestFCOpWithBias(TestFCOp):
+    def init_shapes(self, mb, ic, oc, h, w):
+        self.with_bias = True
+        self.matrix = MatrixGenerate(mb, ic, oc, h, w)
+
+
+class TestFCOp1(TestFCOpNoBias):
    def init_op_type(self):
        self.init_shapes(2, 8, 10, 1, 1)


-class TestFCOp2(TestFCOpBiasBoth):
+class TestFCOp2(TestFCOpNoBias):
    def init_op_type(self):
        self.init_shapes(4, 5, 6, 2, 2)


-class TestFCOp4(TestFCOpBiasBoth):
+class TestFCOp4(TestFCOpNoBias):
    def init_op_type(self):
        self.init_shapes(1, 32, 64, 3, 3)


+class TestFCOpWithBias1(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(3, 8, 10, 2, 1)
+
+
+class TestFCOpWithBias2(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(4, 5, 6, 2, 2)
+
+
+class TestFCOpWithBias3(TestFCOpWithBias):
+    def init_op_type(self):
+        self.init_shapes(1, 64, 32, 3, 3)
+
+
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fusion_lstm_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_lstm_op import lstm, ACTIVATION
+
+
+def fc(x, w, b):
+    """NumPy reference for a fully-connected layer: ``dot(x, w) + b``."""
+    projected = np.dot(x, w)
+    return projected + b
+
+
+def fusion_lstm(
+        x,  # T x M
+        lod,  # 1 x N
+        wx=None,  # M x 4D
+        bx=None,  # 1 x 4D
+        h0=None,  # N x D
+        c0=None,  # N x D
+        w_h=None,  # D x 4D
+        w_b=None,  # 1 x 4D
+        w_c=None,  # 1 x 3D
+        is_reverse=False,
+        act_gate=None,
+        act_cell=None,
+        act_cand=None):
+    """NumPy reference for the fused op.
+
+    Applies ``fc(x, wx, bx)`` and hands the result, together with every
+    remaining argument unchanged, to ``lstm`` imported from ``test_lstm_op``.
+    Returns whatever ``lstm`` returns.
+    """
+    # The FC projection replaces the raw input of the plain LSTM reference.
+    projected_x = fc(x, wx, bx)
+    return lstm(projected_x, lod, h0, c0, w_h, w_b, w_c, is_reverse, act_gate,
+                act_cell, act_cand)
+
+
+class TestLstmOp(OpTest):
+    """Output check for the ``fusion_lstm`` op against a NumPy reference.
+
+    The expected ``Hidden``/``Cell`` values come from ``fusion_lstm`` (``fc``
+    followed by ``lstm`` imported from ``test_lstm_op``). Subclasses change the
+    configuration by overriding ``set_argument``.
+    """
+
+    def set_argument(self):
+        """Configuration hook; ``setUp`` calls it after assigning defaults."""
+        self.lod = [[2, 3, 2]]
+
+    def setUp(self):
+        """Build random inputs, the NumPy reference outputs and the op attrs."""
+        self.op_type = 'fusion_lstm'
+        # Defaults; set_argument() below may override any of them.
+        self.lod = [[2, 3, 2]]
+        self.M = 8
+        self.D = 16
+        self.has_initial_state = False
+        self.is_reverse = False
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+        self.use_peepholes = False
+        self.set_argument()
+
+        # T is the row count of x; bs is the row count of h0 / c0.
+        T = sum(self.lod[0])
+        bs = len(self.lod[0])
+
+        x = np.random.normal(size=(T, self.M)).astype('float64')
+        if self.has_initial_state:
+            h0 = np.random.normal(size=(bs, self.D)).astype('float64')
+            c0 = np.random.normal(size=(bs, self.D)).astype('float64')
+        else:
+            # Zeros are used for the reference only; H0/C0 are not added to
+            # self.inputs in this case.
+            h0 = np.zeros((bs, self.D)).astype('float64')
+            c0 = np.zeros((bs, self.D)).astype('float64')
+
+        wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float64')
+
+        # The bias has 4 * D columns, or 7 * D when peepholes are enabled.
+        if self.use_peepholes:
+            b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
+        else:
+            b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
+        # Copied before bx is added into b below, so the reference lstm()
+        # receives the first 4 * D columns without the fc bias.
+        w_b = np.copy(b[:, 0:4 * self.D])
+        # Columns past 4 * D (a view of b) are passed to the reference as w_c.
+        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
+
+        # this is the weight of fc
+        wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float64')
+        # this is the bias of fc
+        # and it should be manually added into the bias of this fusion LSTM
+        bx = np.random.normal(size=(1, 4 * self.D)).astype('float64')
+        b[0, 0:4 * self.D] += bx[0, :]
+        # Reference outputs: fc(x, wx, bx) followed by lstm() with w_b / w_c.
+        h, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c,
+                           self.is_reverse, ACTIVATION[self.act_gate],
+                           ACTIVATION[self.act_cell], ACTIVATION[self.act_cand])
+
+        # The op is fed the combined bias b (with bx already added in).
+        self.inputs = {
+            'X': (x, self.lod),
+            'WeightX': wx,
+            'WeightH': wh,
+            'Bias': b
+        }
+
+        if self.has_initial_state:
+            self.inputs['H0'] = h0
+            self.inputs['C0'] = c0
+
+        self.outputs = {
+            'Hidden': (h, self.lod),
+            'Cell': (c, self.lod),
+        }
+        self.attrs = {
+            'use_peepholes': self.use_peepholes,
+            'is_reverse': self.is_reverse,
+            'gate_activation': self.act_gate,
+            'cell_activation': self.act_cell,
+            'candidate_activation': self.act_cand
+        }
+
+    def test_check_output(self):
+        """Call check_output (presumably inherited from OpTest), atol=1e-8."""
+        self.check_output(atol=1e-8)
+
+
+class TestLstmOpInitReverse(TestLstmOp):
+    """Variant with ``has_initial_state`` and ``is_reverse`` both enabled."""
+
+    def set_argument(self):
+        self.has_initial_state = True
+        self.is_reverse = True
+
+
+class TestLstmOpMD1(TestLstmOp):
+    """Variant with M = 36 and D = 8 (M larger than D)."""
+
+    def set_argument(self):
+        self.M = 36
+        self.D = 8
+
+
+class TestLstmOpMD2(TestLstmOp):
+    """Variant with M = 8 and D = 8 (M equal to D)."""
+
+    def set_argument(self):
+        self.M = 8
+        self.D = 8
+
+
+class TestLstmOpMD3(TestLstmOp):
+    """Variant with M = 15 and D = 3."""
+
+    def set_argument(self):
+        self.M = 15
+        self.D = 3
+
+
+class TestLstmOpBS1(TestLstmOp):
+    """Variant with ``lod = [[3]]``, i.e. ``len(lod[0]) == 1``; D stays 16."""
+
+    def set_argument(self):
+        self.lod = [[3]]
+        self.D = 16
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
@@ -183,7 +183,9 @@ class TestMNIST(TestParallelExecutorBase):
            use_parallel_executor=True)

        self.assertAlmostEquals(
-            np.mean(parallel_first_loss), single_first_loss, delta=1e-6)
+            np.mean(parallel_first_loss),
+            single_first_loss,
+            delta=1e-6, )
        self.assertAlmostEquals(
            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)

@@ -191,7 +193,7 @@ class TestMNIST(TestParallelExecutorBase):
        self.check_simple_fc_parallel_accuracy(True)
        self.check_simple_fc_parallel_accuracy(False)

-    def check_batchnorm_fc_convergence(self, use_cuda):
+    def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor):
        if use_cuda and not core.is_compiled_with_cuda():
            return

@@ -203,11 +205,13 @@ class TestMNIST(TestParallelExecutorBase):
            fc_with_batchnorm,
            feed_dict={"image": img,
                       "label": label},
-            use_cuda=use_cuda)
+            use_cuda=use_cuda,
+            use_fast_executor=use_fast_executor)

    def test_batchnorm_fc(self):
-        self.check_batchnorm_fc_convergence(True)
-        self.check_batchnorm_fc_convergence(False)
+        for use_cuda in (False, True):
+            for use_fast_executor in (False, True):
+                self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor)

    def test_batchnorm_fc_with_new_strategy(self):
        # FIXME(zcd): close this test temporally.

--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
@@ -41,7 +41,7 @@ class TestSqueezeOp(OpTest):
        self.new_shape = (3, 5)

    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": False}
+        self.attrs = {"axes": self.axes}


 # Correct: There is mins axis.
@@ -68,49 +68,5 @@ class TestSqueezeOp3(TestSqueezeOp):
        self.new_shape = (3, 5, 1, 4)


-# Correct: Inplace.
-class TestSqueezeOpInplace1(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = (0, 2)
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. There is mins axis.
-class TestSqueezeOpInplace2(TestSqueezeOp):
-    def inti_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = (0, -2)
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. No axes input.
-class TestSqueezeOpInplace3(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (1, 3, 1, 5)
-        self.axes = ()
-        self.new_shape = (3, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inpalce. Just part of axes be squeezed. 
-class TestSqueezeOpInplace4(TestSqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 1, 5, 1, 4, 1)
-        self.axes = (1, -1)
-        self.new_shape = (3, 5, 1, 4)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
@@ -41,7 +41,7 @@ class TestUnsqueezeOp(OpTest):
        self.new_shape = (3, 1, 1, 5)

    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": False}
+        self.attrs = {"axes": self.axes}


 # Correct: Single input index.
@@ -76,38 +76,5 @@ class TestUnsqueezeOp4(TestUnsqueezeOp):
        self.new_shape = (3, 1, 1, 2, 5, 1)


-# Correct: Inplace.
-class TestUnsqueezeOpInplace1(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (0, 2)
-        self.new_shape = (1, 3, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. There is mins index.
-class TestUnsqueezeOpInplace2(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 5)
-        self.axes = (0, -2)
-        self.new_shape = (1, 3, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
-# Correct: Inplace. There is duplicated axis.
-class TestUnsqueezeOpInplace3(TestUnsqueezeOp):
-    def init_test_case(self):
-        self.ori_shape = (3, 2, 5)
-        self.axes = (0, 3, 3)
-        self.new_shape = (1, 3, 2, 1, 1, 5)
-
-    def init_attrs(self):
-        self.attrs = {"axes": self.axes, "inplace": True}
-
-
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -285,11 +285,12 @@ class Trainer(object):
            self._load_checkpoint()

        if param_path and os.path.isdir(param_path):
-            # load params from param_path into scope
-            io.load_persistables(
-                executor=exe,
-                dirname=param_path,
-                main_program=self.startup_program)
+            with self._prog_and_scope_guard():
+                # load params from param_path into scope
+                io.load_persistables(
+                    executor=exe,
+                    dirname=param_path,
+                    main_program=self.startup_program)

    def _transpile_nccl2_dist(self):
        # PADDLE_TRAINER_IPS