Merge branch 'develop' into prv-calibration

883c7860 · Zhang, Guoming · 8e7a98b5 · 75bd0f18 · 883c7860 · 883c7860
222 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,6 +72,7 @@ option(WITH_INFERENCE    "Compile fluid inference library"              ON)
 option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
+option(WITH_FAST_MATH   "Make use of fast math library"                 OFF)
 # PY_VERSION
 if(NOT PY_VERSION)

--- a/Dockerfile
+++ b/Dockerfile
@@ -24,6 +24,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
    apt-get install -y --allow-downgrades patchelf \
+    python3 python3-dev python3-pip \
    git python-pip python-dev python-opencv openssh-server bison \
    libnccl2=2.1.2-1+cuda8.0 libnccl-dev=2.1.2-1+cuda8.0 \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
@@ -70,24 +71,33 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U
 # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest
 # version(1.7.1 for now), which causes building documentation failed.
-RUN easy_install -U pip && \
+RUN pip3 install -U wheel && \
+    pip3 install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \
+    easy_install -U pip && \
    pip install -U wheel && \
    pip install -U docopt PyYAML sphinx==1.5.6 && \
    pip install sphinx-rtd-theme==0.1.9 recommonmark
-RUN pip install pre-commit 'ipython==5.3.0' && \
+RUN pip3 install pre-commit 'ipython==5.3.0' && \
+    pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3 install opencv-python && \
+    pip install pre-commit 'ipython==5.3.0' && \
    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
    pip install opencv-python
 #For docstring checker
+RUN pip3 install pylint pytest astroid isort
 RUN pip install pylint pytest astroid isort LinkChecker
 COPY ./python/requirements.txt /root/
+RUN pip3 install -r /root/requirements.txt
 RUN pip install -r /root/requirements.txt
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
 RUN apt-get install -y libssl-dev libffi-dev
+RUN pip3 install certifi urllib3[secure]
 RUN pip install certifi urllib3[secure]

--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -40,7 +40,7 @@ set(OPENBLAS_LIB_SEARCH_PATHS
        /usr/local/opt/openblas/lib)
 find_path(OPENBLAS_INC_DIR NAMES cblas.h
-  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS} NO_DEFAULT_PATH)
 find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
 find_library(OPENBLAS_LIB NAMES openblas

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -62,8 +62,26 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif()
 if(WIN32)
-  # windows stupid compile option for all targets.
+  # windows header option for all targets.
  add_definitions(-D_XKEYCHECK_H)
+  # Use symbols instead of absolute path, reduce the cmake link command length. 
+  SET(CMAKE_C_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
+  SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_LIBRARIES 1)
+  SET(CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
+  SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_OBJECTS 1)
+  SET(CMAKE_C_USE_RESPONSE_FILE_FOR_INCLUDES 1)
+  SET(CMAKE_CXX_USE_RESPONSE_FILE_FOR_INCLUDES 1)
+  SET(CMAKE_C_RESPONSE_FILE_LINK_FLAG "@")
+  SET(CMAKE_CXX_RESPONSE_FILE_LINK_FLAG "@")
+  # Specify the program to use when building static libraries
+  SET(CMAKE_C_CREATE_STATIC_LIBRARY "<CMAKE_AR> lib <TARGET> <LINK_FLAGS> <OBJECTS>")
+  SET(CMAKE_CXX_CREATE_STATIC_LIBRARY "<CMAKE_AR> lib <TARGET> <LINK_FLAGS> <OBJECTS>")
+  # set defination for the dll export
+  if (NOT MSVC)
+    message(FATAL "Windows build only support msvc. Which was binded by the nvcc compiler of NVIDIA.")
+  endif(NOT MSVC)
 endif(WIN32)
 if(NOT WITH_GOLANG)

--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -175,7 +175,10 @@ list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
 list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
 endif(NOT WIN32)
-list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+if(WITH_FAST_MATH)
+  # Make use of fast math library. https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html
+  list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+endif()
 # in cuda9, suppress cuda warning on eigen 
 list(APPEND CUDA_NVCC_FLAGS "-w")
 # Set :expt-relaxed-constexpr to suppress Eigen warnings

--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -52,6 +52,7 @@ ExternalProject_Add(
    PREFIX              ${ANAKIN_SOURCE_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          ${CMAKE_ARGS_PREFIX}
+                        -DUSE_LOGGER=YES
                        -DUSE_X86_PLACE=YES
                        -DBUILD_WITH_UNIT_TEST=NO
                        -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf

--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -3,6 +3,14 @@ INCLUDE(ExternalProject)
 SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
 SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
+if(NOT WITH_FAST_MATH)
+  # EIGEN_FAST_MATH: https://eigen.tuxfamily.org/dox/TopicPreprocessorDirectives.html
+  # enables some optimizations which might affect the accuracy of the result. 
+  # This currently enables the SSE vectorization of sin() and cos(), 
+  # and speedups sqrt() for single precision.
+  # Defined to 1 by default. Define it to 0 to disable.
+  add_definitions(-DEIGEN_FAST_MATH=0)
+endif()
 if(WITH_AMD_GPU)
    ExternalProject_Add(

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -27,7 +27,7 @@ IF(NOT ${CBLAS_FOUND})
    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
    SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
-    SET(CBLAS_INCLUDE_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
    SET(CBLAS_LIBRARIES
        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
@@ -96,7 +96,7 @@ IF(NOT ${CBLAS_FOUND})
    ENDIF(NOT WIN32)
    SET(CBLAS_PROVIDER openblas)
    IF(WITH_C_API)
-        INSTALL(DIRECTORY ${CBLAS_INCLUDE_DIR} DESTINATION third_party/openblas)
+        INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
        # Because libopenblas.a is a symbolic link of another library, thus need to
        # install the whole directory.
        IF(ANDROID)
@@ -117,8 +117,8 @@ IF(NOT ${CBLAS_FOUND})
 ENDIF(NOT ${CBLAS_FOUND})
 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
-MESSAGE(STATUS "BLAS Include: ${CBLAS_INCLUDE_DIR}")
+MESSAGE(STATUS "BLAS Include: ${CBLAS_INC_DIR}")
-INCLUDE_DIRECTORIES(${CBLAS_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 # FIXME(gangliao): generate cblas target to track all high performance
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -27,7 +27,6 @@ endfunction()
 CheckCompilerCXX11Flag()
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 # safe_set_flag
 #
 # Set a compile flag only if compiler is support
@@ -71,6 +70,20 @@ macro(safe_set_nvflag flag_name)
    endif()
 endmacro()
+macro(safe_set_static_flag) # set c_flags and cxx_flags to static or shared
+    if (BUILD_SHARED_LIBS) 
+        return() # if build shared libs, the flags keep same with '/MD'
+    endif(BUILD_SHARED_LIBS)
+    foreach(flag_var
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "/MD")
+        string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+      endif(${flag_var} MATCHES "/MD")
+    endforeach(flag_var)
+endmacro()
 CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
 if(NOT UINT64_MAX_EXISTS)
@@ -97,9 +110,13 @@ SET(CMAKE_EXTRA_INCLUDE_FILES "")
 # Common flags. the compiler flag used for C/C++ sources whenever release or debug
 # Do not care if this flag is support for gcc.
+# https://github.com/PaddlePaddle/Paddle/issues/12773
+if (NOT WIN32)
 set(COMMON_FLAGS
    -fPIC
    -fno-omit-frame-pointer
+    -Werror
    -Wall
    -Wextra
    -Wnon-virtual-dtor
@@ -114,11 +131,6 @@ set(COMMON_FLAGS
    -Wno-error=terminate  # Warning in PADDLE_ENFORCE
 )
-# https://github.com/PaddlePaddle/Paddle/issues/12773
-if (NOT WIN32)
-list(APPEND COMMON_FLAGS -Werror)
-endif()
 set(GPU_COMMON_FLAGS
    -fPIC
    -fno-omit-frame-pointer
@@ -133,30 +145,55 @@ set(GPU_COMMON_FLAGS
    -Wno-error=array-bounds # Warnings in Eigen::array
 )
+else(NOT WIN32)
+set(COMMON_FLAGS
+    "/w") #disable all warnings.
+set(GPU_COMMON_FLAGS
+    "/w") #disable all warnings
+endif(NOT WIN32)
 if (APPLE)
    if(NOT CMAKE_CROSSCOMPILING)
        # On Mac OS X build fat binaries with x86_64 architectures by default.
        set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
    endif()
-else()
+    # On Mac OS X register class specifier is deprecated and will cause warning error on latest clang 10.0
+    set (COMMON_FLAGS -Wno-deprecated-register)
+endif(APPLE)
+if(LINUX)
    set(GPU_COMMON_FLAGS
        -Wall
        -Wextra
        -Werror
        ${GPU_COMMON_FLAGS})
-endif()
+endif(LINUX)
 if(UNIX AND NOT APPLE)
  # except apple from nix*Os family
  set(LINUX TRUE)
 endif(UNIX AND NOT APPLE)
 foreach(flag ${COMMON_FLAGS})
    safe_set_cflag(CMAKE_C_FLAGS ${flag})
    safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
 endforeach()
 foreach(flag ${GPU_COMMON_FLAGS})
    safe_set_nvflag(${flag})
 endforeach()
+if(WIN32)
+# windows build turn off warnings.
+safe_set_static_flag()
+    foreach(flag_var
+        CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO
+        CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE
+        CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO)
+      if(${flag_var} MATCHES "/W3")
+        string(REGEX REPLACE "/W3" "/w" ${flag_var} "${${flag_var}}")
+      endif(${flag_var} MATCHES "/W3")
+    endforeach(flag_var)
+endif(WIN32)
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
 add_custom_target(paddle_apis ALL
-                  DEPENDS paddle_v2_apis paddle_fluid_apis)
+                  DEPENDS paddle_v2_apis)
 add_custom_target(paddle_docs ALL
                  DEPENDS paddle_v2_docs paddle_v2_docs_cn
-                  paddle_fluid_docs paddle_fluid_docs_cn
                  paddle_mobile_docs paddle_mobile_docs_cn)
 add_subdirectory(v2)
-add_subdirectory(fluid)
 add_subdirectory(mobile)
--- a/paddle/contrib/float16/float16_transpiler.py
+++ b/paddle/contrib/float16/float16_transpiler.py
@@ -102,7 +102,7 @@ class Float16Transpiler:
                continue
            for input_arg in current_op.input_arg_names:
                if input_arg in self.input_map:
-                    current_op.rename_input(input_arg,
+                    current_op._rename_input(input_arg,
                                             self.input_map[input_arg])
    def _remove_unused_var(self):
@@ -187,7 +187,7 @@ class Float16Transpiler:
                    shape=var.shape,
                    persistable=var.persistable)
                find_op(var)
-                var.op.rename_output(var_name, tmp_var_name)
+                var.op._rename_output(var_name, tmp_var_name)
                self.block._insert_op(
                    i,
                    type="cast",

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -13,3 +13,5 @@ if(WITH_INFERENCE)
  # NOTE: please add subdirectory inference at last.
  add_subdirectory(inference)
 endif()
+add_subdirectory(train)
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
 # windows treat symbolic file as a real file, which is different with unix
 # We create a hidden file and compile it instead of origin source file.
 function(windows_symbolic TARGET)
@@ -9,10 +10,22 @@ function(windows_symbolic TARGET)
  if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu)
      message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.")
  endif()
+  # only copy the xx.cu to .xx.cu when the content are modified
+  set(copy_flag 1)
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu)
+  file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR)
+  file(READ ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu TARGET_STR)
+  if (SOURCE_STR STREQUAL TARGET_STR)
+    set(copy_flag 0)
+  endif()
+  endif()
+  if (copy_flag)
  add_custom_command(OUTPUT .${src}.cu
          COMMAND ${CMAKE_COMMAND} -E remove ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu
          COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc" "${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu"
          COMMENT "create hidden file of ${src}.cu")
+  endif(copy_flag)
  add_custom_target(${TARGET} ALL DEPENDS .${src}.cu)
  endforeach()
 endfunction()
@@ -56,9 +69,9 @@ else()
  cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
 if (NOT WIN32)
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
+  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
 else()
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
+  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
 endif (NOT WIN32)
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
@@ -81,6 +94,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu
 if(WITH_GPU)
  if (WIN32)
+    # windows treat symbolic file as a real file, which is different with unix
+    # We create a hidden file and compile it instead of origin source file.
      windows_symbolic(hidden_file SRCS data_type_transform.cu)
      nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor)
      add_dependencies(data_type_transform hidden_file)
@@ -141,20 +156,22 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
+cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
 if(WITH_DISTRIBUTE)
  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass)
+  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
 if (NOT WIN32)
-  cc_library(parallel_executor SRCS parallel_executor.cc DEPS
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS
        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
-          graph graph_viz_pass multi_devices_graph_pass
+        graph build_strategy
-          multi_devices_graph_print_pass multi_devices_graph_check_pass
+        fast_threaded_ssa_graph_executor)
-          fast_threaded_ssa_graph_executor fuse_elewise_add_act_pass)
 endif() # NOT WIN32
 cc_library(prune SRCS prune.cc DEPS framework_proto)
@@ -167,15 +184,8 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
-# cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 if (NOT WIN32)
 cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif (NOT WIN32)
-# disable test temporarily.
-# TODO https://github.com/PaddlePaddle/Paddle/issues/11971
-# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
-#         channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
-#         conditional_block_op while_op assign_op print_op executor proto_desc)
--- a/paddle/fluid/framework/channel.h
+++ b/paddle/fluid/framework/channel.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <stddef.h>            // for size_t
-#include <condition_variable>  // NOLINT
-#include <typeindex>
-#include "paddle/fluid/platform/enforce.h"
-namespace paddle {
-namespace framework {
-enum class ChannelAction {
-  SEND = 0,
-  RECEIVE = 1,
-  CLOSE = 2,
-};
-// Channel is the abstract class of buffered and un-buffered channels.
-template <typename T>
-class Channel {
- public:
-  virtual bool CanSend() = 0;
-  virtual bool CanReceive() = 0;
-  virtual void Send(T*) = 0;
-  virtual bool Receive(T*) = 0;
-  virtual size_t Cap() = 0;
-  virtual void Lock() = 0;
-  virtual void Unlock() = 0;
-  virtual bool IsClosed() = 0;
-  virtual void Close() = 0;
-  virtual ~Channel() {}
-  virtual void AddToSendQ(const void* referrer, T* data,
-                          std::shared_ptr<std::condition_variable_any> cond,
-                          std::function<bool(ChannelAction)> cb) = 0;
-  virtual void AddToReceiveQ(const void* referrer, T* data,
-                             std::shared_ptr<std::condition_variable_any> cond,
-                             std::function<bool(ChannelAction)> cb) = 0;
-  virtual void RemoveFromSendQ(const void* referrer) = 0;
-  virtual void RemoveFromReceiveQ(const void* referrer) = 0;
-};
-// Forward declaration of channel implementations.
-template <typename T>
-class ChannelImpl;
-template <typename T>
-Channel<T>* MakeChannel(size_t buffer_size) {
-  return new ChannelImpl<T>(buffer_size);
-}
-template <typename T>
-void CloseChannel(Channel<T>* ch) {
-  ch->Close();
-}
-/*
- * The ChannelHolder class serves two main purposes:
- * 1. It acts as a unified wrapper for the different kinds of
- *    channels, i.e. Buffered and Unbuffered channels. This is
- *    similar to the ReaderHolder class.
- * 2. It also helps us in TypeHiding. This is similar to the
- *    PlaceHolder implementations in variable.h and tensor.h.
- */
-class ChannelHolder {
- public:
-  template <typename T>
-  void Reset(size_t buffer_size) {
-    holder_.reset(new PlaceholderImpl<T>(buffer_size));
-  }
-  template <typename T>
-  void Send(T* data) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    PADDLE_ENFORCE_EQ(
-        holder_->Type(), std::type_index(typeid(T)),
-        "Channel type is not same as the type of the data being sent");
-    // Static cast should be safe because we have ensured that types are same
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
-    channel->Send(data);
-  }
-  template <typename T>
-  bool Receive(T* data) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    PADDLE_ENFORCE_EQ(
-        holder_->Type(), std::type_index(typeid(T)),
-        "Channel type is not same as the type of the data being sent");
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
-    return channel->Receive(data);
-  }
-  bool IsClosed() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->IsClosed();
-  }
-  bool CanSend() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->CanSend();
-  }
-  bool CanReceive() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->CanReceive();
-  }
-  void close() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->Close();
-  }
-  size_t Cap() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->Cap();
-  }
-  void Lock() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->Lock();
-  }
-  void Unlock() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->Unlock();
-  }
-  template <typename T>
-  void AddToSendQ(const void* referrer, T* data,
-                  std::shared_ptr<std::condition_variable_any> cond,
-                  std::function<bool(ChannelAction)> cb) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    if (channel != nullptr) {
-      channel->AddToSendQ(referrer, data, cond, cb);
-    }
-  }
-  template <typename T>
-  void AddToReceiveQ(const void* referrer, T* data,
-                     std::shared_ptr<std::condition_variable_any> cond,
-                     std::function<bool(ChannelAction)> cb) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    if (channel != nullptr) {
-      channel->AddToReceiveQ(referrer, data, cond, cb);
-    }
-  }
-  void RemoveFromSendQ(const void* referrer) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->RemoveFromSendQ(referrer);
-  }
-  void RemoveFromReceiveQ(const void* referrer) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->RemoveFromReceiveQ(referrer);
-  }
-  inline bool IsInitialized() const { return holder_ != nullptr; }
-  inline const std::type_index Type() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->Type();
-  }
- private:
-  /**
-   * @note    Placeholder hides type T, so it doesn't appear as a template
-   *          parameter of ChannelHolder.
-   */
-  struct Placeholder {
-    virtual ~Placeholder() {}
-    virtual const std::type_index Type() const = 0;
-    virtual void* Ptr() const = 0;
-    virtual bool IsClosed() = 0;
-    virtual bool CanSend() = 0;
-    virtual bool CanReceive() = 0;
-    virtual void RemoveFromSendQ(const void* referrer) = 0;
-    virtual void RemoveFromReceiveQ(const void* referrer) = 0;
-    virtual void Close() = 0;
-    virtual void Lock() = 0;
-    virtual void Unlock() = 0;
-    virtual size_t Cap() = 0;
-  };
-  template <typename T>
-  struct PlaceholderImpl : public Placeholder {
-    explicit PlaceholderImpl(size_t buffer_size)
-        : type_(std::type_index(typeid(T))) {
-      channel_.reset(MakeChannel<T>(buffer_size));
-    }
-    virtual const std::type_index Type() const { return type_; }
-    virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
-    virtual bool IsClosed() {
-      if (channel_) {
-        return channel_->IsClosed();
-      }
-      return false;
-    }
-    virtual bool CanSend() {
-      if (channel_) {
-        return channel_->CanSend();
-      }
-      return false;
-    }
-    virtual bool CanReceive() {
-      if (channel_) {
-        return channel_->CanReceive();
-      }
-      return false;
-    }
-    virtual void RemoveFromSendQ(const void* referrer) {
-      if (channel_) {
-        channel_->RemoveFromSendQ(referrer);
-      }
-    }
-    virtual void RemoveFromReceiveQ(const void* referrer) {
-      if (channel_) {
-        channel_->RemoveFromReceiveQ(referrer);
-      }
-    }
-    virtual void Close() {
-      if (channel_) channel_->Close();
-    }
-    virtual size_t Cap() {
-      if (channel_)
-        return channel_->Cap();
-      else
-        return -1;
-    }
-    virtual void Lock() {
-      if (channel_) channel_->Lock();
-    }
-    virtual void Unlock() {
-      if (channel_) channel_->Unlock();
-    }
-    std::unique_ptr<Channel<T>> channel_;
-    const std::type_index type_;
-  };
-  // Pointer to a PlaceholderImpl object
-  std::unique_ptr<Placeholder> holder_;
-};
-}  // namespace framework
-}  // namespace paddle
-#include "paddle/fluid/framework/channel_impl.h"
--- a/paddle/fluid/framework/channel_impl.h
+++ b/paddle/fluid/framework/channel_impl.h
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include <stddef.h>  // for size_t
-#include <atomic>
-#include <condition_variable>  // NOLINT
-#include <deque>
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/platform/enforce.h"
-namespace paddle {
-namespace framework {
-template <typename T>
-class ChannelImpl : public paddle::framework::Channel<T> {
-  friend Channel<T> *paddle::framework::MakeChannel<T>(size_t);
-  friend void paddle::framework::CloseChannel<T>(Channel<T> *);
- public:
-  virtual bool CanSend();
-  virtual bool CanReceive();
-  virtual void Send(T *);
-  virtual bool Receive(T *);
-  virtual size_t Cap() { return cap_; }
-  virtual void Lock();
-  virtual void Unlock();
-  virtual bool IsClosed();
-  virtual void Close();
-  explicit ChannelImpl(size_t);
-  virtual ~ChannelImpl();
-  virtual void AddToSendQ(const void *referrer, T *data,
-                          std::shared_ptr<std::condition_variable_any> cond,
-                          std::function<bool(ChannelAction)> cb);
-  virtual void AddToReceiveQ(const void *referrer, T *data,
-                             std::shared_ptr<std::condition_variable_any> cond,
-                             std::function<bool(ChannelAction)> cb);
-  virtual void RemoveFromSendQ(const void *referrer);
-  virtual void RemoveFromReceiveQ(const void *referrer);
- private:
-  struct QueueMessage {
-    T *data;
-    std::shared_ptr<std::condition_variable_any> cond;
-    bool chan_closed = false;
-    bool completed = false;
-    const void *referrer;  // TODO(thuan): figure out better way to do this
-    std::function<bool(ChannelAction)> callback;
-    explicit QueueMessage(T *item)
-        : data(item), cond(std::make_shared<std::condition_variable_any>()) {}
-    QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
-        : data(item), cond(cond) {}
-    void Wait(std::unique_lock<std::recursive_mutex> &lock) {
-      cond->wait(lock, [this]() { return completed; });
-    }
-    void Notify() {
-      completed = true;
-      cond->notify_all();
-    }
-  };
-  void send_return() {
-    send_ctr--;
-    destructor_cond_.notify_all();
-  }
-  bool recv_return(bool value) {
-    recv_ctr--;
-    destructor_cond_.notify_all();
-    return value;
-  }
-  std::shared_ptr<QueueMessage> get_first_message(
-      std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
-    while (!queue->empty()) {
-      // Check whether this message was added by Select
-      // If this was added by Select then execute the callback
-      // to check if you can execute this message. The callback
-      // can return false if some other case was executed in Select.
-      // In that case just discard this QueueMessage and process next.
-      std::shared_ptr<QueueMessage> m = queue->front();
-      queue->pop_front();
-      if (m->callback == nullptr || m->callback(action)) return m;
-    }
-    return nullptr;
-  }
-  size_t cap_;
-  std::recursive_mutex mu_;
-  bool closed_;
-  std::deque<T> buf_;
-  std::deque<std::shared_ptr<QueueMessage>> recvq;
-  std::deque<std::shared_ptr<QueueMessage>> sendq;
-  std::atomic<unsigned> send_ctr{0};
-  std::atomic<unsigned> recv_ctr{0};
-  std::condition_variable_any destructor_cond_;
-};
-template <typename T>
-ChannelImpl<T>::ChannelImpl(size_t capacity)
-    : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) {
-  PADDLE_ENFORCE_GE(capacity, 0);
-}
-template <typename T>
-bool ChannelImpl<T>::CanSend() {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  return !closed_ && (!recvq.empty() || buf_.size() < cap_);
-}
-template <typename T>
-bool ChannelImpl<T>::CanReceive() {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0);
-}
-template <typename T>
-void ChannelImpl<T>::Send(T *item) {
-  send_ctr++;
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-  // If channel is closed, throw exception
-  if (closed_) {
-    send_return();
-    lock.unlock();
-    PADDLE_THROW("Cannot send on closed channel");
-  }
-  // If there is a receiver, directly pass the value we want
-  // to send to the receiver, bypassing the channel buffer if any
-  if (!recvq.empty()) {
-    std::shared_ptr<QueueMessage> m =
-        get_first_message(&recvq, ChannelAction::SEND);
-    if (m != nullptr) {
-      *(m->data) = std::move(*item);
-      m->Notify();
-      send_return();
-      return;
-    } else {
-      Send(item);
-      send_return();
-      return;
-    }
-  }
-  // Unbuffered channel will always bypass this
-  // If buffered channel has space in buffer,
-  // write the element to the buffer.
-  if (buf_.size() < cap_) {
-    // Copy to buffer
-    buf_.push_back(std::move(*item));
-    send_return();
-    return;
-  }
-  // Block on channel, because some receiver will complete
-  // the operation for us
-  auto m = std::make_shared<QueueMessage>(item);
-  sendq.push_back(m);
-  m->Wait(lock);
-  if (m->chan_closed) {
-    send_return();
-    lock.unlock();
-    PADDLE_THROW("Cannot send on closed channel");
-  }
-  send_return();
-}
-template <typename T>
-bool ChannelImpl<T>::Receive(T *item) {
-  recv_ctr++;
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-  // If channel is closed and buffer is empty or
-  // channel is unbuffered
-  if (closed_ && buf_.empty()) return recv_return(false);
-  // If there is a sender, directly receive the value we want
-  // from the sender. In case of a buffered channel, read from
-  // buffer and move front of send queue to the buffer
-  if (!sendq.empty()) {
-    std::shared_ptr<QueueMessage> m =
-        get_first_message(&sendq, ChannelAction::RECEIVE);
-    if (buf_.size() > 0) {
-      // Case 1 : Channel is Buffered
-      // Do Data transfer from front of buffer
-      // and add a QueueMessage to the buffer
-      *item = std::move(buf_.front());
-      buf_.pop_front();
-      // If first message from sendq is not null
-      // add it to the buffer and notify it
-      if (m != nullptr) {
-        // Copy to buffer
-        buf_.push_back(std::move(*(m->data)));
-        m->Notify();
-      }  // Ignore if there is no first message
-    } else {
-      // Case 2: Channel is Unbuffered
-      // Do data transfer from front of SendQ
-      // If front is nullptr, then recursively call itself
-      if (m != nullptr) {
-        *item = std::move(*(m->data));
-        m->Notify();
-      } else {
-        return recv_return(Receive(item));
-      }
-    }
-    return recv_return(true);
-  }
-  // If this is a buffered channel and there are items in buffer
-  if (buf_.size() > 0) {
-    // Directly read from buffer
-    *item = std::move(buf_.front());
-    buf_.pop_front();
-    // return true
-    return recv_return(true);
-  }
-  // No sender available, block on this channel
-  // Some receiver will complete the option for us
-  auto m = std::make_shared<QueueMessage>(item);
-  recvq.push_back(m);
-  m->Wait(lock);
-  return recv_return(!m->chan_closed);
-}
-template <typename T>
-void ChannelImpl<T>::Lock() {
-  mu_.lock();
-}
-template <typename T>
-void ChannelImpl<T>::Unlock() {
-  mu_.unlock();
-}
-template <typename T>
-bool ChannelImpl<T>::IsClosed() {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  return closed_;
-}
-template <typename T>
-void ChannelImpl<T>::Close() {
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-  if (closed_) {
-    // TODO(abhinavarora): closing an already closed channel should panic
-    lock.unlock();
-    return;
-  }
-  closed_ = true;
-  // Empty the readers
-  while (!recvq.empty()) {
-    std::shared_ptr<QueueMessage> m = recvq.front();
-    recvq.pop_front();
-    m->chan_closed = true;
-    // Execute callback function (if any)
-    if (m->callback != nullptr) {
-      m->callback(ChannelAction::CLOSE);
-    }
-    m->Notify();
-  }
-  // Empty the senders
-  while (!sendq.empty()) {
-    std::shared_ptr<QueueMessage> m = sendq.front();
-    sendq.pop_front();
-    m->chan_closed = true;
-    // Execute callback function (if any)
-    if (m->callback != nullptr) {
-      m->callback(ChannelAction::CLOSE);
-    }
-    m->Notify();
-  }
-}
-template <typename T>
-void ChannelImpl<T>::AddToSendQ(
-    const void *referrer, T *data,
-    std::shared_ptr<std::condition_variable_any> cond,
-    std::function<bool(ChannelAction)> cb) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  auto m = std::make_shared<QueueMessage>(data, cond);
-  m->referrer = referrer;
-  m->callback = cb;
-  sendq.push_back(m);
-}
-template <typename T>
-void ChannelImpl<T>::AddToReceiveQ(
-    const void *referrer, T *data,
-    std::shared_ptr<std::condition_variable_any> cond,
-    std::function<bool(ChannelAction)> cb) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  auto m = std::make_shared<QueueMessage>(data, cond);
-  m->referrer = referrer;
-  m->callback = cb;
-  recvq.push_back(m);
-}
-template <typename T>
-void ChannelImpl<T>::RemoveFromSendQ(const void *referrer) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  for (auto it = sendq.begin(); it != sendq.end();) {
-    std::shared_ptr<QueueMessage> sendMsg = (std::shared_ptr<QueueMessage>)*it;
-    if (sendMsg->referrer == referrer) {
-      it = sendq.erase(it);
-    } else {
-      ++it;
-    }
-  }
-}
-template <typename T>
-void ChannelImpl<T>::RemoveFromReceiveQ(const void *referrer) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  for (auto it = recvq.begin(); it != recvq.end();) {
-    std::shared_ptr<QueueMessage> recvMsg = (std::shared_ptr<QueueMessage>)*it;
-    if (recvMsg->referrer == referrer) {
-      it = recvq.erase(it);
-    } else {
-      ++it;
-    }
-  }
-}
-template <typename T>
-ChannelImpl<T>::~ChannelImpl() {
-  Close();
-  // The destructor must wait for all readers and writers to complete their task
-  // The channel has been closed, so we will not accept new readers and writers
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-  destructor_cond_.wait(lock,
-                        [this]() { return send_ctr == 0 && recv_ctr == 0; });
-}
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
--- a/paddle/fluid/framework/concurrency_test.cc
+++ b/paddle/fluid/framework/concurrency_test.cc
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <thread>  // NOLINT
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-USE_NO_KERNEL_OP(go);
-USE_NO_KERNEL_OP(channel_close);
-USE_NO_KERNEL_OP(channel_create);
-USE_NO_KERNEL_OP(channel_recv);
-USE_NO_KERNEL_OP(channel_send);
-USE_NO_KERNEL_OP(elementwise_add);
-USE_NO_KERNEL_OP(select);
-USE_NO_KERNEL_OP(conditional_block);
-USE_NO_KERNEL_OP(equal);
-USE_NO_KERNEL_OP(assign);
-USE_NO_KERNEL_OP(while);
-USE_NO_KERNEL_OP(print);
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-namespace paddle {
-namespace framework {
-template <typename T>
-LoDTensor *CreateVariable(Scope *scope, const p::CPUPlace &place,
-                          std::string name, T value) {
-  // Create LoDTensor<int> of dim [1]
-  auto var = scope->Var(name);
-  auto tensor = var->GetMutable<LoDTensor>();
-  tensor->Resize({1});
-  T *expect = tensor->mutable_data<T>(place);
-  expect[0] = value;
-  return tensor;
-}
-void AddOp(const std::string &type, const VariableNameMap &inputs,
-           const VariableNameMap &outputs, AttributeMap attrs,
-           BlockDesc *block) {
-  // insert op
-  auto op = block->AppendOp();
-  op->SetType(type);
-  for (auto &kv : inputs) {
-    op->SetInput(kv.first, kv.second);
-  }
-  for (auto &kv : outputs) {
-    op->SetOutput(kv.first, kv.second);
-  }
-  op->SetAttrMap(attrs);
-}
-void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place,
-             BlockDesc *casesBlock, int caseId, int caseType,
-             std::string caseChannel, std::string caseVarName,
-             std::function<void(BlockDesc *, Scope *)> func) {
-  std::string caseCondName = std::string("caseCond") + std::to_string(caseId);
-  std::string caseCondXVarName =
-      std::string("caseCondX") + std::to_string(caseId);
-  BlockDesc *caseBlock = program->AppendBlock(*casesBlock);
-  func(caseBlock, scope);
-  CreateVariable(scope, *place, caseCondName, false);
-  CreateVariable(scope, *place, caseCondXVarName, caseId);
-  CreateVariable(scope, *place, caseVarName, caseId);
-  scope->Var("step_scope");
-  AddOp("equal", {{"X", {caseCondXVarName}}, {"Y", {"caseToExecute"}}},
-        {{"Out", {caseCondName}}}, {}, casesBlock);
-  AddOp("conditional_block", {{"X", {caseCondName}}, {"Params", {}}},
-        {{"Out", {}}, {"Scope", {"step_scope"}}},
-        {{"sub_block", caseBlock}, {"is_scalar_condition", true}}, casesBlock);
-}
-void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
-                        BlockDesc *parentBlock, std::string dataChanName,
-                        std::string quitChanName) {
-  BlockDesc *whileBlock = program->AppendBlock(*parentBlock);
-  CreateVariable(scope, *place, "whileExitCond", true);
-  CreateVariable(scope, *place, "caseToExecute", -1);
-  CreateVariable(scope, *place, "case1var", 0);
-  CreateVariable(scope, *place, "xtemp", 0);
-  // TODO(thuan): Need to create fibXToSend, since channel send moves the actual
-  // data,
-  // which causes the data to be no longer accessible to do the fib calculation
-  // TODO(abhinav): Change channel send to do a copy instead of a move!
-  CreateVariable(scope, *place, "fibXToSend", 0);
-  CreateVariable(scope, *place, "fibX", 0);
-  CreateVariable(scope, *place, "fibY", 1);
-  CreateVariable(scope, *place, "quitVar", 0);
-  BlockDesc *casesBlock = program->AppendBlock(*whileBlock);
-  std::function<void(BlockDesc * caseBlock)> f = [](BlockDesc *caseBlock) {};
-  // TODO(thuan): Remove this once we change channel send to do a copy instead
-  // of move
-  AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"fibXToSend"}}}, {}, whileBlock);
-  // Case 0: Send to dataChanName
-  std::function<void(BlockDesc * caseBlock, Scope * scope)> case0Func = [&](
-      BlockDesc *caseBlock, Scope *scope) {
-    AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"xtemp"}}}, {}, caseBlock);
-    AddOp("assign", {{"X", {"fibY"}}}, {{"Out", {"fibX"}}}, {}, caseBlock);
-    AddOp("elementwise_add", {{"X", {"xtemp"}}, {"Y", {"fibY"}}},
-          {{"Out", {"fibY"}}}, {}, caseBlock);
-  };
-  AddCase(program, scope, place, casesBlock, 0, 1, dataChanName, "fibXToSend",
-          case0Func);
-  std::string case0Config =
-      std::string("0,1,") + dataChanName + std::string(",fibXToSend");
-  // Case 1: Receive from quitChanName
-  std::function<void(BlockDesc * caseBlock, Scope * scope)> case2Func = [&](
-      BlockDesc *caseBlock, Scope *scope) {
-    // Exit the while loop after we receive from quit channel.
-    // We assign a false to "whileExitCond" variable, which will
-    // break out of while_op loop
-    CreateVariable(scope, *place, "whileFalse", false);
-    AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {},
-          caseBlock);
-  };
-  AddCase(program, scope, place, casesBlock, 1, 2, quitChanName, "quitVar",
-          case2Func);
-  std::string case1Config =
-      std::string("1,2,") + quitChanName + std::string(",quitVar");
-  // Select block
-  AddOp("select", {{"X", {dataChanName, quitChanName}},
-                   {"case_to_execute", {"caseToExecute"}}},
-        {{"Out", {}}},
-        {{"sub_block", casesBlock},
-         {"cases", std::vector<std::string>{case0Config, case1Config}}},
-        whileBlock);
-  scope->Var("stepScopes");
-  AddOp("while",
-        {{"X", {dataChanName, quitChanName}}, {"Condition", {"whileExitCond"}}},
-        {{"Out", {}}, {"StepScopes", {"stepScopes"}}},
-        {{"sub_block", whileBlock}}, parentBlock);
-}
-TEST(Concurrency, Go_Op) {
-  Scope scope;
-  p::CPUPlace place;
-  // Initialize scope variables
-  p::CPUDeviceContext ctx(place);
-  // Create channel variable
-  scope.Var("Channel");
-  // Create Variables, x0 will be put into channel,
-  // result will be pulled from channel
-  CreateVariable(&scope, place, "Status", false);
-  CreateVariable(&scope, place, "x0", 99);
-  CreateVariable(&scope, place, "result", 0);
-  framework::Executor executor(place);
-  ProgramDesc program;
-  BlockDesc *block = program.MutableBlock(0);
-  // Create channel OP
-  AddOp("channel_create", {}, {{"Out", {"Channel"}}},
-        {{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}},
-        block);
-  // Create Go Op routine
-  BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
-  AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}},
-        {{"Status", {"Status"}}}, {}, goOpBlock);
-  // Create Go Op
-  AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}},
-        block);
-  // Create Channel Receive Op
-  AddOp("channel_recv", {{"Channel", {"Channel"}}},
-        {{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block);
-  // Create Channel Close Op
-  AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block);
-  // Check the result tensor to make sure it is set to 0
-  const LoDTensor &tensor = (scope.FindVar("result"))->Get<LoDTensor>();
-  auto *initialData = tensor.data<int>();
-  EXPECT_EQ(initialData[0], 0);
-  executor.Run(program, &scope, 0, true, true);
-  // After we call executor.run, the Go operator should do a channel_send to
-  // set the "result" variable to 99.
-  auto *finalData = tensor.data<int>();
-  EXPECT_EQ(finalData[0], 99);
-}
-/**
- * This test implements the fibonacci function using go_op and select_op
- */
-TEST(Concurrency, Select) {
-  Scope scope;
-  p::CPUPlace place;
-  // Initialize scope variables
-  p::CPUDeviceContext ctx(place);
-  CreateVariable(&scope, place, "Status", false);
-  CreateVariable(&scope, place, "result", 0);
-  CreateVariable(&scope, place, "currentXFib", 0);
-  framework::Executor executor(place);
-  ProgramDesc program;
-  BlockDesc *block = program.MutableBlock(0);
-  // Create channel OP
-  std::string dataChanName = "Channel";
-  scope.Var(dataChanName);
-  AddOp("channel_create", {}, {{"Out", {dataChanName}}},
-        {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
-  std::string quitChanName = "Quit";
-  scope.Var(quitChanName);
-  AddOp("channel_create", {}, {{"Out", {quitChanName}}},
-        {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
-  // Create Go Op routine, which loops 10 times over fibonacci sequence
-  CreateVariable(&scope, place, "xReceiveVar", 0);
-  BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
-  for (int i = 0; i < 10; ++i) {
-    AddOp("channel_recv", {{"Channel", {dataChanName}}},
-          {{"Status", {"Status"}}, {"Out", {"currentXFib"}}}, {}, goOpBlock);
-    AddOp("print", {{"In", {"currentXFib"}}}, {{"Out", {"currentXFib"}}},
-          {{"first_n", 100},
-           {"summarize", -1},
-           {"print_tensor_name", false},
-           {"print_tensor_type", true},
-           {"print_tensor_shape", false},
-           {"print_tensor_lod", false},
-           {"print_phase", std::string("FORWARD")},
-           {"message", std::string("X: ")}},
-          goOpBlock);
-  }
-  CreateVariable(&scope, place, "quitSignal", 0);
-  AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}},
-        {{"Status", {"Status"}}}, {}, goOpBlock);
-  // Create Go Op
-  AddOp("go", {{"X", {dataChanName, quitChanName}}}, {},
-        {{"sub_block", goOpBlock}}, block);
-  AddFibonacciSelect(&scope, &place, &program, block, dataChanName,
-                     quitChanName);
-  // Create Channel Close Op
-  AddOp("channel_close", {{"Channel", {dataChanName}}}, {}, {}, block);
-  AddOp("channel_close", {{"Channel", {quitChanName}}}, {}, {}, block);
-  executor.Run(program, &scope, 0, true, true);
-  // After we call executor.run, "result" variable should be equal to 34
-  // (which is 10 loops through fibonacci sequence)
-  const LoDTensor &tensor = (scope.FindVar("currentXFib"))->Get<LoDTensor>();
-  auto *finalData = tensor.data<int>();
-  EXPECT_EQ(finalData[0], 34);
-}
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/fluid/framework/details/CMakeLists.txt
+++ b/paddle/fluid/framework/details/CMakeLists.txt
@@ -54,3 +54,8 @@ cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_execu
 #        device_context reduce_op_handle )
 cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
        DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)
+cc_library(build_strategy SRCS build_strategy.cc DEPS
+        graph_viz_pass multi_devices_graph_pass
+        multi_devices_graph_print_pass multi_devices_graph_check_pass
+        fuse_elewise_add_act_pass)
--- a/paddle/fluid/framework/details/build_strategy.cc
+++ b/paddle/fluid/framework/details/build_strategy.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/details/build_strategy.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+namespace paddle {
+namespace framework {
+namespace details {
+class ParallelExecutorPassBuilder : public ir::PassBuilder {
+ public:
+  explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
+      : ir::PassBuilder(), strategy_(strategy) {
+    // Add a graph viz pass to record a graph.
+    if (!strategy_.debug_graphviz_path_.empty()) {
+      auto viz_pass = AppendPass("graph_viz_pass");
+      const std::string graph_path = string::Sprintf(
+          "%s%s", strategy_.debug_graphviz_path_.c_str(), "_original_graph");
+      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
+    }
+    // Add op fusion.
+    if (strategy.fuse_elewise_add_act_ops_) {
+      auto fuse_elewise_add_act_pass = AppendPass("fuse_elewise_add_act_pass");
+      // Add a graph viz pass to record a graph.
+      if (!strategy.debug_graphviz_path_.empty()) {
+        auto viz_pass = AppendPass("graph_viz_pass");
+        const std::string graph_path = string::Sprintf(
+            "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
+        viz_pass->Set<std::string>("graph_viz_path",
+                                   new std::string(graph_path));
+      }
+    }
+    // Convert graph to run on multi-devices.
+    auto multi_devices_pass = AppendPass("multi_devices_pass");
+    multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy",
+                                                         &strategy_);
+    // Add a graph print pass to record a graph with device info.
+    if (!strategy_.debug_graphviz_path_.empty()) {
+      auto multi_devices_print_pass = AppendPass("multi_devices_print_pass");
+      multi_devices_print_pass->SetNotOwned<const std::string>(
+          "debug_graphviz_path", &strategy_.debug_graphviz_path_);
+      multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
+          "graph_printer", new details::GraphvizSSAGraphPrinter);
+    }
+    // Verify that the graph is correct for multi-device executor.
+    AppendPass("multi_devices_check_pass");
+  }
+ private:
+  BuildStrategy strategy_;
+};
+std::shared_ptr<ir::PassBuilder> BuildStrategy::CreatePassesFromStrategy()
+    const {
+  pass_builder_.reset(new ParallelExecutorPassBuilder(*this));
+  return pass_builder_;
+}
+std::unique_ptr<ir::Graph> BuildStrategy::Apply(
+    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
+    const std::string &loss_var_name,
+    const std::unordered_set<std::string> &param_names,
+    const std::vector<Scope *> &local_scopes,
+#ifdef PADDLE_WITH_CUDA
+    const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const {
+#else
+    const bool use_cuda) const {
+#endif
+  // Create a default one if not initialized by user.
+  if (!pass_builder_) {
+    CreatePassesFromStrategy();
+  }
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
+  for (std::shared_ptr<ir::Pass> &pass : pass_builder_->AllPasses()) {
+    if (pass->Type() == "multi_devices_pass") {
+      pass->Erase("places");
+      pass->SetNotOwned<const std::vector<platform::Place>>("places", &places);
+      pass->Erase("loss_var_name");
+      pass->SetNotOwned<const std::string>("loss_var_name", &loss_var_name);
+      pass->Erase("params");
+      pass->SetNotOwned<const std::unordered_set<std::string>>("params",
+                                                               &param_names);
+      pass->Erase("local_scopes");
+      pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
+                                                    &local_scopes);
+#ifdef PADDLE_WITH_CUDA
+      platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
+      pass->Erase("nccl_ctxs");
+      pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
+#endif
+    }
+    graph = pass->Apply(std::move(graph));
+  }
+  return graph;
+}
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+USE_PASS(fuse_elewise_add_act_pass);
+USE_PASS(graph_viz_pass);
+USE_PASS(multi_devices_pass);
+USE_PASS(multi_devices_check_pass);
+USE_PASS(multi_devices_print_pass);
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -15,6 +15,17 @@
 #pragma once
 #include <string>
+#include <vector>
+#include "paddle/fluid/framework/ir/pass_builder.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/enforce.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
 namespace paddle {
 namespace framework {
@@ -57,6 +68,30 @@ struct BuildStrategy {
  bool fuse_elewise_add_act_ops_{false};
  bool enable_data_balance_{false};
+  // User normally doesn't need to call this API.
+  // The PassBuilder allows for more customized insert, remove of passes
+  // from python side.
+  // A new PassBuilder is created based on configs defined above and
+  // passes are owned by the PassBuilder.
+  std::shared_ptr<ir::PassBuilder> CreatePassesFromStrategy() const;
+  // Apply the passes built by the pass_builder_. The passes will be
+  // applied to the Program and output an ir::Graph.
+  std::unique_ptr<ir::Graph> Apply(
+      const ProgramDesc &main_program,
+      const std::vector<platform::Place> &places,
+      const std::string &loss_var_name,
+      const std::unordered_set<std::string> &param_names,
+      const std::vector<Scope *> &local_scopes,
+#ifdef PADDLE_WITH_CUDA
+      const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const;
+#else
+      const bool use_cuda) const;
+#endif
+ private:
+  mutable std::shared_ptr<ir::PassBuilder> pass_builder_;
 };
 }  // namespace details

--- a/paddle/fluid/framework/details/cow_ptr.h
+++ b/paddle/fluid/framework/details/cow_ptr.h
@@ -20,79 +20,37 @@ namespace paddle {
 namespace framework {
 namespace details {
-// Change it to thread safe flags if needed.
+template <class T>
-class ThreadUnsafeOwnershipFlags {
+class COWPtr {
 public:
-  explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
+  typedef std::shared_ptr<T> RefPtr;
-  ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
-  ThreadUnsafeOwnershipFlags& operator=(
-      const ThreadUnsafeOwnershipFlags& other) = delete;
-  ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
-  void SetOwnership(bool flag) { flag_ = flag; }
-  // Invoke the callback if it is not owned.
-  template <typename Callback>
-  void AcquireOwnershipOnce(Callback acquire) {
-    if (!flag_) {
-      acquire();
-      flag_ = true;
-    }
-  }
 private:
-  bool flag_;
+  RefPtr m_sp;
-};
-// Copy-On-Write pointer.
-// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
-//
-// The template parameter OwnershipFlags should have:
-//   * a constructor takes a bool. True if own.
-//   * SetOwnership(bool flag).
-//   * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
-//     owned.
-//
-// https://en.wikipedia.org/wiki/Copy-on-write
-template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
-class COWPtr {
 public:
-  // Ctor from raw pointer.
+  COWPtr() : m_sp(nullptr) {}
-  explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
+  explicit COWPtr(T* t) : m_sp(t) {}
-  // Move methods. Steal ownership from origin
+  const T& Data() const { return *m_sp; }
-  COWPtr(COWPtr&& other)
-      : payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
-  COWPtr& operator=(COWPtr&& origin) = default;
-  // Copy methods. Not own payload
-  COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
-  COWPtr& operator=(const COWPtr& other) {
-    payload_ = other.payload_;
-    ownership_.SetOwnership(false);
-    return *this;
-  }
-  // Access read only data.
-  const T& Data() const { return *payload_; }
-  // Access mutable data. If the data is not owned, the data will be copied
-  // before.
  T* MutableData() {
-    ownership_.AcquireOwnershipOnce(
+    DetachIfNotUnique();
-        [this] { payload_.reset(new T(*payload_)); });
+    return m_sp.get();
-    return payload_.get();
  }
- private:
+  void DetachIfNotUnique() {
-  // Actual data pointer.
+    T* tmp = m_sp.get();
-  std::shared_ptr<T> payload_;
+    if (!(tmp == nullptr || m_sp.unique())) {
+      Detach();
+    }
+  }
-  // Ownership flag.
+  void Detach() {
-  OwnershipFlags ownership_;
+    T* tmp = m_sp.get();
+    m_sp = RefPtr(new T(*tmp));
+  }
 };
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/cow_ptr_test.cc
+++ b/paddle/fluid/framework/details/cow_ptr_test.cc
@@ -30,6 +30,14 @@ TEST(COWPtr, all) {
  ASSERT_EQ(ptr2.Data(), 10);
 }
+TEST(COWPtr, change_old) {
+  COWPtr<int> ptr(new int{0});
+  COWPtr<int> ptr2 = ptr;
+  *ptr.MutableData() = 10;
+  ASSERT_EQ(ptr2.Data(), 0);
+  ASSERT_EQ(ptr.Data(), 10);
+}
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/details/reference_count_pass.cc
+++ b/paddle/fluid/framework/details/reference_count_pass.cc
@@ -80,16 +80,16 @@ std::unique_ptr<ir::Graph> ReferenceCountPass::ApplyImpl(
      // This is weird but there is really some variables without var_desc
      // in computation_op
      if (var_desc == nullptr) {
-        if (compute_op->Node()->Op()->Block()->FindVar(var_name) == nullptr)
+        var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name);
-          continue;
+        if (var_desc == nullptr) continue;
-      } else {
+      }
      if (var_desc->Persistable()) continue;
      auto var_type = var_desc->Proto()->type().type();
      if (var_type != proto::VarType::LOD_TENSOR &&
          var_type != proto::VarType::SELECTED_ROWS) {
        continue;
      }
-      }
      // compute op only runs in one device
      if (ref_cnts[place.device]->count(var_name))

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
@@ -76,15 +75,13 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
    var->GetMutable<platform::PlaceList>();
  } else if (var_type == proto::VarType::READER) {
    var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::CHANNEL) {
-    var->GetMutable<ChannelHolder>();
  } else if (var_type == proto::VarType::RAW) {
    // GetMutable will be called in operator
  } else {
    PADDLE_THROW(
        "Variable type %d is not in "
        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
+        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
        var_type);
  }
 }

--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -126,7 +126,6 @@ message VarType {
    LOD_TENSOR_ARRAY = 13;
    PLACE_LIST = 14;
    READER = 15;
-    CHANNEL = 16;
    // Any runtime decided variable type is raw
    // raw variables should manage their own allocations
    // in operators like nccl_op
@@ -158,12 +157,6 @@ message VarType {
  message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
  optional ReaderDesc reader = 5;
-  message ChannelDesc {
-    required Type data_type = 1;
-    required int64 capacity = 2;
-  }
-  optional ChannelDesc channel = 6;
  message Tuple { repeated Type element_type = 1; }
  optional Tuple tuple = 7;
 }

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
 set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h)
 file(WRITE ${pass_file} "// Generated by the paddle/fluid/framework/ir/CMakeLists.txt.  DO NOT EDIT!\n\n")
+file(APPEND ${pass_file} "\#pragma once\n")
 file(APPEND ${pass_file} "\#include \"paddle/fluid/framework/ir/pass.h\"\n")
@@ -28,12 +29,13 @@ cc_library(graph_pattern_detector SRCS graph_pattern_detector.cc DEPS graph grap
 pass_library(graph_to_program_pass base)
 pass_library(graph_viz_pass base)
 pass_library(fc_fuse_pass inference)
-if(WITH_MKLDNN)
+if (WITH_MKLDNN)
    pass_library(conv_relu_mkldnn_fuse_pass inference)
-endif()
+endif ()
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)
+pass_library(embedding_fc_lstm_fuse_pass inference)
 pass_library(fc_gru_fuse_pass inference)
 pass_library(seq_concat_fc_fuse_pass inference)
@@ -41,12 +43,14 @@ cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass
 set(GLOB_PASS_LIB ${PASS_LIBRARY} CACHE INTERNAL "Global PASS library")
+cc_library(pass_builder SRCS pass_builder.cc DEPS pass)
 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass)
 cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector)
 cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto)
-if(WITH_MKLDNN)
+if (WITH_MKLDNN)
    cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
-endif()
+endif ()
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -257,6 +257,22 @@ std::unique_ptr<ir::Graph> AttentionLSTMFusePass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  PDPattern external_pattern, subblock_pattern;
+  // Use the following variables to tell whether this model is RNN1.
+  // This fuse can only works on the RNN1 model.
+  std::unordered_set<std::string> specified_vars({"data_lod_attention",
+                                                  "cell_init", "hidden_init",
+                                                  "data", "week", "minute"});
+  int count = 0;
+  for (auto* node : graph->Nodes()) {
+    if (node->IsVar() && specified_vars.count(node->Name())) {
+      ++count;
+    }
+  }
+  if (count < specified_vars.size()) {
+    return graph;
+  }
+  // Continue to fuse.
  FindWhileOp(graph.get());
  return graph;
 }

--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass.cc
@@ -26,8 +26,6 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
  PADDLE_ENFORCE(graph.get());
  FusePassBase::Init("conv_relu_mkldnn_fuse", graph.get());
-  std::unordered_set<Node*> nodes2delete;
  GraphPatternDetector gpd;
  auto* conv_input = gpd.mutable_pattern()
                         ->NewNode("conv_relu_mkldnn_fuse/conv_input")
@@ -43,35 +41,19 @@ std::unique_ptr<ir::Graph> ConvReLUFusePass::ApplyImpl(
    VLOG(4) << "handle ConvReLU fuse";
    GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight,
                              conv_relu_pattern);                      // Filter
-    GET_IR_NODE_FROM_SUBGRAPH(conv_bias, conv_bias, conv_relu_pattern);  // Bias
    GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_relu_pattern);  // tmp
    GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_relu_pattern);  // CONV op
    GET_IR_NODE_FROM_SUBGRAPH(relu_out, relu_out, conv_relu_pattern);  // Out
    GET_IR_NODE_FROM_SUBGRAPH(relu, relu, conv_relu_pattern);  // ReLU op
-    // Create an ConvReLU Node.
+    // Transform Conv node into ConvReLU node.
-    OpDesc desc;
+    OpDesc* desc = conv->Op();
-    std::string conv_relu_i_in = subgraph.at(conv_input)->Name();
+    desc->SetOutput("Output", std::vector<std::string>({relu_out->Name()}));
-    std::string conv_relu_w_in = conv_weight->Name();
+    desc->SetAttr("fuse_relu", true);
-    std::string conv_relu_b_in = conv_bias->Name();
+    GraphSafeRemoveNodes(graph.get(), {relu, conv_out});
-    std::string conv_relu_out = relu_out->Name();
-    desc.SetInput("Input", std::vector<std::string>({conv_relu_i_in}));
-    desc.SetInput("Filter", std::vector<std::string>({conv_relu_w_in}));
-    desc.SetInput("Bias", std::vector<std::string>({conv_relu_b_in}));
-    desc.SetOutput("Output", std::vector<std::string>({conv_relu_out}));
-    desc.SetType("conv2d");
-    for (auto& attr : conv->Op()->GetAttrMap()) {
-      desc.SetAttr(attr.first, attr.second);
-    }
-    desc.SetAttr("fuse_relu", true);
-    auto conv_relu_node = g->CreateOpNode(&desc);  // OpDesc will be copied.
-    GraphSafeRemoveNodes(graph.get(), {conv, relu, conv_out});
    PADDLE_ENFORCE(subgraph.count(conv_input));
-    IR_NODE_LINK_TO(subgraph.at(conv_input), conv_relu_node);
+    IR_NODE_LINK_TO(conv, relu_out);
-    IR_NODE_LINK_TO(conv_weight, conv_relu_node);
-    IR_NODE_LINK_TO(conv_bias, conv_relu_node);
-    IR_NODE_LINK_TO(conv_relu_node, relu_out);
    found_conv_relu_count++;
  };

--- a/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/conv_relu_mkldnn_fuse_pass_tester.cc
@@ -85,19 +85,16 @@ TEST(ConvReLUFusePass, basic) {
  for (auto* node : graph->Nodes()) {
    if (node->IsOp() && node->Op()->Type() == "conv2d") {
-      if (node->Op()->HasAttr("use_mkldnn")) {
+      auto* op = node->Op();
-        bool use_mkldnn = boost::get<bool>(node->Op()->GetAttr("use_mkldnn"));
+      ASSERT_TRUE(op->HasAttr("use_mkldnn"));
-        if (use_mkldnn) {
+      EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
-          if (node->Op()->HasAttr("fuse_relu")) {
+      ASSERT_TRUE(op->HasAttr("fuse_relu"));
-            bool fuse_relu = boost::get<bool>(node->Op()->GetAttr("fuse_relu"));
+      bool fuse_relu = boost::get<bool>(op->GetAttr("fuse_relu"));
      if (fuse_relu) {
        ++conv_relu_count;
      }
    }
  }
-      }
-    }
-  }
  EXPECT_EQ(conv_relu_count, 1);
 }

--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h"
+#include <algorithm>
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/cpu_vec.h"
+#include "paddle/fluid/operators/math/fc_compute.h"
+#include "paddle/fluid/platform/cpu_info.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+static int BuildFusion(Graph* graph, const std::string& name_scope,
+                       Scope* scope, bool with_fc_bias) {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+  // Build pattern
+  PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
+                  ->assert_is_op_input("lookup_table")
+                  ->assert_var_not_persistable();
+  patterns::Embedding embedding_pattern(pattern, name_scope);
+  // TODO(jczaja): Intermediate can only be for val that are not used anywhere
+  //               but lookup table output may go into other LSTM (for reverse
+  //               direction)
+  auto* embedding_out = embedding_pattern(x);
+  patterns::FC fc_pattern(pattern, name_scope);
+  // fc_out is a tmp var, will be removed after fuse, so marked as intermediate.
+  auto* fc_out = fc_pattern(embedding_out, with_fc_bias)->AsIntermediate();
+  patterns::LSTM lstm_pattern(pattern, name_scope);
+  lstm_pattern(fc_out);
+  // Create New OpDesc
+  auto embedding_lstm_creator = [&](Node* embedding, Node* W, Node* lstm,
+                                    Node* input, Node* weight_x, Node* weight_h,
+                                    Node* bias, Node* hidden, Node* cell,
+                                    Node* xx, Node* fc_bias) {
+    OpDesc op_desc;
+    op_desc.SetType("fused_embedding_fc_lstm");
+#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
+    SET_IN(Ids, input);
+    SET_IN(WeightH, weight_h);
+    // Neet to have this passed as We need Wc data for peephole connections
+    SET_IN(Bias, bias);
+#undef SET_IN
+    // Multiply embeddings with Weights
+    PADDLE_ENFORCE(scope);
+    const std::string& embeddings = patterns::UniqueKey("Embeddings");
+    auto* embeddings_var = scope->Var(embeddings);
+    PADDLE_ENFORCE(embeddings_var);
+    auto* embeddings_tensor =
+        embeddings_var->GetMutable<framework::LoDTensor>();
+    // Get WeightX size: [single_embedding, fc_size]
+    // and embedding size: [dict_size, single_embedding]
+    // and create new size of embeddings eg. [dict_size , hidden_size]
+    auto* embedding_var = scope->FindVar(W->Name());
+    PADDLE_ENFORCE(embedding_var);
+    const auto& embedding_tensor = embedding_var->Get<framework::LoDTensor>();
+    const auto& weightx_tensor =
+        scope->FindVar(weight_x->Name())->Get<framework::LoDTensor>();
+    embeddings_tensor->Resize(
+        {embedding_tensor.dims()[0], weightx_tensor.dims()[1]});
+    // Multiplie embeddings via WeightsX and add bias
+    auto embedding_data = embedding_tensor.data<float>();
+    auto weightx_data = weightx_tensor.data<float>();
+    auto embeddings_data =
+        embeddings_tensor->mutable_data<float>(platform::CPUPlace());
+    // Adding biases to GEMM result to be
+    auto* lstm_bias_var = scope->FindVar(bias->Name());
+    PADDLE_ENFORCE(lstm_bias_var);
+    const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
+    auto alpha = 1.0f;
+    auto beta = 1.0f;
+    int m = embedding_tensor.dims()[0];
+    int n = weightx_tensor.dims()[1];
+    int k = embedding_tensor.dims()[1];
+    // Copy only gate biases values (only actual bias data, not peephole
+    // weights)
+    std::vector<float> combined_biases;
+    combined_biases.reserve(n);
+    std::copy_n(lstm_bias_tensor.data<float>(), n,
+                std::back_inserter(combined_biases));
+    if (with_fc_bias) {
+      // Add FC-bias with LSTM-bias (into GEMM result to be)
+      auto* fc_bias_var = scope->FindVar(fc_bias->Name());
+      const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
+      for (int i = 0; i < fc_bias_tensor.numel(); i++) {
+        combined_biases[i] += fc_bias_tensor.data<float>()[i];
+      }
+    }
+    // broadcast biases
+    std::vector<float> ones(m, 1.0f);
+    paddle::operators::math::CBlas<float>::GEMM(
+        CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1,
+        &combined_biases[0], n, 0.0f, embeddings_data, n);
+    // Wx*embeddings + biases
+    paddle::operators::math::CBlas<float>::GEMM(
+        CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha,
+        embedding_data, k, weightx_data, n, beta, embeddings_data, n);
+    op_desc.SetInput("Embeddings", {embeddings});
+    // Create temp variables.
+    const std::string BatchedInput = patterns::UniqueKey("BatchedInput");
+    const std::string BatchedCellPreAct =
+        patterns::UniqueKey("BatchedCellPreAct");
+    const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
+    scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
+    scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
+    scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
+    op_desc.SetInput("H0", {});
+    op_desc.SetInput("C0", {});
+    op_desc.SetOutput("Hidden", {hidden->Name()});
+    op_desc.SetOutput("Cell", {cell->Name()});
+    op_desc.SetOutput("XX", {xx->Name()});
+    op_desc.SetOutput("BatchedGate", {BatchedGate});
+    op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
+    op_desc.SetOutput("BatchedInput", {BatchedInput});
+    op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
+    op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
+    // TODO(TJ): get from attr
+    op_desc.SetAttr("use_seq", true);
+    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+#define OP_SET_OUT(x)                            \
+  const std::string x = patterns::UniqueKey(#x); \
+  op_desc.SetOutput(#x, {x});                    \
+  scope->Var(x)->GetMutable<LoDTensor>()
+    OP_SET_OUT(BatchedCell);
+    OP_SET_OUT(BatchedHidden);
+    OP_SET_OUT(ReorderedH0);
+    OP_SET_OUT(ReorderedC0);
+#undef OP_SET_OUT
+    auto* op = graph->CreateOpNode(&op_desc);
+    IR_NODE_LINK_TO(input, op);
+    IR_NODE_LINK_TO(weight_x, op);
+    IR_NODE_LINK_TO(weight_h, op);
+    IR_NODE_LINK_TO(bias, op);
+    IR_NODE_LINK_TO(op, hidden);
+    return op;
+  };
+  int fusion_count{0};
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table, lookup_table, embedding_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(W, W, embedding_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
+    // TODO(jczaja): Add support for is_sparse / is_distributed
+    auto is_sparse = boost::get<bool>(lookup_table->Op()->GetAttr("is_sparse"));
+    auto is_distributed =
+        boost::get<bool>(lookup_table->Op()->GetAttr("is_distributed"));
+    if (is_sparse == true || is_distributed == true) {
+      return;
+    }
+    if (with_fc_bias) {
+      GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
+      embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight,
+                             Bias, Hidden, Cell, fc_out, fc_bias);
+      // Remove unneeded nodes.
+      // TODO(jczaja): Proper removing of lookup table
+      std::unordered_set<const Node*> marked_nodes(
+          //{lookup_table, mul, lstm, elementwise_add, fc_bias, W});
+          {mul, lstm, elementwise_add, fc_bias});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    } else {
+      GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern);
+      embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight,
+                             Bias, Hidden, Cell, fc_out, nullptr);
+      // Remove unneeded nodes.
+      // TODO(jczaja): Proper removing of lookup table
+      // std::unordered_set<const Node*> marked_nodes({lookup_table, W, mul,
+      // lstm});
+      std::unordered_set<const Node*> marked_nodes({mul, lstm});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    }
+    ++fusion_count;
+  };
+  gpd(graph, handler);
+  return fusion_count;
+}
+std::unique_ptr<ir::Graph> EmbeddingFCLSTMFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+                                 true /*with_fc_bias*/);
+  AddStatis(fusion_count);
+  return graph;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+REGISTER_PASS(embedding_fc_lstm_fuse_pass,
+              paddle::framework::ir::EmbeddingFCLSTMFusePass);
--- a/paddle/fluid/inference/api/timer.h
+++ b/paddle/fluid/inference/api/timer.h
@@ -11,29 +11,30 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
-#include <chrono>  // NOLINT
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 namespace paddle {
-namespace inference {
+namespace framework {
+namespace ir {
+// Fusing of Embedding , FC and LSTM op
-// Timer for timer
+// Just FC without bias
-class Timer {
+class EmbeddingFCLSTMFusePass : public FusePassBase {
 public:
-  std::chrono::high_resolution_clock::time_point start;
+  virtual ~EmbeddingFCLSTMFusePass() {}
-  std::chrono::high_resolution_clock::time_point startu;
+ protected:
-  void tic() { start = std::chrono::high_resolution_clock::now(); }
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
-  double toc() {
-    startu = std::chrono::high_resolution_clock::now();
+  const std::string name_scope_{"embedding_fc_lstm_fuse"};
-    std::chrono::duration<double> time_span =
-        std::chrono::duration_cast<std::chrono::duration<double>>(startu -
-                                                                  start);
-    double used_time_ms = static_cast<double>(time_span.count()) * 1000.0;
-    return used_time_ms;
-  }
 };
-}  // namespace inference
+}  // namespace ir
+}  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc
@@ -77,10 +77,12 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
    const std::string BatchedCellPreAct =
        patterns::UniqueKey("BatchedCellPreAct");
    const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
+    const std::string CheckedCell = patterns::UniqueKey("CheckedCell");
    scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
    scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
    scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
+    scope->Var(CheckedCell)->GetMutable<framework::LoDTensor>();
    op_desc.SetInput("H0", {});
    op_desc.SetInput("C0", {});
@@ -90,6 +92,7 @@ int BuildFusion(Graph* graph, const std::string& name_scope, Scope* scope,
    op_desc.SetOutput("BatchedGate", {BatchedGate});
    op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
    op_desc.SetOutput("BatchedInput", {BatchedInput});
+    op_desc.SetOutput("CheckedCell", {CheckedCell});
    op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
    op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
    // TODO(TJ): get from attr

--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include <algorithm>
+#include <deque>
 #include <unordered_set>
-#include "paddle/fluid/framework/ir/graph_helper.h"
 namespace paddle {
 namespace framework {
 namespace ir {
@@ -113,6 +113,74 @@ std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
  return adj_list;
 }
+size_t GraphNum(const Graph &graph) {
+  std::unordered_set<ir::Node *> nodes = graph.Nodes();
+  std::unordered_set<ir::Node *> visited_nodes;
+  visited_nodes.reserve(nodes.size());
+  std::deque<ir::Node *> q_nodes;
+  std::vector<std::unordered_set<ir::Node *>> graph_nodes;
+  std::unordered_set<ir::Node *> g_nodes;
+  size_t graph_count = 0;
+  auto traverse_nodes = [&visited_nodes,
+                         &q_nodes](const std::vector<ir::Node *> &nodes) {
+    std::copy_if(
+        nodes.begin(), nodes.end(), std::back_inserter(q_nodes),
+        [&visited_nodes](Node *node) { return !visited_nodes.count(node); });
+  };
+  while (visited_nodes.size() != nodes.size()) {
+    if (!q_nodes.empty()) {
+      auto cur_node = q_nodes.front();
+      q_nodes.pop_front();
+      visited_nodes.insert(cur_node);
+      g_nodes.insert(cur_node);
+      traverse_nodes(cur_node->inputs);
+      traverse_nodes(cur_node->outputs);
+    } else {
+      ++graph_count;
+      if (g_nodes.size()) {
+        graph_nodes.emplace_back(g_nodes);
+      }
+      g_nodes.clear();
+      for (auto &n : nodes) {
+        if (visited_nodes.count(n) == 0) {
+          q_nodes.push_back(n);
+          break;
+        }
+      }
+    }
+  }
+  if (g_nodes.size()) {
+    graph_nodes.emplace_back(g_nodes);
+  }
+  if (VLOG_IS_ON(10)) {
+    VLOG(10) << "graph_num: " << graph_nodes.size();
+    for (auto &g_n : graph_nodes) {
+      VLOG(10) << "graph_nodes: " << g_n.size();
+      if (g_n.size() < 10) {
+        std::stringstream out;
+        for (auto &node : g_n) {
+          out << "\nNode: " << node->Name() << " in [";
+          for (auto &n : node->inputs) {
+            out << n->Name() << ", ";
+          }
+          out << "], out[";
+          for (auto &n : node->outputs) {
+            out << n->Name() << ", ";
+          }
+          out << "]";
+        }
+        VLOG(10) << out.str();
+      }
+    }
+  }
+  return graph_count;
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -27,6 +27,8 @@ namespace ir {
 // Test if the graph contains circle.
 bool HasCircle(const Graph &graph);
+size_t GraphNum(const Graph &graph);
 // Topology Sort the operations in the graph from inputs to outputs.
 // `graph` cannot contain circle.
 std::vector<ir::Node *> TopologySortOperations(const Graph &graph);

--- a/paddle/fluid/framework/ir/graph_helper_test.cc
+++ b/paddle/fluid/framework/ir/graph_helper_test.cc
@@ -120,6 +120,97 @@ TEST(GraphHelperTest, Basic) {
  ASSERT_EQ(node_map.at("op2"), 1UL);
  ASSERT_TRUE(node_map.at("op3") < node_map.at("op5"));
 }
+void BuildZeroGraph(Graph* g) {}
+void BuildOneGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
+  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
+  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
+  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
+  // o1->v1->o2
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+  // o2->v2->o3
+  // o2->v2->o4
+  o2->outputs.push_back(v2);
+  o3->inputs.push_back(v2);
+  o4->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o3);
+  v2->outputs.push_back(o4);
+  // o2->v3->o5
+  o2->outputs.push_back(v3);
+  o5->inputs.push_back(v3);
+  v3->inputs.push_back(o2);
+  v3->outputs.push_back(o5);
+  // o3-v4->o5
+  o3->outputs.push_back(v4);
+  o5->inputs.push_back(v4);
+  v4->inputs.push_back(o3);
+  v4->outputs.push_back(o5);
+}
+void BuildTwoGraphs(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
+  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
+  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
+  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
+  // o1->v1->o2
+  o1->outputs.push_back(v1);
+  o2->inputs.push_back(v1);
+  v1->inputs.push_back(o1);
+  v1->outputs.push_back(o2);
+  // o2->v2->o3
+  // o2->v2->o4
+  o2->outputs.push_back(v2);
+  o3->inputs.push_back(v2);
+  o4->inputs.push_back(v2);
+  v2->inputs.push_back(o2);
+  v2->outputs.push_back(o3);
+  v2->outputs.push_back(o4);
+  // o2->v3->o5
+  //  o2->outputs.push_back(v3);
+  o5->inputs.push_back(v3);
+  //  v3->inputs.push_back(o2);
+  v3->outputs.push_back(o5);
+  // o3-v4->o5
+  o3->outputs.push_back(v4);
+  //  o5->inputs.push_back(v4);
+  v4->inputs.push_back(o3);
+  //  v4->outputs.push_back(o5);
+}
+TEST(GraphHelperTest, GraphNum) {
+  ProgramDesc prog;
+  Graph g(prog);
+  BuildZeroGraph(&g);
+  ASSERT_EQ(GraphNum(g), 0);
+  Graph g2(prog);
+  BuildOneGraph(&g2);
+  ASSERT_EQ(GraphNum(g2), 1);
+  Graph g3(prog);
+  BuildTwoGraphs(&g3);
+  ASSERT_EQ(GraphNum(g3), 2);
+}
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -638,11 +638,6 @@ PDNode *patterns::ConvReLU::operator()(
                              ->AsInput()
                              ->assert_is_persistable_var()
                              ->assert_is_op_input("conv2d", "Filter");
-  // Bias
-  auto *conv_bias_var = pattern->NewNode(conv_bias_repr())
-                            ->AsInput()
-                            ->assert_is_persistable_var()
-                            ->assert_is_op_input("conv2d", "Bias");
  // intermediate variable, will be removed in the IR after fuse.
  auto *conv_out_var = pattern->NewNode(conv_out_repr())
                           ->AsIntermediate()
@@ -653,8 +648,7 @@ PDNode *patterns::ConvReLU::operator()(
                           ->AsOutput()
                           ->assert_is_op_output("relu");
-  conv_op->LinksFrom({conv_input, conv_weight_var, conv_bias_var})
+  conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var});
-      .LinksTo({conv_out_var});
  relu_op->LinksFrom({conv_out_var}).LinksTo({relu_out_var});
  return relu_out_var;
 }
@@ -698,6 +692,24 @@ PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x,
  }
 }
+PDNode *patterns::Embedding::operator()(PDNode *x) {
+  x->assert_is_op_input("lookup_table", "Ids");
+  auto *lookup_table_op =
+      pattern->NewNode(lookup_table_repr())->assert_is_op("lookup_table");
+#define NEW_NODE(arg__, io__)                    \
+  auto *arg__ = pattern->NewNode(arg__##_repr()) \
+                    ->assert_is_op_##io__("lookup_table", #arg__);
+  NEW_NODE(W, input);
+  NEW_NODE(Out, output);
+#undef NEW_NODE
+  lookup_table_op->LinksFrom({x, W});
+  lookup_table_op->LinksTo({Out});
+  return Out;
+}
 PDNode *patterns::LSTM::operator()(PDNode *x) {
  x->assert_is_op_input("lstm", "Input");
  auto *lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm");

--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -379,7 +379,7 @@ struct PatternBase {
 // op: conv + relu
 // named nodes:
 // conv_input, conv_weight,
-// conv_bias, conv_out, conv,
+// conv_out, conv,
 // relu_out, relu
 struct ConvReLU : public PatternBase {
  ConvReLU(PDPattern* pattern, const std::string& name_scope)
@@ -392,7 +392,6 @@ struct ConvReLU : public PatternBase {
  PATTERN_DECL_NODE(relu);
  // declare variable node's name
  PATTERN_DECL_NODE(conv_weight);
-  PATTERN_DECL_NODE(conv_bias);
  PATTERN_DECL_NODE(conv_out);
  PATTERN_DECL_NODE(relu_out);
 };
@@ -419,6 +418,23 @@ struct FC : public PatternBase {
  PATTERN_DECL_NODE(Out);
 };
+// Embedding
+struct Embedding : public PatternBase {
+  Embedding(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "embedding") {}
+  PDNode* operator()(PDNode* x);
+  // declare operator node's name
+  PATTERN_DECL_NODE(lookup_table);
+  // Inputs
+  //
+  PATTERN_DECL_NODE(Ids);
+  PATTERN_DECL_NODE(W);  // embeddings
+  // Outputs
+  PATTERN_DECL_NODE(Out);
+};
 struct LSTM : public PatternBase {
  LSTM(PDPattern* pattern, const std::string& name_scope)
      : PatternBase(pattern, name_scope, "lstm") {}

--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
@@ -14,6 +14,8 @@
 #include "paddle/fluid/framework/ir/graph_traits.h"
+#include <vector>
 namespace paddle {
 namespace framework {
 namespace ir {

--- a/paddle/fluid/framework/ir/pass.cc
+++ b/paddle/fluid/framework/ir/pass.cc
@@ -19,7 +19,6 @@ namespace paddle {
 namespace framework {
 namespace ir {
 std::unique_ptr<Graph> Pass::Apply(std::unique_ptr<Graph> graph) const {
-  PADDLE_ENFORCE(!applied_, "Pass can only Apply() once.");
  PADDLE_ENFORCE(graph.get(), "graph passed to Pass::Apply() cannot be empty.");
  for (const std::string& attr : required_pass_attrs_) {
    PADDLE_ENFORCE(attrs_.find(attr) != attrs_.end(),

--- a/paddle/fluid/framework/ir/pass.h
+++ b/paddle/fluid/framework/ir/pass.h
@@ -42,6 +42,8 @@ class Pass {
    attr_dels_.clear();
  }
+  std::string Type() const { return type_; }
  std::unique_ptr<Graph> Apply(std::unique_ptr<Graph> graph) const;
  // Get a reference to the attributed previously set.
@@ -52,6 +54,21 @@ class Pass {
    return *boost::any_cast<AttrType *>(attrs_.at(attr_name));
  }
+  bool Has(const std::string &attr_name) const {
+    return attrs_.find(attr_name) != attrs_.end();
+  }
+  void Erase(const std::string &attr_name) {
+    if (!Has(attr_name)) {
+      return;
+    }
+    if (attr_dels_.find(attr_name) != attr_dels_.end()) {
+      attr_dels_[attr_name]();
+      attr_dels_.erase(attr_name);
+    }
+    attrs_.erase(attr_name);
+  }
  // Set a pointer to the attribute. Pass takes ownership of the attribute.
  template <typename AttrType>
  void Set(const std::string &attr_name, AttrType *attr) {
@@ -68,13 +85,15 @@ class Pass {
  // should delete the attribute.
  template <typename AttrType>
  void SetNotOwned(const std::string &attr_name, AttrType *attr) {
-    PADDLE_ENFORCE(attrs_.count(attr_name) == 0);
+    PADDLE_ENFORCE(attrs_.count(attr_name) == 0, "%s already set in the pass",
+                   attr_name);
    attrs_[attr_name] = attr;
  }
 protected:
-  virtual std::unique_ptr<Graph> ApplyImpl(
+  virtual std::unique_ptr<Graph> ApplyImpl(std::unique_ptr<Graph> graph) const {
-      std::unique_ptr<Graph> graph) const = 0;
+    LOG(FATAL) << "Calling virtual Pass not implemented.";
+  }
 private:
  template <typename PassType>
@@ -89,7 +108,10 @@ class Pass {
    required_graph_attrs_.insert(attrs.begin(), attrs.end());
  }
+  void RegisterType(const std::string &type) { type_ = type; }
  mutable bool applied_{false};
+  std::string type_;
  std::unordered_set<std::string> required_pass_attrs_;
  std::unordered_set<std::string> required_graph_attrs_;
  std::map<std::string, boost::any> attrs_;
@@ -143,10 +165,11 @@ struct PassRegistrar : public Registrar {
    PADDLE_ENFORCE(!PassRegistry::Instance().Has(pass_type),
                   "'%s' is registered more than once.", pass_type);
    PassRegistry::Instance().Insert(
-        pass_type, [this]() -> std::unique_ptr<Pass> {
+        pass_type, [this, pass_type]() -> std::unique_ptr<Pass> {
          std::unique_ptr<Pass> pass(new PassType());
          pass->RegisterRequiredPassAttrs(this->required_pass_attrs_);
          pass->RegisterRequiredGraphAttrs(this->required_graph_attrs_);
+          pass->RegisterType(pass_type);
          return pass;
        });
  }

--- a/paddle/fluid/framework/ir/pass_builder.cc
+++ b/paddle/fluid/framework/ir/pass_builder.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/framework/ir/pass_builder.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+std::shared_ptr<Pass> PassBuilder::AppendPass(const std::string& pass_type) {
+  auto pass = ir::PassRegistry::Instance().Get(pass_type);
+  passes_.emplace_back(pass.release());
+  return passes_.back();
+}
+void PassBuilder::RemovePass(size_t idx) {
+  PADDLE_ENFORCE(passes_.size() > idx);
+  passes_.erase(passes_.begin() + idx);
+}
+std::shared_ptr<Pass> PassBuilder::InsertPass(size_t idx,
+                                              const std::string& pass_type) {
+  PADDLE_ENFORCE(passes_.size() >= idx);
+  std::shared_ptr<Pass> pass(
+      ir::PassRegistry::Instance().Get(pass_type).release());
+  passes_.insert(passes_.begin() + idx, std::move(pass));
+  return passes_[idx];
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/pass_builder.h
+++ b/paddle/fluid/framework/ir/pass_builder.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ir/pass.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+class PassBuilder {
+ public:
+  PassBuilder() {}
+  virtual ~PassBuilder() {}
+  // Append a new pass to the end.
+  std::shared_ptr<Pass> AppendPass(const std::string& pass_type);
+  // Insert a new pass after `idx`.
+  std::shared_ptr<Pass> InsertPass(size_t idx, const std::string& pass_type);
+  // Remove a new pass at `idx`.
+  void RemovePass(size_t idx);
+  // Returns a list of all passes.
+  std::vector<std::shared_ptr<Pass>> AllPasses() const { return passes_; }
+ protected:
+  std::vector<std::shared_ptr<Pass>> passes_;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/pass_test.cc
+++ b/paddle/fluid/framework/ir/pass_test.cc
@@ -82,12 +82,10 @@ TEST(PassTest, TestPassAttrCheck) {
  ASSERT_EQ(graph->Get<int>("copy_test_pass_attr"), 2);
  ASSERT_EQ(graph->Get<int>("copy_test_graph_attr"), 2);
-  try {
+  // Allow apply more than once.
+  graph.reset(new Graph(prog));
+  graph->Set<int>("test_graph_attr", new int);
  graph = pass->Apply(std::move(graph));
-  } catch (paddle::platform::EnforceNotMet e) {
-    exception = std::string(e.what());
-  }
-  ASSERT_TRUE(exception.find("Pass can only Apply() once") != exception.npos);
  pass = PassRegistry::Instance().Get("test_pass");
  pass->SetNotOwned<int>("test_pass_attr", &val);

--- a/paddle/fluid/framework/mixed_vector.h
+++ b/paddle/fluid/framework/mixed_vector.h
@@ -17,10 +17,13 @@
 #include <algorithm>
 #include <initializer_list>
 #include <memory>
+#include <mutex>  // NOLINT
+#include <utility>
 #include <vector>
+#include "paddle/fluid/framework/details/cow_ptr.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "glog/logging.h"
@@ -28,173 +31,167 @@ namespace paddle {
 namespace framework {
 #if defined(PADDLE_WITH_CUDA)
-// Vector<T> implements the std::vector interface, and can get Data or
+namespace details {
-// MutableData from any place. The data will be synced implicitly inside.
+struct CUDABuffer {
-template <typename T>
+  void *data_{nullptr};
-class Vector {
+  size_t size_{0};
- public:
+  platform::CUDAPlace place_;
-  using value_type = T;
-  // Default ctor. Create empty Vector
+  CUDABuffer() {}
-  Vector() { InitEmpty(); }
+  CUDABuffer(platform::Place place, size_t size)
+      : size_(size), place_(boost::get<platform::CUDAPlace>(place)) {
-  // Fill vector with value. The vector size is `count`.
+    data_ = memory::Alloc(place_, size);
-  explicit Vector(size_t count, const T &value = T()) {
-    InitEmpty();
-    if (count != 0) {
-      resize(count);
-      T *ptr = begin();
-      for (size_t i = 0; i < count; ++i) {
-        ptr[i] = value;
-      }
-    }
  }
-  // Ctor with init_list
+  ~CUDABuffer() { ClearMemory(); }
-  Vector(std::initializer_list<T> init) {
-    if (init.size() == 0) {
+  CUDABuffer(const CUDABuffer &o) = delete;
-      InitEmpty();
+  CUDABuffer &operator=(const CUDABuffer &o) = delete;
-    } else {
-      InitByIter(init.size(), init.begin(), init.end());
+  void Resize(platform::Place place, size_t size) {
+    ClearMemory();
+    place_ = boost::get<platform::CUDAPlace>(place);
+    data_ = memory::Alloc(place_, size);
+    PADDLE_ENFORCE_NOT_NULL(data_);
+    size_ = size;
  }
+  void Swap(CUDABuffer &o) {
+    std::swap(data_, o.data_);
+    std::swap(place_, o.place_);
+    std::swap(size_, o.size_);
  }
-  // implicit cast from std::vector.
+ private:
-  template <typename U>
+  void ClearMemory() const {
-  Vector(const std::vector<U> &dat) {  // NOLINT
+    if (data_ != nullptr) {
-    if (dat.size() == 0) {
+      memory::Free(place_, data_);
-      InitEmpty();
-    } else {
-      InitByIter(dat.size(), dat.begin(), dat.end());
    }
  }
+};
+}  // namespace details
-  // Copy ctor
+// Vector<T> implements the std::vector interface, and can get Data or
-  Vector(const Vector<T> &other) { this->operator=(other); }
+// MutableData from any place. The data will be synced implicitly inside.
+template <typename T>
+class Vector {
+ public:
+  using value_type = T;
+  using iterator = typename std::vector<T>::iterator;
+  using const_iterator = typename std::vector<T>::const_iterator;
-  // Copy operator
+ private:
-  Vector<T> &operator=(const Vector<T> &other) {
+  // The actual class to implement vector logic
-    if (other.size() != 0) {
+  class VectorData {
-      this->InitByIter(other.size(), other.begin(), other.end());
+   public:
-    } else {
+    VectorData() : flag_(kDataInCPU) {}
-      InitEmpty();
+    VectorData(size_t count, const T &value)
-    }
+        : cpu_(count, value), flag_(kDataInCPU) {}
-    return *this;
+    VectorData(std::initializer_list<T> init) : cpu_(init), flag_(kDataInCPU) {}
-  }
+    template <typename U>
+    explicit VectorData(const std::vector<U> &dat)
+        : cpu_(dat), flag_(kDataInCPU) {}
+    ~VectorData() {}
-  // Move ctor
+    VectorData(const VectorData &o) {
-  Vector(Vector<T> &&other) {
+      o.ImmutableCPU();
-    this->size_ = other.size_;
+      cpu_ = o.cpu_;
-    this->flag_ = other.flag_;
+      flag_ = kDataInCPU;
-    if (other.cuda_vec_.memory_size()) {
-      this->cuda_vec_.ShareDataWith(other.cuda_vec_);
-    }
-    if (other.cpu_vec_.memory_size()) {
-      this->cpu_vec_.ShareDataWith(other.cpu_vec_);
    }
+    VectorData &operator=(const VectorData &o) {
+      o.ImmutableCPU();
+      cpu_ = o.cpu_;
+      flag_ = kDataInCPU;
+      details::CUDABuffer null;
+      gpu_.Swap(null);
+      return *this;
    }
-  // CPU data access method. Mutable.
    T &operator[](size_t i) {
      MutableCPU();
-    return const_cast<T *>(cpu_vec_.data<T>())[i];
+      return cpu_[i];
    }
-  // CPU data access method. Immutable.
    const T &operator[](size_t i) const {
      ImmutableCPU();
-    return cpu_vec_.data<T>()[i];
+      return cpu_[i];
    }
-  // std::vector iterator methods. Based on CPU data access method
+    size_t size() const { return cpu_.size(); }
-  size_t size() const { return size_; }
-  T *begin() { return capacity() == 0 ? &EmptyDummy() : &this->operator[](0); }
+    iterator begin() {
+      MutableCPU();
+      return cpu_.begin();
+    }
-  T *end() {
+    iterator end() {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
+      MutableCPU();
+      return cpu_.end();
    }
-  T &front() { return *begin(); }
+    T &front() {
+      MutableCPU();
+      return cpu_.front();
+    }
    T &back() {
-    auto it = end();
+      MutableCPU();
-    --it;
+      return cpu_.back();
-    return *it;
    }
-  const T *begin() const {
+    const_iterator begin() const {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](0);
+      ImmutableCPU();
+      return cpu_.begin();
    }
-  const T *end() const {
+    const_iterator end() const {
-    return capacity() == 0 ? &EmptyDummy() : &this->operator[](size());
+      ImmutableCPU();
+      return cpu_.end();
    }
-  const T *cbegin() const { return begin(); }
-  const T *cend() const { return end(); }
    const T &back() const {
-    auto it = end();
+      ImmutableCPU();
-    --it;
+      return cpu_.back();
-    return *it;
    }
-  T *data() { return begin(); }
+    T *data() { return &(*this)[0]; }
-  const T *data() const { return begin(); }
+    const T *data() const { return &(*this)[0]; }
-  const T &front() const { return *begin(); }
+    const T &front() const {
-  // end of std::vector iterator methods
+      ImmutableCPU();
+      return cpu_.front();
+    }
    // assign this from iterator.
    // NOTE: the iterator must support `end-begin`
    template <typename Iter>
    void assign(Iter begin, Iter end) {
-    InitByIter(end - begin, begin, end);
+      MutableCPU();
+      cpu_.assign(begin, end);
    }
    // push_back. If the previous capacity is not enough, the memory will
    // double.
    void push_back(T elem) {
-    if (size_ + 1 > capacity()) {
+      MutableCPU();
-      reserve((size_ + 1) << 1);
+      cpu_.push_back(elem);
-    }
-    *end() = elem;
-    ++size_;
    }
    // extend a vector by iterator.
    // NOTE: the iterator must support end-begin
    template <typename It>
    void Extend(It begin, It end) {
-    size_t pre_size = size_;
+      MutableCPU();
-    resize(pre_size + (end - begin));
+      auto out_it = std::back_inserter<std::vector<T>>(this->cpu_);
-    T *ptr = this->begin() + pre_size;
+      std::copy(begin, end, out_it);
-    for (; begin < end; ++begin, ++ptr) {
-      *ptr = *begin;
-    }
    }
    // resize the vector
    void resize(size_t size) {
-    if (size + 1 <= capacity()) {
-      size_ = size;
-    } else {
      MutableCPU();
-      Tensor cpu_tensor;
+      cpu_.resize(size);
-      platform::Place cpu = platform::CPUPlace();
-      T *ptr = cpu_tensor.mutable_data<T>(
-          framework::make_ddim({static_cast<int64_t>(size)}), cpu);
-      const T *old_ptr =
-          cpu_vec_.memory_size() == 0 ? nullptr : cpu_vec_.data<T>();
-      if (old_ptr != nullptr) {
-        std::copy(old_ptr, old_ptr + size_, ptr);
-      }
-      size_ = size;
-      cpu_vec_.ShareDataWith(cpu_tensor);
-    }
    }
    // get cuda ptr. immutable
@@ -202,7 +199,7 @@ class Vector {
      PADDLE_ENFORCE(platform::is_gpu_place(place),
                     "CUDA Data must on CUDA place");
      ImmutableCUDA(place);
-    return cuda_vec_.data<T>();
+      return reinterpret_cast<T *>(gpu_.data_);
    }
    // get cuda ptr. mutable
@@ -214,77 +211,39 @@ class Vector {
    // clear
    void clear() {
-    size_ = 0;
+      cpu_.clear();
      flag_ = kDirty | kDataInCPU;
    }
-  size_t capacity() const {
+    size_t capacity() const { return cpu_.capacity(); }
-    return cpu_vec_.memory_size() / SizeOfType(typeid(T));
-  }
    // reserve data
-  void reserve(size_t size) {
+    void reserve(size_t size) const { cpu_.reserve(size); }
-    size_t pre_size = size_;
-    resize(size);
-    resize(pre_size);
-  }
-  // the unify method to access CPU or CUDA data. immutable.
-  const T *Data(platform::Place place) const {
-    if (platform::is_gpu_place(place)) {
-      return CUDAData(place);
-    } else {
-      return data();
-    }
-  }
-  // the unify method to access CPU or CUDA data. mutable.
-  T *MutableData(platform::Place place) {
-    if (platform::is_gpu_place(place)) {
-      return CUDAMutableData(place);
-    } else {
-      return data();
-    }
-  }
    // implicit cast operator. Vector can be cast to std::vector implicitly.
    operator std::vector<T>() const {
-    std::vector<T> result;
+      ImmutableCPU();
-    result.resize(size());
+      return cpu_;
-    std::copy(begin(), end(), result.begin());
-    return result;
    }
-  bool operator==(const Vector<T> &other) const {
+    bool operator==(const VectorData &other) const {
-    if (size() != other.size()) return false;
+      ImmutableCPU();
-    auto it1 = cbegin();
+      other.ImmutableCPU();
-    auto it2 = other.cbegin();
+      return cpu_ == other.cpu_;
-    for (; it1 < cend(); ++it1, ++it2) {
-      if (*it1 != *it2) {
-        return false;
-      }
-    }
-    return true;
    }
- private:
+    std::mutex &Mutex() const { return mtx_; }
-  void InitEmpty() {
-    size_ = 0;
-    flag_ = kDataInCPU;
-  }
-  template <typename Iter>
+    std::unique_ptr<platform::CUDAPlace> CUDAPlace() const {
-  void InitByIter(size_t size, Iter begin, Iter end) {
+      if (gpu_.data_ == nullptr) {
-    platform::Place cpu = platform::CPUPlace();
+        return nullptr;
-    T *ptr = this->cpu_vec_.template mutable_data<T>(
+      } else {
-        framework::make_ddim({static_cast<int64_t>(size)}), cpu);
+        return std::unique_ptr<platform::CUDAPlace>(
-    for (size_t i = 0; i < size; ++i) {
+            new platform::CUDAPlace(gpu_.place_));
-      *ptr++ = *begin++;
+      }
-    }
-    flag_ = kDataInCPU | kDirty;
-    size_ = size;
    }
+   private:
    enum DataFlag {
      kDataInCPU = 0x01,
      kDataInCUDA = 0x02,
@@ -294,8 +253,15 @@ class Vector {
    void CopyToCPU() const {
      // COPY GPU Data To CPU
-    TensorCopy(cuda_vec_, platform::CPUPlace(), &cpu_vec_);
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
-    WaitPlace(cuda_vec_.place());
+          platform::DeviceContextPool::Instance().Get(
+              platform::Place(gpu_.place_)));
+      auto stream = dev_ctx->stream();
+      void *src = gpu_.data_;
+      void *dst = cpu_.data();
+      memory::Copy(platform::CPUPlace(), dst, gpu_.place_, src, gpu_.size_,
+                   stream);
+      dev_ctx->Wait();
    }
    void MutableCPU() {
@@ -308,16 +274,12 @@ class Vector {
    void ImmutableCUDA(platform::Place place) const {
      if (IsDirty()) {
        if (IsInCPU()) {
-        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
+          CopyCPUDataToCUDA(place);
-                   &cuda_vec_);
-        WaitPlace(place);
          UnsetFlag(kDirty);
          SetFlag(kDataInCUDA);
-      } else if (IsInCUDA() && !(place == cuda_vec_.place())) {
+        } else if (IsInCUDA() &&
-        framework::Tensor tmp;
+                   !(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
-        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
+          PADDLE_THROW("This situation should not happen");
-        WaitPlace(cuda_vec_.place());
-        cuda_vec_.ShareDataWith(tmp);
          // Still dirty
        } else {
          // Dirty && DataInCUDA && Device is same
@@ -326,17 +288,10 @@ class Vector {
      } else {
        if (!IsInCUDA()) {
          // Even data is not dirty. However, data is not in CUDA. Copy data.
-        TensorCopy(cpu_vec_, boost::get<platform::CUDAPlace>(place),
+          CopyCPUDataToCUDA(place);
-                   &cuda_vec_);
-        WaitPlace(place);
          SetFlag(kDataInCUDA);
-      } else if (!(place == cuda_vec_.place())) {
+        } else if (!(boost::get<platform::CUDAPlace>(place) == gpu_.place_)) {
-        framework::Tensor tmp;
+          PADDLE_THROW("This situation should not happen.");
-        WaitPlace(cuda_vec_.place());
-        TensorCopy(cuda_vec_, boost::get<platform::CUDAPlace>(place), &tmp);
-        WaitPlace(cuda_vec_.place());
-        WaitPlace(place);
-        cuda_vec_.ShareDataWith(tmp);
        } else {
          // Not Dirty && DataInCUDA && Device is same
          // Do nothing.
@@ -344,9 +299,20 @@ class Vector {
      }
    }
+    void CopyCPUDataToCUDA(const platform::Place &place) const {
+      void *src = cpu_.data();
+      gpu_.Resize(place, cpu_.size() * sizeof(T));
+      void *dst = gpu_.data_;
+      auto *dev_ctx = static_cast<platform::CUDADeviceContext *>(
+          platform::DeviceContextPool::Instance().Get(place));
+      auto stream = dev_ctx->stream();
+      memory::Copy(gpu_.place_, dst, platform::CPUPlace(), src, gpu_.size_,
+                   stream);
+    }
    void ImmutableCPU() const {
-    if (IsDirty() &&
+      if (IsDirty() && !IsInCPU()) {  // If data has been changed in CUDA, or
-        !IsInCPU()) {  // If data has been changed in CUDA, or CPU has no data.
+                                      // CPU has no data.
        CopyToCPU();
        UnsetFlag(kDirty);
      }
@@ -362,23 +328,178 @@ class Vector {
    bool IsInCPU() const { return flag_ & kDataInCPU; }
-  static void WaitPlace(const platform::Place place) {
+    mutable std::vector<T> cpu_;
+    mutable details::CUDABuffer gpu_;
+    mutable int flag_;
+    mutable std::mutex mtx_;
+  };
+ public:
+  // Default ctor. Create empty Vector
+  Vector() : m_(new VectorData()) {}
+  // Fill vector with value. The vector size is `count`.
+  explicit Vector(size_t count, const T &value = T())
+      : m_(new VectorData(count, value)) {}
+  // Ctor with init_list
+  Vector(std::initializer_list<T> init) : m_(new VectorData(init)) {}
+  // implicit cast from std::vector.
+  template <typename U>
+  Vector(const std::vector<U> &dat) : m_(new VectorData(dat)) {  // NOLINT
+  }
+  // Copy ctor
+  Vector(const Vector<T> &other) { m_ = other.m_; }
+  // Copy operator
+  Vector<T> &operator=(const Vector<T> &other) {
+    m_ = other.m_;
+    return *this;
+  }
+  // Move ctor
+  Vector(Vector<T> &&other) { m_ = std::move(other.m_); }
+  // CPU data access method. Mutable.
+  T &operator[](size_t i) { return (*m_.MutableData())[i]; }
+  // CPU data access method. Immutable.
+  const T &operator[](size_t i) const { return m_.Data()[i]; }
+  // std::vector iterator methods. Based on CPU data access method
+  size_t size() const { return m_.Data().size(); }
+  iterator begin() { return m_.MutableData()->begin(); }
+  iterator end() { return m_.MutableData()->end(); }
+  T &front() { return m_.MutableData()->front(); }
+  T &back() { return m_.MutableData()->back(); }
+  const_iterator begin() const { return m_.Data().begin(); }
+  const_iterator end() const { return m_.Data().end(); }
+  const_iterator cbegin() const { return begin(); }
+  const_iterator cend() const { return end(); }
+  const T &back() const { return m_.Data().back(); }
+  T *data() { return m_.MutableData()->data(); }
+  const T *data() const { return m_.Data().data(); }
+  const T &front() const { return m_.Data().front(); }
+  // end of std::vector iterator methods
+  // assign this from iterator.
+  // NOTE: the iterator must support `end-begin`
+  template <typename Iter>
+  void assign(Iter begin, Iter end) {
+    m_.MutableData()->assign(begin, end);
+  }
+  // push_back. If the previous capacity is not enough, the memory will
+  // double.
+  void push_back(T elem) { m_.MutableData()->push_back(elem); }
+  // extend a vector by iterator.
+  // NOTE: the iterator must support end-begin
+  template <typename It>
+  void Extend(It begin, It end) {
+    m_.MutableData()->Extend(begin, end);
+  }
+  // resize the vector
+  void resize(size_t size) {
+    if (m_.Data().size() != size) {
+      m_.MutableData()->resize(size);
+    }
+  }
+  // get cuda ptr. immutable
+  const T *CUDAData(platform::Place place) const {
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.Data().CUDAData(place);
+      }
+    }
+    // If m_ contains CUDAData in a different place. Detach manually.
+    m_.Detach();
+    return CUDAData(place);
+  }
+  // get cuda ptr. mutable
+  T *CUDAMutableData(platform::Place place) {
+    {
+      auto &mtx = m_.Data().Mutex();
+      std::lock_guard<std::mutex> guard(mtx);
+      auto cuda_place = m_.Data().CUDAPlace();
+      if (cuda_place == nullptr ||
+          *cuda_place == boost::get<platform::CUDAPlace>(place)) {
+        return m_.MutableData()->CUDAMutableData(place);
+      }
+    }
+    // If m_ contains CUDAData in a different place. Detach manually.
+    m_.Detach();
+    return CUDAMutableData(place);
+  }
+  // clear
+  void clear() { m_.MutableData()->clear(); }
+  size_t capacity() const { return m_.Data().capacity(); }
+  // reserve data
+  void reserve(size_t size) { m_.Data().reserve(size); }
+  // the unify method to access CPU or CUDA data. immutable.
+  const T *Data(platform::Place place) const {
+    if (platform::is_gpu_place(place)) {
+      return CUDAData(place);
+    } else {
+      return data();
+    }
+  }
+  // the unify method to access CPU or CUDA data. mutable.
+  T *MutableData(platform::Place place) {
    if (platform::is_gpu_place(place)) {
-      platform::DeviceContextPool::Instance()
+      return CUDAMutableData(place);
-          .Get(boost::get<platform::CUDAPlace>(place))
+    } else {
-          ->Wait();
+      return data();
    }
  }
-  static T &EmptyDummy() {
+  // implicit cast operator. Vector can be cast to std::vector implicitly.
-    static T dummy = T();
+  operator std::vector<T>() const { return m_.Data(); }
-    return dummy;
+  bool operator==(const Vector<T> &other) const {
+    if (size() != other.size()) return false;
+    auto it1 = cbegin();
+    auto it2 = other.cbegin();
+    for (; it1 < cend(); ++it1, ++it2) {
+      if (*it1 != *it2) {
+        return false;
+      }
+    }
+    return true;
  }
-  mutable int flag_;
+  const void *Handle() const { return &m_.Data(); }
-  mutable Tensor cpu_vec_;
-  mutable Tensor cuda_vec_;
+ private:
-  size_t size_;
+  // Vector is an COW object.
+  mutable details::COWPtr<VectorData> m_;
 };
 #else  // PADDLE_WITH_CUDA

--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/feed_fetch_method.h"
+#include "paddle/fluid/framework/lod_rank_table.h"
+#include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/naive_executor.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+#include "paddle/fluid/string/pretty_log.h"
+namespace paddle {
+namespace framework {
+// These code can be shared with Executor.
+static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
+  if (var_type == proto::VarType::LOD_TENSOR) {
+    var->GetMutable<LoDTensor>();
+  } else if (var_type == proto::VarType::SELECTED_ROWS) {
+    var->GetMutable<SelectedRows>();
+  } else if (var_type == proto::VarType::FEED_MINIBATCH) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == proto::VarType::FETCH_LIST) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == proto::VarType::STEP_SCOPES) {
+    var->GetMutable<std::vector<framework::Scope>>();
+  } else if (var_type == proto::VarType::LOD_RANK_TABLE) {
+    var->GetMutable<LoDRankTable>();
+  } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) {
+    var->GetMutable<LoDTensorArray>();
+  } else if (var_type == proto::VarType::PLACE_LIST) {
+    var->GetMutable<platform::PlaceList>();
+  } else if (var_type == proto::VarType::READER) {
+    var->GetMutable<ReaderHolder>();
+  } else if (var_type == proto::VarType::RAW) {
+    // GetMutable will be called in operator
+  } else {
+    PADDLE_THROW(
+        "Variable type %d is not in "
+        "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
+        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
+        var_type);
+  }
+}
+void NaiveExecutor::Prepare(Scope *parent_scope,
+                            const ProgramDesc &program_desc, int block_id,
+                            bool with_feed_fetch_ops) {
+  if (!parent_scope) {
+    scope_ = new framework::Scope;
+  } else {
+    scope_ = &parent_scope->NewScope();
+  }
+  CreateVariables(program_desc, scope_, block_id);
+  CreateOps(program_desc, block_id, with_feed_fetch_ops);
+}
+void NaiveExecutor::Run() {
+  for (auto &op : ops_) {
+    VLOG(4) << "run " << op->Type();
+    op->Run(*scope_, place_);
+  }
+}
+void NaiveExecutor::CreateVariables(const ProgramDesc &desc, Scope *scope,
+                                    int block_id) {
+  PADDLE_ENFORCE(scope);
+  auto &global_block = desc.Block(block_id);
+  const Scope *ancestor_scope = scope;
+  while (ancestor_scope->parent()) {
+    ancestor_scope = ancestor_scope->parent();
+  }
+  if (ancestor_scope != scope) {
+    for (auto &var : global_block.AllVars()) {
+      if (var->Name() == framework::kEmptyVarName) {
+        continue;
+      }
+      // Create persistable vars in ancestor scope.
+      if (var->Persistable()) {
+        auto *ptr = const_cast<Scope *>(ancestor_scope)->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+        VLOG(3) << "Create Variable " << var->Name()
+                << " global, which pointer is " << ptr;
+      } else {  // Create temporary variables in local scope.
+        auto *ptr = scope->Var(var->Name());
+        InitializeVariable(ptr, var->GetType());
+        VLOG(3) << "Create Variable " << var->Name()
+                << " locally, which pointer is " << ptr;
+      }
+    }
+  } else {
+    for (auto &var : global_block.AllVars()) {
+      auto *ptr = scope->Var(var->Name());
+      InitializeVariable(ptr, var->GetType());
+      VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
+              << ptr;
+    }
+  }
+}
+void NaiveExecutor::CreateOps(const ProgramDesc &desc, int block_id,
+                              bool with_feed_fetch_ops) {
+  for (const auto &op_desc : desc.Block(block_id).AllOps()) {
+    if (!with_feed_fetch_ops &&
+        (op_desc->Type() == "feed" || op_desc->Type() == "fetch")) {
+      string::PrettyLogEndl(string::Style::detail(), "---  skip [%s], %s -> %s",
+                            op_desc->Input("X")[0], op_desc->Type(),
+                            op_desc->Output("Out")[0]);
+      continue;
+    }
+    ops_.emplace_back(OpRegistry::CreateOp(*op_desc));
+  }
+}
+LoDTensor *NaiveExecutor::FindTensor(const std::string &name) {
+  PADDLE_ENFORCE(scope_, "Need to init scope first");
+  auto *var = scope_->FindVar(name);
+  PADDLE_ENFORCE(var, "No variable [%s] in the scope");
+  auto *tensor = const_cast<LoDTensor *>(&var->Get<LoDTensor>());
+  return tensor;
+}
+void NaiveExecutor::CleanFeedFetchOps() {
+  std::vector<std::unique_ptr<OperatorBase>> ops;
+  for (auto &op : ops_) {
+    if (op->Type() != "feed" && op->Type() != "fetch") {
+      ops.emplace_back(std::move(op));
+    }
+  }
+  ops_.swap(ops);
+}
+void NaiveExecutor::EnableMKLDNN(const ProgramDesc &program) {
+#ifdef PADDLE_WITH_MKLDNN
+  VLOG(3) << "use_mkldnn=True";
+  for (size_t block_id = 0; block_id < program.Size(); ++block_id) {
+    auto *block = const_cast<ProgramDesc &>(program).MutableBlock(block_id);
+    for (auto *op : block->AllOps()) {
+      if (op->HasAttr("use_mkldnn")) {
+        op->SetAttr("use_mkldnn", true);
+      }
+    }
+  }
+#else
+  LOG(WARNING)
+      << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option";
+#endif
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/naive_executor.h
+++ b/paddle/fluid/framework/naive_executor.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/platform/device_context.h"
+namespace paddle {
+namespace framework {
+/*
+ * Simple, intuitive and effective. Only single thread is supported, and
+ * currently designed for inference.
+ */
+class NaiveExecutor {
+ public:
+  explicit NaiveExecutor(const platform::Place& place) : place_(place) {}
+  // Create child scope.
+  // Create variables.
+  // @with_feed_fetch_ops: whether to work with the feed and fetch operators.
+  void Prepare(Scope* parent_scope, const ProgramDesc& program_desc,
+               int block_id, bool with_feed_fetch_ops);
+  // Run all the operators.
+  void Run();
+  // Get an tensor to operating directly, without the need for feed_ops.
+  LoDTensor* FindTensor(const std::string& name);
+  Scope* scope() { return scope_; }
+  void CleanFeedFetchOps();
+  void EnableMKLDNN(const ProgramDesc& program);
+ protected:
+  void CreateVariables(const ProgramDesc& desc, Scope* scope, int block_id);
+  void CreateOps(const ProgramDesc& desc, int block_id,
+                 bool with_feed_fetch_ops);
+ private:
+  const platform::Place place_;
+  // Catch the required resource to avoid recreate.
+  std::vector<std::unique_ptr<OperatorBase>> ops_;
+  Scope* scope_;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/naive_executor_test.cc
+++ b/paddle/fluid/framework/naive_executor_test.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/naive_executor.h"
+#include <gtest/gtest.h>
+#include <algorithm>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/program_desc.h"
+namespace paddle {
+namespace framework {
+TEST(NaiveExecutor, Basic) {
+  ProgramDesc program;
+  auto* main_block = program.MutableBlock(0);
+  auto* a = main_block->Var("a");  // input
+  auto* b = main_block->Var("b");  // input
+  auto* c = main_block->Var("c");  // input
+  a->SetType(proto::VarType::LOD_TENSOR);
+  b->SetType(proto::VarType::LOD_TENSOR);
+  c->SetType(proto::VarType::LOD_TENSOR);
+  auto* add = main_block->AppendOp();
+  add->SetType("elementwise_add");
+  add->SetInput("X", {"a"});
+  add->SetInput("Y", {"b"});
+  add->SetOutput("Out", {"c"});
+  auto place = platform::CPUPlace();
+  NaiveExecutor exe(place);
+  exe.Prepare(nullptr, program, 0, false /*with feed fetch ops*/);
+  auto* a_tensor = exe.FindTensor("a");
+  auto* b_tensor = exe.FindTensor("b");
+  auto* c_tensor = exe.FindTensor("c");
+  a_tensor->Resize({1, 4});
+  b_tensor->Resize({1, 4});
+  c_tensor->Resize({1, 4});
+  b_tensor->mutable_data<float>(place);
+  a_tensor->mutable_data<float>(place);
+  float a_arr[] = {0, 1, 2, 3};
+  float b_arr[] = {0.0, .1, .2, .3};
+  std::copy_n(a_arr, 4, a_tensor->mutable_data<float>(place));
+  std::copy_n(b_arr, 4, b_tensor->mutable_data<float>(place));
+  exe.Run();
+  auto* c_data = c_tensor->mutable_data<float>(place);
+  for (int i = 0; i < 4; i++) {
+    EXPECT_NEAR(c_data[i], 1.1 * i, 1e-3);
+  }
+}
+}  // namespace framework
+}  // namespace paddle
+USE_OP(elementwise_add);
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -13,21 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/parallel_executor.h"
 #include <string>
 #include <tuple>
 #include <vector>
+#include "paddle/fluid/framework/ir/graph_helper.h"
 #include "paddle/fluid/framework/ir/graph.h"
-#include "paddle/fluid/framework/ir/graph_viz_pass.h"
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/nccl_helper.h"
 #endif
 #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -35,80 +33,6 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-std::unique_ptr<ir::Graph> ApplyParallelExecutorPass(
-    const ProgramDesc &main_program, const std::vector<platform::Place> &places,
-    const std::string &loss_var_name,
-    const std::unordered_set<std::string> &param_names,
-    const std::vector<Scope *> &local_scopes, const bool use_cuda,
-#ifdef PADDLE_WITH_CUDA
-    const BuildStrategy &strategy, platform::NCCLContextMap *nccl_ctxs) {
-#else
-    const BuildStrategy &strategy) {
-#endif
-  // Convert the program to graph.
-  std::unique_ptr<ir::Graph> graph(new ir::Graph(main_program));
-  // Apply a graph viz pass to record a graph.
-  if (!strategy.debug_graphviz_path_.empty()) {
-    auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
-    const std::string graph_path = string::Sprintf(
-        "%s%s", strategy.debug_graphviz_path_.c_str(), "_original_graph");
-    viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
-    graph = viz_pass->Apply(std::move(graph));
-  }
-  // Apply op fusion.
-  if (strategy.fuse_elewise_add_act_ops_) {
-    auto fuse_elewise_add_act_pass =
-        ir::PassRegistry::Instance().Get("fuse_elewise_add_act_pass");
-    graph = fuse_elewise_add_act_pass->Apply(std::move(graph));
-    // Apply a graph viz pass to record a graph.
-    if (!strategy.debug_graphviz_path_.empty()) {
-      auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass");
-      const std::string graph_path = string::Sprintf(
-          "%s%s", strategy.debug_graphviz_path_.c_str(), "_fused_graph");
-      viz_pass->Set<std::string>("graph_viz_path", new std::string(graph_path));
-      graph = viz_pass->Apply(std::move(graph));
-    }
-  }
-  // Convert graph to run on multi-devices.
-  auto multi_devices_pass =
-      ir::PassRegistry::Instance().Get("multi_devices_pass");
-  multi_devices_pass->SetNotOwned<const std::vector<platform::Place>>("places",
-                                                                      &places);
-  multi_devices_pass->SetNotOwned<const std::string>("loss_var_name",
-                                                     &loss_var_name);
-  multi_devices_pass->SetNotOwned<const std::unordered_set<std::string>>(
-      "params", &param_names);
-  multi_devices_pass->SetNotOwned<const std::vector<Scope *>>("local_scopes",
-                                                              &local_scopes);
-  multi_devices_pass->SetNotOwned<const BuildStrategy>("strategy", &strategy);
-#ifdef PADDLE_WITH_CUDA
-  platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr;
-  multi_devices_pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
-#endif
-  graph = multi_devices_pass->Apply(std::move(graph));
-  // Apply a graph print pass to record a graph with device info.
-  if (!strategy.debug_graphviz_path_.empty()) {
-    auto multi_devices_print_pass =
-        ir::PassRegistry::Instance().Get("multi_devices_print_pass");
-    multi_devices_print_pass->SetNotOwned<const std::string>(
-        "debug_graphviz_path", &strategy.debug_graphviz_path_);
-    multi_devices_print_pass->Set<details::GraphvizSSAGraphPrinter>(
-        "graph_printer", new details::GraphvizSSAGraphPrinter);
-    graph = multi_devices_print_pass->Apply(std::move(graph));
-  }
-  // Verify that the graph is correct for multi-device executor.
-  auto multi_devices_check_pass =
-      ir::PassRegistry::Instance().Get("multi_devices_check_pass");
-  graph = multi_devices_check_pass->Apply(std::move(graph));
-  return graph;
-}
 class ParallelExecutorPrivate {
 public:
  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
@@ -199,10 +123,9 @@ ParallelExecutor::ParallelExecutor(
 // Step 3. Convert main_program to SSA form and dependency graph. Also, insert
 // ncclOp
 #ifdef PADDLE_WITH_CUDA
-  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
+  std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
      main_program, member_->places_, loss_var_name, params,
-      member_->local_scopes_, member_->use_cuda_, build_strategy,
+      member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get());
-      member_->nccl_ctxs_.get());
  auto max_memory_size = GetEagerDeletionThreshold();
  if (max_memory_size >= 0) {
@@ -228,11 +151,19 @@ ParallelExecutor::ParallelExecutor(
    }
  }
 #else
-  std::unique_ptr<ir::Graph> graph = ApplyParallelExecutorPass(
+  std::unique_ptr<ir::Graph> graph =
-      main_program, member_->places_, loss_var_name, params,
+      build_strategy.Apply(main_program, member_->places_, loss_var_name,
-      member_->local_scopes_, member_->use_cuda_, build_strategy);
+                           params, member_->local_scopes_, member_->use_cuda_);
 #endif
+  if (VLOG_IS_ON(5)) {
+    // If the loss_var_name is given, the number of graph should be only one.
+    if (loss_var_name.size()) {
+      PADDLE_ENFORCE_EQ(ir::GraphNum(*graph), 1,
+                        "The number of graph should be only one");
+    }
+  }
  if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
    member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
        exec_strategy, member_->local_scopes_, places, std::move(graph)));
@@ -319,6 +250,13 @@ void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
 #ifdef PADDLE_WITH_CUDA
  if (!gcs_.empty()) {
    ResetReferenceCount();
+    for (auto &pair : cur_ref_cnts_) {
+      auto &name_map = *(pair.second);
+      for (auto &fetch_name : fetch_tensors) {
+        name_map.erase(fetch_name);
+      }
+      name_map.erase(fetched_var_name);
+    }
  }
 #endif
  auto fetch_data = member_->executor_->Run(fetch_tensors);
@@ -373,12 +311,6 @@ ParallelExecutor::~ParallelExecutor() {
 }  // namespace framework
 }  // namespace paddle
-USE_PASS(fuse_elewise_add_act_pass);
-USE_PASS(graph_viz_pass);
-USE_PASS(multi_devices_pass);
-USE_PASS(multi_devices_check_pass);
-USE_PASS(multi_devices_print_pass);
 #ifdef PADDLE_WITH_CUDA
 USE_PASS(reference_count_pass);
 #endif
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,14 +14,14 @@ limitations under the License. */
 #pragma once
-#include <paddle/fluid/framework/details/build_strategy.h>
 #include <atomic>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
 #include <vector>
+#include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
-#include "paddle/fluid/framework/details/multi_devices_graph_pass.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"

--- a/paddle/fluid/framework/rw_lock.h
+++ b/paddle/fluid/framework/rw_lock.h
@@ -46,6 +46,7 @@ struct RWLock {
 private:
  pthread_rwlock_t lock_;
 };
+// TODO(paddle-dev): Support RWLock for WIN32 for correctness.
 #else
 // https://stackoverflow.com/questions/7125250/making-pthread-rwlock-wrlock-recursive
 // In windows, rw_lock seems like a hack. Use empty object and do nothing.

--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -27,8 +27,11 @@ class SelectedRowsTester : public ::testing::Test {
    selected_rows_.reset(new SelectedRows(rows, height));
    Tensor* value = selected_rows_->mutable_value();
-    value->mutable_data<float>(
+    auto* data = value->mutable_data<float>(
        make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
+    for (int64_t i = 0; i < value->numel(); ++i) {
+      data[i] = static_cast<float>(i);
+    }
  }
 protected:
@@ -60,6 +63,10 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
  ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
  ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
  ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
+  auto* dst_data = dst_tensor.value().data<float>();
+  for (int64_t i = 0; i < dst_tensor.value().numel(); ++i) {
+    ASSERT_EQ(dst_data[i], static_cast<float>(i));
+  }
 }
 TEST(SelectedRows, SparseTable) {

--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/var_desc.h"

--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
@@ -87,8 +87,6 @@ class VarDesc {
  void SetDataTypes(
      const std::vector<proto::VarType::Type> &multiple_data_type);
-  void SetCapacity(int64_t capacity);
  proto::VarType::Type GetDataType() const;
  std::vector<proto::VarType::Type> GetDataTypes() const;
@@ -110,10 +108,8 @@ class VarDesc {
  void SetPersistable(bool persistable) { desc_.set_persistable(persistable); }
 private:
-  const proto::VarType::ChannelDesc &channel_desc() const;
  const proto::VarType::TensorDesc &tensor_desc() const;
  std::vector<proto::VarType::TensorDesc> tensor_descs() const;
-  proto::VarType::ChannelDesc *mutable_channel_desc();
  proto::VarType::TensorDesc *mutable_tensor_desc();
  std::vector<proto::VarType::TensorDesc *> mutable_tensor_descs();

--- a/paddle/fluid/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -41,8 +40,6 @@ inline proto::VarType::Type ToVarType(std::type_index type) {
    return proto::VarType_Type_SELECTED_ROWS;
  } else if (IsType<ReaderHolder>(type)) {
    return proto::VarType_Type_READER;
-  } else if (IsType<ChannelHolder>(type)) {
-    return proto::VarType_Type_CHANNEL;
  } else {
    PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
  }
@@ -66,9 +63,6 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
    case proto::VarType_Type_READER:
      visitor(var.Get<ReaderHolder>());
      return;
-    case proto::VarType_Type_CHANNEL:
-      visitor(var.Get<ChannelHolder>());
-      return;
    default:
      PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
  }

--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -66,6 +66,7 @@ class Analyzer : public OrderedRegistry<PassManager> {
      // Manual update the passes here.
      "infer_clean_graph_pass",       //
      "attention_lstm_fuse_pass",     //
+      "embedding_fc_lstm_fuse_pass",  //
      "fc_lstm_fuse_pass",            //
      "mul_lstm_fuse_pass",           //
      "fc_gru_fuse_pass",             //

--- a/paddle/fluid/inference/analysis/analyzer_tester.cc
+++ b/paddle/fluid/inference/analysis/analyzer_tester.cc
--- a/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_to_fluid_pass.cc
--- a/paddle/fluid/inference/analysis/subgraph_splitter.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.cc
--- a/paddle/fluid/inference/analysis/subgraph_splitter.h
+++ b/paddle/fluid/inference/analysis/subgraph_splitter.h
--- a/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
+++ b/paddle/fluid/inference/analysis/subgraph_splitter_tester.cc
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.cc
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass.h
--- a/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
+++ b/paddle/fluid/inference/analysis/tensorrt_subgraph_pass_tester.cc
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
--- a/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
+++ b/paddle/fluid/inference/api/api_tensorrt_subgraph_engine.cc
--- a/paddle/fluid/inference/api/demo_ci/run.sh
+++ b/paddle/fluid/inference/api/demo_ci/run.sh
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
--- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
--- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lac_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
--- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc
--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
--- a/paddle/fluid/inference/tests/book/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/book/CMakeLists.txt
--- a/paddle/fluid/memory/malloc.cc
+++ b/paddle/fluid/memory/malloc.cc
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
--- a/paddle/fluid/operators/auc_op.cc
+++ b/paddle/fluid/operators/auc_op.cc
--- a/paddle/fluid/operators/auc_op.h
+++ b/paddle/fluid/operators/auc_op.h
--- a/paddle/fluid/operators/channel_close_op.cc
+++ b/paddle/fluid/operators/channel_close_op.cc
--- a/paddle/fluid/operators/channel_create_op.cc
+++ b/paddle/fluid/operators/channel_create_op.cc
--- a/paddle/fluid/operators/channel_recv_op.cc
+++ b/paddle/fluid/operators/channel_recv_op.cc
--- a/paddle/fluid/operators/channel_send_op.cc
+++ b/paddle/fluid/operators/channel_send_op.cc
--- a/paddle/fluid/operators/concurrency/CMakeLists.txt
+++ b/paddle/fluid/operators/concurrency/CMakeLists.txt
--- a/paddle/fluid/operators/concurrency/channel_util.cc
+++ b/paddle/fluid/operators/concurrency/channel_util.cc
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
--- a/paddle/fluid/operators/conv_transpose_op.h
+++ b/paddle/fluid/operators/conv_transpose_op.h
--- a/paddle/fluid/operators/cub_reduce.h
+++ b/paddle/fluid/operators/cub_reduce.h
--- a/paddle/fluid/operators/detection/CMakeLists.txt
+++ b/paddle/fluid/operators/detection/CMakeLists.txt
--- a/paddle/fluid/operators/detection/generate_proposals_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cc
--- a/paddle/fluid/operators/detection/generate_proposals_op.cu
+++ b/paddle/fluid/operators/detection/generate_proposals_op.cu
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
--- a/paddle/fluid/operators/detection_map_op.h
+++ b/paddle/fluid/operators/detection_map_op.h
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
--- a/paddle/fluid/operators/extract_rows_op.cc
+++ b/paddle/fluid/operators/extract_rows_op.cc
--- a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
--- a/paddle/fluid/operators/concurrency/channel_util.h
+++ b/paddle/fluid/operators/concurrency/channel_util.h
--- a/paddle/fluid/operators/fusion_gru_op.cc
+++ b/paddle/fluid/operators/fusion_gru_op.cc
--- a/paddle/fluid/operators/fusion_lstm_op.cc
+++ b/paddle/fluid/operators/fusion_lstm_op.cc
--- a/paddle/fluid/operators/lookup_table_op.cu
+++ b/paddle/fluid/operators/lookup_table_op.cu
--- a/paddle/fluid/operators/math/cpu_lstm_compute.cc
+++ b/paddle/fluid/operators/math/cpu_lstm_compute.cc
--- a/paddle/fluid/operators/math/cpu_lstm_compute.h
+++ b/paddle/fluid/operators/math/cpu_lstm_compute.h
--- a/paddle/fluid/operators/math/depthwise_conv.cu
+++ b/paddle/fluid/operators/math/depthwise_conv.cu
--- a/paddle/fluid/operators/math/depthwise_conv.h
+++ b/paddle/fluid/operators/math/depthwise_conv.h
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
--- a/paddle/fluid/operators/math/math_function.h
+++ b/paddle/fluid/operators/math/math_function.h
--- a/paddle/fluid/operators/math/selected_rows_functor.cu
+++ b/paddle/fluid/operators/math/selected_rows_functor.cu
--- a/paddle/fluid/operators/pool_mkldnn_op.cc
+++ b/paddle/fluid/operators/pool_mkldnn_op.cc
--- a/paddle/fluid/operators/reduce_mean_op.cu
+++ b/paddle/fluid/operators/reduce_mean_op.cu
--- a/paddle/fluid/operators/reduce_sum_op.cu
+++ b/paddle/fluid/operators/reduce_sum_op.cu
--- a/paddle/fluid/operators/sampling_id_op.cc
+++ b/paddle/fluid/operators/sampling_id_op.cc
--- a/paddle/fluid/operators/scale_op.cc
+++ b/paddle/fluid/operators/scale_op.cc
--- a/paddle/fluid/operators/select_op.cc
+++ b/paddle/fluid/operators/select_op.cc
--- a/paddle/fluid/operators/sequence_erase_op.cc
+++ b/paddle/fluid/operators/sequence_erase_op.cc
--- a/paddle/fluid/operators/sequence_slice_op.h
+++ b/paddle/fluid/operators/sequence_slice_op.h
--- a/paddle/fluid/operators/sgd_op.cu
+++ b/paddle/fluid/operators/sgd_op.cu
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
--- a/paddle/fluid/operators/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op.cc
--- a/paddle/fluid/operators/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt_engine_op.h
--- a/paddle/fluid/operators/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt_engine_op_test.cc
--- a/paddle/fluid/operators/top_k_op.cu
+++ b/paddle/fluid/operators/top_k_op.cu
--- a/paddle/fluid/operators/while_op.cc
+++ b/paddle/fluid/operators/while_op.cc
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
--- a/paddle/fluid/platform/dynload/curand.h
+++ b/paddle/fluid/platform/dynload/curand.h
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
--- a/paddle/fluid/string/pretty_log.h
+++ b/paddle/fluid/string/pretty_log.h
--- a/paddle/fluid/train/CMakeLists.txt
+++ b/paddle/fluid/train/CMakeLists.txt
--- a/paddle/fluid/train/test_train_recognize_digits.cc
+++ b/paddle/fluid/train/test_train_recognize_digits.cc
--- a/paddle/legacy/trainer/tests/CMakeLists.txt
+++ b/paddle/legacy/trainer/tests/CMakeLists.txt
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
--- a/python/paddle/fluid/contrib/op_frequence.py
+++ b/python/paddle/fluid/contrib/op_frequence.py
--- a/python/paddle/fluid/contrib/quantize/__init__.py
+++ b/python/paddle/fluid/contrib/quantize/__init__.py
--- a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py
--- a/python/paddle/fluid/contrib/tests/CMakeLists.txt
+++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
--- a/python/paddle/fluid/tests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/CMakeLists.txt
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
--- a/python/paddle/fluid/tests/no_test_concurrency.py
+++ b/python/paddle/fluid/tests/no_test_concurrency.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/dist_ctr.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr.py
--- a/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
+++ b/python/paddle/fluid/tests/unittests/dist_ctr_reader.py
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
--- a/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/dist_text_classification.py
+++ b/python/paddle/fluid/tests/unittests/dist_text_classification.py
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
--- a/python/paddle/fluid/tests/unittests/dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
--- a/python/paddle/fluid/tests/notest_concurrency.py
+++ b/python/paddle/fluid/tests/notest_concurrency.py
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
--- a/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_simnet_bow.py
--- a/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_text_classification.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
--- a/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
+++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_op.py
--- a/python/paddle/fluid/tests/unittests/test_infer_shape.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_pass_builder.py
+++ b/python/paddle/fluid/tests/unittests/test_pass_builder.py
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
--- a/python/paddle/fluid/transpiler/__init__.py
+++ b/python/paddle/fluid/transpiler/__init__.py
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
--- a/python/setup.py.in
+++ b/python/setup.py.in