Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/add_prefech_hs

50fce879 · JiabinYang · c2e851f7 · 30aad884 · 50fce879 · 50fce879
80 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,7 +54,7 @@ option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
 option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
 option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
 option(WITH_TIMER       "Compile PaddlePaddle with stats timer"         OFF)
-option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler"        OFF)
+option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler and gperftools"        OFF)
 option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
 option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
@@ -254,6 +254,12 @@ elseif()
    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
 endif()
+if (WITH_PROFILER)
+    find_package(Gperftools REQUIRED)
+    include_directories(${GPERFTOOLS_INCLUDE_DIR})
+    add_definitions(-DWITH_GPERFTOOLS)
+endif()
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(ccache)             # set ccache for compilation

--- a/cmake/FindGperftools.cmake
+++ b/cmake/FindGperftools.cmake
+# Tries to find Gperftools.
+#
+# Usage of this module as follows:
+#
+#     find_package(Gperftools)
+#
+# Variables used by this module, they can change the default behaviour and need
+# to be set before calling find_package:
+#
+#  Gperftools_ROOT_DIR  Set this variable to the root installation of
+#                       Gperftools if the module has problems finding
+#                       the proper installation path.
+#
+# Variables defined by this module:
+#
+#  GPERFTOOLS_FOUND              System has Gperftools libs/headers
+#  GPERFTOOLS_LIBRARIES          The Gperftools libraries (tcmalloc & profiler)
+#  GPERFTOOLS_INCLUDE_DIR        The location of Gperftools headers
+find_library(GPERFTOOLS_TCMALLOC
+  NAMES tcmalloc
+  HINTS ${Gperftools_ROOT_DIR}/lib)
+find_library(GPERFTOOLS_PROFILER
+  NAMES profiler
+  HINTS ${Gperftools_ROOT_DIR}/lib)
+find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER
+  NAMES tcmalloc_and_profiler
+  HINTS ${Gperftools_ROOT_DIR}/lib)
+find_path(GPERFTOOLS_INCLUDE_DIR
+  NAMES gperftools/heap-profiler.h
+  HINTS ${Gperftools_ROOT_DIR}/include)
+set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER})
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(
+  Gperftools
+  DEFAULT_MSG
+  GPERFTOOLS_LIBRARIES
+  GPERFTOOLS_INCLUDE_DIR)
+mark_as_advanced(
+  Gperftools_ROOT_DIR
+  GPERFTOOLS_TCMALLOC
+  GPERFTOOLS_PROFILER
+  GPERFTOOLS_TCMALLOC_AND_PROFILER
+  GPERFTOOLS_LIBRARIES
+  GPERFTOOLS_INCLUDE_DIR)
+# create IMPORTED targets
+if (Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc)
+  add_library(gperftools::tcmalloc UNKNOWN IMPORTED)
+  set_target_properties(gperftools::tcmalloc PROPERTIES
+    IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC}
+    INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
+  add_library(gperftools::profiler UNKNOWN IMPORTED)
+  set_target_properties(gperftools::profiler PROPERTIES
+    IMPORTED_LOCATION ${GPERFTOOLS_PROFILER}
+    INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
+endif()
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -86,6 +86,7 @@ endif(NOT WITH_GOLANG)
 if(WITH_GPU)
    add_definitions(-DPADDLE_WITH_CUDA)
+    add_definitions(-DEIGEN_USE_GPU)
    FIND_PACKAGE(CUDA REQUIRED)

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -110,6 +110,14 @@ function(find_fluid_modules TARGET_NAME)
  endif()
 endfunction(find_fluid_modules)
+function(common_link TARGET_NAME)
+  if (WITH_PROFILER)
+    target_link_libraries(${TARGET_NAME} gperftools::profiler)
+  endif()
+endfunction()
 # find all third_party modules is used for paddle static library
 # for reduce the dependency when building the inference libs.
 set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)
@@ -274,6 +282,7 @@ function(cc_library TARGET_NAME)
      endif()
      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+      common_link(${TARGET_NAME})
    endif()
    # cpplint code style
@@ -340,6 +349,7 @@ function(cc_binary TARGET_NAME)
  if(cc_binary_DEPS)
    target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
    add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
+    common_link(${TARGET_NAME})
  endif()
 endfunction(cc_binary)
@@ -362,6 +372,7 @@ function(cc_test TARGET_NAME)
      target_link_libraries(${TARGET_NAME} ${win32_deps})
    endif(WIN32)
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
@@ -420,6 +431,7 @@ function(nv_binary TARGET_NAME)
    if(nv_binary_DEPS)
      target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
      add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
+      common_link(${TARGET_NAME})
    endif()
  endif()
 endfunction(nv_binary)
@@ -433,6 +445,7 @@ function(nv_test TARGET_NAME)
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
    add_test(${TARGET_NAME} ${TARGET_NAME})
    if (nv_test_SERIAL)
        set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)
@@ -499,6 +512,7 @@ function(hip_binary TARGET_NAME)
    if(hip_binary_DEPS)
      target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
      add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+      common_link(${TARGET_NAME})
    endif()
  endif()
 endfunction(hip_binary)
@@ -518,6 +532,7 @@ function(hip_test TARGET_NAME)
    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    common_link(${TARGET_NAME})
    add_test(${TARGET_NAME} ${TARGET_NAME})
  endif()
 endfunction(hip_test)
@@ -560,6 +575,7 @@ function(go_library TARGET_NAME)
  endif()
  if(go_library_DEPS)
    add_dependencies(${TARGET_NAME} ${go_library_DEPS})
+    common_link(${TARGET_NAME})
  endif(go_library_DEPS)
  # The "source file" of the library is `${dummyfile}` which never

--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -129,11 +129,13 @@ cc_test(version_test SRCS version_test.cc DEPS version)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
-if(NOT WIN32)
+if(WITH_NGRAPH)
-cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
+  if(NOT WIN32)
-cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
+    cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
-  shape_inference data_transform lod_tensor profiler)
+    cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
-endif(NOT WIN32)
+      shape_inference data_transform lod_tensor profiler ngraph)
+  endif(NOT WIN32)
+endif(WITH_NGRAPH)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
@@ -169,11 +171,15 @@ if(WITH_DISTRIBUTE)
  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
-  if(NOT WIN32)
+  if(WITH_NGRAPH)
-    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph_operator variable_helper)
+    if(NOT WIN32)
-  else(NOT WIN32)
+      cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass ngraph ngraph_operator variable_helper)
+    else(NOT WIN32)
+      cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
+    endif(NOT WIN32)
+  else(WITH_NGRAPH)
    cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper)
-  endif(NOT WIN32)
+  endif(WITH_NGRAPH)
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()

--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/ngraph_operator.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/transfer_scope_cache.h"
@@ -26,6 +25,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
+#ifdef PADDLE_WITH_NGRAPH
+#include "paddle/fluid/framework/ngraph_operator.h"
+#endif
 DECLARE_bool(benchmark);
 DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run");
 DEFINE_bool(use_ngraph, false, "Use NGRAPH to run");
@@ -88,11 +91,11 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op,
 static void EnableFusedOp(ExecutorPrepareContext* ctx) {
 #ifdef PADDLE_WITH_NGRAPH
  VLOG(3) << "use_ngraph=True";
-  auto intervals = FusedOperator::FusedOpIntervals(&ctx->ops_);
+  auto intervals = NgraphOperator::NgraphOpIntervals(&ctx->ops_);
  for (auto& interval : intervals) {
-    auto* fused_op = new FusedOperator(ctx->prog_, ctx->block_id_,
+    auto* ng_op = new NgraphOperator(ctx->prog_, ctx->block_id_, interval.at(0),
-                                       interval.at(0), interval.at(1));
+                                     interval.at(1));
-    *interval[0] = std::unique_ptr<OperatorBase>(fused_op);
+    *interval[0] = std::unique_ptr<OperatorBase>(ng_op);
  }
  for (auto it = intervals.rbegin(); it != intervals.rend(); ++it) {
    ctx->ops_.erase(it->at(0) + 1, it->at(1));

--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef PADDLE_WITH_NGRAPH
 #include <algorithm>
 #include <functional>
 #include <vector>
@@ -27,14 +26,15 @@ namespace paddle {
 namespace framework {
 static std::shared_ptr<ngraph::Node> GetNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
    const VariableNameMap& var_map,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
-  auto& var_names = var_map.at(prm);
+  auto& var_names = var_map.at(name);
  PADDLE_ENFORCE_EQ(var_names.size(), 1,
-                    "op %s prm %s expects one associated var", op->Type(), prm);
+                    "op %s name %s expects one associated var", op->Type(),
+                    name);
  if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
    return (*ngb_node_map)[var_names[0]];
  } else {
@@ -43,42 +43,42 @@ static std::shared_ptr<ngraph::Node> GetNode(
 }
 static std::shared_ptr<ngraph::Node> GetInputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
-  return GetNode(op, prm, op->Inputs(), ngb_node_map);
+  return GetNode(op, name, op->Inputs(), ngb_node_map);
 }
 static std::shared_ptr<ngraph::Node> GetOutputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
-  return GetNode(op, prm, op->Outputs(), ngb_node_map);
+  return GetNode(op, name, op->Outputs(), ngb_node_map);
 }
 static void SetOutputNode(
-    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const std::shared_ptr<OperatorBase>& op, const std::string name,
    std::shared_ptr<ngraph::Node> node,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
-  auto& var_names = op->Outputs().at(prm);
+  auto& var_names = op->Outputs().at(name);
  if (var_names.size() == 1) {
    (*ngb_node_map)[var_names[0]] = node;
  } else if (var_names.size() == 0) {
    (*ngb_node_map)[""] = node;
  } else {
-    PADDLE_THROW("prm %s has more than 1 var_names.", prm);
+    PADDLE_THROW("name %s has more than 1 var_names.", name);
  }
 }
 static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
-                      const std::string prm) {
+                      const std::string name) {
  auto& outputs = op->Outputs();
-  if (outputs.find(prm) == outputs.end()) return false;
+  if (outputs.find(name) == outputs.end()) return false;
-  return outputs.at(prm).size() > 0;
+  return outputs.at(name).size() > 0;
 }
 template <typename T>
@@ -118,4 +118,3 @@ void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
 }  // namespace framework
 }  // namespace paddle
-#endif
--- a/paddle/fluid/framework/ngraph_bridge.h
+++ b/paddle/fluid/framework/ngraph_bridge.h
@@ -14,8 +14,6 @@ limitations under the License. */
 #pragma once
-#ifdef PADDLE_WITH_NGRAPH
 #include <algorithm>
 #include <map>
 #include <string>
@@ -53,4 +51,3 @@ class NgraphBridge {
 }  // namespace framework
 }  // namespace paddle
-#endif
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifdef PADDLE_WITH_NGRAPH
 #include <glog/logging.h>
 #include <algorithm>
@@ -58,16 +57,16 @@ typedef enum {                /* nGraph support state on ops          */
 } op_state;
 // perform graph build through bridge and execute computation
-class NgraphOperator {
+class NgraphEngine {
 public:
-  explicit NgraphOperator(const Scope& scope, const platform::Place& place,
+  explicit NgraphEngine(const Scope& scope, const platform::Place& place,
-                          const std::vector<std::shared_ptr<OperatorBase>>& ops,
+                        const std::vector<std::shared_ptr<OperatorBase>>& ops,
-                          const std::unordered_map<
+                        const std::unordered_map<
-                              std::string, ngraph::element::Type>& var_type_map,
+                            std::string, ngraph::element::Type>& var_type_map,
-                          const std::unordered_set<std::string>& persist,
+                        const std::unordered_set<std::string>& persist,
-                          const std::unordered_set<std::string>& fetches,
+                        const std::unordered_set<std::string>& fetches,
-                          const std::unordered_set<std::string>& post_op_inputs,
+                        const std::unordered_set<std::string>& post_op_inputs,
-                          op_state ng_op_state)
+                        op_state ng_op_state)
      : scope_(scope),
        place_(place),
        fused_ops_(ops),
@@ -132,7 +131,7 @@ class NgraphOperator {
 };
 std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
-FusedOperator::FusedOpIntervals(
+NgraphOperator::NgraphOpIntervals(
    std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops) {
  std::vector<std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
      intervals;
@@ -185,7 +184,7 @@ FusedOperator::FusedOpIntervals(
  return intervals;
 }
-FusedOperator::FusedOperator(
+NgraphOperator::NgraphOperator(
    const ProgramDesc& prog, size_t block_id,
    std::vector<std::unique_ptr<OperatorBase>>::iterator start,
    std::vector<std::unique_ptr<OperatorBase>>::iterator end,
@@ -215,7 +214,7 @@ FusedOperator::FusedOperator(
  Process();
 }
-void FusedOperator::Process() {
+void NgraphOperator::Process() {
  auto& bdesc = pdesc_.Block(block_);
  for (auto& var : bdesc.AllVars()) {
    if (!(var->GetType() == proto::VarType::SELECTED_ROWS ||
@@ -251,8 +250,8 @@ void FusedOperator::Process() {
  }
 }
-void FusedOperator::RunImpl(const Scope& scope,
+void NgraphOperator::RunImpl(const Scope& scope,
-                            const platform::Place& place) const {
+                             const platform::Place& place) const {
  op_state ng_op_state = PARTIAL_TEST;
  auto& bdesc = pdesc_.Block(block_);
  for (auto* op : bdesc.AllOps()) {
@@ -266,19 +265,19 @@ void FusedOperator::RunImpl(const Scope& scope,
    ng_op_state = ng_op_state == PARTIAL_TEST ? FULL_TEST : FULL_TRAIN;
  }
-  NgraphOperator ngraph_op(scope, place, fused_ops_, var_type_map_,
+  NgraphEngine ngraph_engine(scope, place, fused_ops_, var_type_map_,
-                           persistables_, fetches_, post_op_inputs_,
+                             persistables_, fetches_, post_op_inputs_,
-                           ng_op_state);
+                             ng_op_state);
-  ngraph_op.Run(scope, place);
+  ngraph_engine.Run(scope, place);
 }
 std::unordered_map<std::string, std::shared_ptr<ngraph::Function>>
-    NgraphOperator::func_cache_ = {};
+    NgraphEngine::func_cache_ = {};
-std::shared_ptr<ngraph::runtime::Backend> NgraphOperator::backend_ =
+std::shared_ptr<ngraph::runtime::Backend> NgraphEngine::backend_ =
    ngraph::runtime::Backend::create("CPU");
-void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
+void NgraphEngine::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
  op->RuntimeInferShape(scope_, place_);
  for (auto& var_name_item : op->Inputs()) {
    for (auto& var_name : var_name_item.second) {
@@ -301,7 +300,7 @@ void NgraphOperator::GetNgInputShape(std::shared_ptr<OperatorBase> op) {
  }
 }
-void NgraphOperator::BuildNgNodes() {
+void NgraphEngine::BuildNgNodes() {
  for (auto& var_name : var_out_) {
    if (var_node_map_->find(var_name) == var_node_map_->end()) {
      auto* var = scope_.FindVar(var_name);
@@ -323,7 +322,7 @@ void NgraphOperator::BuildNgNodes() {
  }
 }
-void NgraphOperator::BuildNgIO() {
+void NgraphEngine::BuildNgIO() {
  std::unordered_set<std::string> inputs;
  std::unordered_set<std::string> outputs;
@@ -395,7 +394,7 @@ void NgraphOperator::BuildNgIO() {
  }
 }
-void NgraphOperator::BuildNgFunction() {
+void NgraphEngine::BuildNgFunction() {
  BuildNgNodes();
  ngraph_function_ = nullptr;
  ngraph::NodeVector func_outputs;
@@ -416,7 +415,7 @@ void NgraphOperator::BuildNgFunction() {
      std::make_shared<ngraph::Function>(func_outputs, func_inputs);
 }
-std::shared_ptr<std::string> NgraphOperator::GetCacheKey() {
+std::shared_ptr<std::string> NgraphEngine::GetCacheKey() {
  auto cache_key = std::make_shared<std::string>("");
  *cache_key += std::to_string(fused_ops_.size());
  for (auto& op : fused_ops_) {
@@ -444,7 +443,7 @@ std::shared_ptr<std::string> NgraphOperator::GetCacheKey() {
  return cache_key;
 }
-void NgraphOperator::GetNgFunction() {
+void NgraphEngine::GetNgFunction() {
  bool cache_on = true;
  if (cache_on) {
    std::string cache_key_val = *GetCacheKey();
@@ -459,8 +458,7 @@ void NgraphOperator::GetNgFunction() {
  }
 }
-void NgraphOperator::Run(const Scope& scope,
+void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const {
-                         const platform::Place& place) const {
  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_in;
  std::vector<std::shared_ptr<ngraph::runtime::Tensor>> t_out;
@@ -545,7 +543,6 @@ void NgraphOperator::Run(const Scope& scope,
  }
  backend_->call(ngraph_function_, t_out, t_in);
-}  // NgraphOperator::RunImpl
+}  // NgraphEngine::RunImpl
 }  // namespace framework
 }  // namespace paddle
-#endif
--- a/paddle/fluid/framework/ngraph_operator.h
+++ b/paddle/fluid/framework/ngraph_operator.h
@@ -14,8 +14,6 @@ limitations under the License. */
 #pragma once
-#ifdef PADDLE_WITH_NGRAPH
 #include <algorithm>
 #include <string>
 #include <unordered_map>
@@ -34,14 +32,14 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-class FusedOperator : public OperatorBase {
+class NgraphOperator : public OperatorBase {
 public:
  static std::vector<
      std::vector<std::vector<std::unique_ptr<OperatorBase>>::iterator>>
-  FusedOpIntervals(
+  NgraphOpIntervals(
      std::vector<std::unique_ptr<paddle::framework::OperatorBase>>* ops);
-  explicit FusedOperator(
+  explicit NgraphOperator(
      const ProgramDesc& prog, size_t block_id,
      std::vector<std::unique_ptr<OperatorBase>>::iterator start,
      std::vector<std::unique_ptr<OperatorBase>>::iterator end,
@@ -64,4 +62,3 @@ class FusedOperator : public OperatorBase {
 };
 }  // namespace framework
 }  // namespace paddle
-#endif
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -319,7 +319,7 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 #define USE_OP(op_type)   \
  USE_OP_ITSELF(op_type); \
  USE_OP_KERNEL(op_type)
-// clang-format off
+// clang-format on
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -30,13 +30,36 @@ limitations under the License. */
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 #include "paddle/fluid/platform/profiler.h"
+#ifdef WITH_GPERFTOOLS
+#include "gperftools/profiler.h"
+#endif
+DEFINE_string(pe_profile_fname, "",
+              "Profiler filename for PE, which generated by gperftools."
+              "Only valid when compiled `WITH_PRIFILER=ON`. Empty if disable.");
 namespace paddle {
 namespace framework {
+static std::once_flag gProfileOnce;
+#ifdef WITH_GPERFTOOLS
+static bool gProfileStarted = false;
+#endif
 class ParallelExecutorPrivate {
 public:
  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
-      : places_(places) {}
+      : places_(places) {
+    if (!FLAGS_pe_profile_fname.empty()) {
+      std::call_once(gProfileOnce, [] {
+#ifdef WITH_GPERFTOOLS
+        ProfilerStart(FLAGS_pe_profile_fname.c_str());
+        gProfileStarted = true;
+#else
+        LOG(WARNING) << "Paddle is not compiled with gperftools. "
+                        "FLAGS_pe_profile_fname will be ignored";
+#endif
+      });
+    }
+  }
  ~ParallelExecutorPrivate() {
    if (own_local_scope_) {
@@ -270,6 +293,12 @@ void ParallelExecutor::BCastParamsToDevices(
 void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                           const std::string &fetched_var_name) {
+#ifdef WITH_GPERFTOOLS
+  if (gProfileStarted) {
+    ProfilerFlush();
+  }
+#endif
  platform::RecordBlock b(0);
 #ifdef PADDLE_WITH_CUDA
  if (!gcs_.empty()) {

--- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc
@@ -44,9 +44,10 @@ void IrGraphBuildPass::RunImpl(Argument *argument) {
    argument->SetMainProgram(program.release());
  } else if (argument->model_program_path_valid() &&
             argument->model_params_path_valid()) {
-    auto program =
+    auto program = LoadModel(
-        LoadModel(argument->model_program_path(), argument->model_params_path(),
+        argument->model_program_path(), argument->model_params_path(),
-                  argument->scope_ptr(), place, argument->model_from_memory());
+        argument->scope_ptr(), place,
+        argument->model_from_memory_valid() && argument->model_from_memory());
    argument->SetMainProgram(program.release());
  } else {
    PADDLE_THROW(

--- a/paddle/fluid/inference/tests/api/CMakeLists.txt
+++ b/paddle/fluid/inference/tests/api/CMakeLists.txt
-set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor)
+set(INFERENCE_EXTRA_DEPS paddle_inference_api paddle_fluid_api ir_pass_manager analysis_predictor benchmark)
 if(WITH_GPU AND TENSORRT_FOUND)
    set(INFERENCE_EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} analysis ${analysis_deps} ir_pass_manager analysis_predictor)

--- a/paddle/fluid/inference/tests/api/tester_helper.h
+++ b/paddle/fluid/inference/tests/api/tester_helper.h
@@ -30,8 +30,10 @@
 #include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/tests/api/config_printer.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
+#include "paddle/fluid/inference/utils/benchmark.h"
 #include "paddle/fluid/platform/profiler.h"
+DEFINE_string(model_name, "", "model name");
 DEFINE_string(infer_model, "", "model path");
 DEFINE_string(infer_data, "", "data file");
 DEFINE_int32(batch_size, 1, "batch size.");
@@ -40,6 +42,8 @@ DEFINE_bool(test_all_data, false, "Test the all dataset in data file.");
 DEFINE_int32(num_threads, 1, "Running the inference program in multi-threads.");
 DEFINE_bool(use_analysis, true,
            "Running the inference program in analysis mode.");
+DEFINE_bool(record_benchmark, false,
+            "Record benchmark after profiling the model");
 DECLARE_bool(profile);
 DECLARE_int32(paddle_num_threads);
@@ -192,8 +196,16 @@ void TestOneThreadPrediction(
        predictor->Run(inputs[j], outputs, batch_size);
      }
    }
-    PrintTime(batch_size, num_times, 1, 0, run_timer.toc() / num_times,
-              inputs.size());
+    double latency = run_timer.toc() / num_times;
+    PrintTime(batch_size, num_times, 1, 0, latency, inputs.size());
+    if (FLAGS_record_benchmark) {
+      Benchmark benchmark;
+      benchmark.SetName(FLAGS_model_name);
+      benchmark.SetBatchSize(batch_size);
+      benchmark.SetLatency(latency);
+      benchmark.PersistToFile("benchmark_record.txt");
+    }
  }
 }

--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -135,6 +135,9 @@ TEST(TensorRT_resnext50, compare) {
 TEST(TensorRT_resnext50, profile) {
  std::string model_dir = FLAGS_infer_model + "/resnext50";
+  // Set FLAGS_record_benchmark to true to record benchmark to file.
+  // FLAGS_record_benchmark=true;
+  FLAGS_model_name = "resnext50";
  profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt);
 }

--- a/paddle/fluid/inference/utils/benchmark.cc
+++ b/paddle/fluid/inference/utils/benchmark.cc
@@ -30,7 +30,7 @@ std::string Benchmark::SerializeToString() const {
  ss << '\n';
  ss << name_ << "\t";
-  ss << batch_size_ << "\t";
+  ss << batch_size_ << "\t\t";
  ss << num_threads_ << "\t";
  ss << latency_ << "\t";
  ss << 1000.0 / latency_;

--- a/paddle/fluid/inference/utils/visualizer.cc
+++ b/paddle/fluid/inference/utils/visualizer.cc
@@ -26,9 +26,6 @@ DEFINE_string(model_dir, "", "model directory");
 DEFINE_string(model_program_path, "", "model program path");
 DEFINE_string(model_params_path, "", "model params path");
-USE_PASS(graph_viz_pass);
-USE_PASS(graph_to_program_pass);
 using paddle::inference::analysis::Argument;
 namespace paddle {
@@ -40,7 +37,6 @@ void Visualizer::SetArgument(Argument *argument) { argument_ = argument; }
 bool Visualizer::Run() {
  paddle::framework::InitDevices(false);
  paddle::inference::analysis::Analyzer().Run(argument_);
  return true;
 }
@@ -77,7 +73,7 @@ int main(int argc, char *argv[]) {
  // Only 1 pass, default filename is 0_ir_origin.dot
  // For more details, looking for paddle::inference::analysis::IRPassManager
-  argument.SetIrAnalysisPasses({"graph_viz_pass"});
+  argument.SetIrAnalysisPasses({"infer_clean_graph_pass", "graph_viz_pass"});
  std::unique_ptr<paddle::framework::Scope> scope{
      new paddle::framework::Scope()};
@@ -90,3 +86,7 @@ int main(int argc, char *argv[]) {
  return 0;
 }
+USE_PASS(infer_clean_graph_pass);
+USE_PASS(graph_viz_pass);
+USE_PASS(graph_to_program_pass);
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -301,23 +301,22 @@ template <typename T>
 struct GeluFunctor : public BaseActivationFunctor<T> {
  template <typename Device, typename X, typename Out>
  void operator()(Device d, X x, Out out) const {
-    auto temp =
+    auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
-        ((x * static_cast<T>(M_SQRT1_2)).erf()).template cast<T>().eval();
    out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
  }
 };
 template <typename T>
 struct GeluGradFunctor : BaseActivationFunctor<T> {
-  bool Inplace() const { return IsInplace("gelu"); }
  template <typename Device, typename X, typename Out, typename dOut,
            typename dX>
  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    auto temp = (static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
+    auto first = static_cast<T>(0.5) *
-                 ((-static_cast<T>(0.5) * x.square()).exp()))
+                 (static_cast<T>(1) + ((x * static_cast<T>(M_SQRT1_2)).erf()));
-                    .template cast<T>()
-                    .eval();
+    auto second = static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2) * x *
-    dx.device(d) = dout * (out / x + temp);
+                  (-static_cast<T>(0.5) * x.square()).exp();
+    dx.device(d) = dout * (first + second);
  }
 };

--- a/paddle/fluid/operators/bilinear_tensor_product_op.cu
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.cu
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/bilinear_tensor_product_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/concat_mkldnn_op.cc
+++ b/paddle/fluid/operators/concat_mkldnn_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <memory>
+#include "paddle/fluid/operators/concat_op.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+namespace paddle {
+namespace operators {
+using framework::DataLayout;
+using framework::Tensor;
+using mkldnn::memory;
+using mkldnn::primitive;
+using mkldnn::concat;
+using mkldnn::stream;
+using platform::to_void_cast;
+static void EnforceLayouts(const std::vector<const Tensor*> inputs) {
+  for (auto* input : inputs) {
+    const bool is_layout_correct = input->layout() == DataLayout::kMKLDNN;
+    const bool is_format_defined =
+        input->format() != memory::format::format_undef;
+    PADDLE_ENFORCE(is_layout_correct && is_format_defined,
+                   "Wrong layout/format set for Input tensor");
+  }
+}
+static memory::primitive_desc CreateMemPrimDesc(const Tensor& input,
+                                                const mkldnn::engine& engine) {
+  constexpr auto data_type = mkldnn::memory::f32;
+  const auto dims = paddle::framework::vectorize2int(input.dims());
+  const auto format = input.format();
+  auto description = memory::desc(dims, data_type, format);
+  auto mem_prim_desc = memory::primitive_desc(description, engine);
+  return mem_prim_desc;
+}
+static mkldnn::memory::format GetDstMemFormat(
+    const concat::primitive_desc& concat_pd) {
+  return (memory::format)concat_pd.dst_primitive_desc().desc().data.format;
+}
+static platform::CPUPlace GetCpuPlace(
+    const paddle::framework::ExecutionContext& ctx) {
+  auto place = ctx.GetPlace();
+  PADDLE_ENFORCE(paddle::platform::is_cpu_place(place),
+                 "It must use CPUPlace.");
+  return boost::get<platform::CPUPlace>(place);
+}
+static const mkldnn::engine& GetMKLDNNEngine(
+    const paddle::framework::ExecutionContext& ctx) {
+  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
+  return dev_ctx.GetEngine();
+}
+template <typename T>
+class ConcatPrimitiveFactory {
+ public:
+  concat::primitive_desc CreateConcatPrimDescriptor(
+      const std::vector<const Tensor*> multi_input, Tensor* output,
+      int concat_axis, const mkldnn::engine& mkldnn_engine) {
+    CreateSourcesDescriptors(multi_input, mkldnn_engine);
+    auto dst_desc = CreateDstMemDescriptor(output);
+    return concat::primitive_desc(dst_desc, concat_axis, srcs_pd);
+  }
+  concat CreateConcatPrimitive(const concat::primitive_desc& concat_pd,
+                               Tensor* output, platform::CPUPlace place) {
+    CreateSourcePrimitiveAts();
+    dst_mem = CreateDstMemory(concat_pd, output, place);
+    return concat(concat_pd, inputs, dst_mem.get());
+  }
+ private:
+  memory::desc CreateDstMemDescriptor(Tensor* output) {
+    auto dst_dims = paddle::framework::vectorize2int(output->dims());
+    return memory::desc(dst_dims, platform::MKLDNNGetDataType<T>(),
+                        memory::format::any);
+  }
+  mkldnn::memory CreateDstMemory(const concat::primitive_desc& concat_pd,
+                                 Tensor* output, platform::CPUPlace place) {
+    return memory(concat_pd.dst_primitive_desc(),
+                  output->mutable_data<T>(place));
+  }
+  void CreateSourcesDescriptors(const std::vector<const Tensor*> multi_input,
+                                const mkldnn::engine& mkldnn_engine) {
+    for (size_t i = 0; i < multi_input.size(); i++) {
+      auto mem_prim_desc = CreateMemPrimDesc(*multi_input[i], mkldnn_engine);
+      srcs_pd.push_back(mem_prim_desc);
+      srcs.push_back(
+          memory(mem_prim_desc, to_void_cast(multi_input[i]->data<T>())));
+    }
+  }
+  void CreateSourcePrimitiveAts() {
+    inputs.reserve(srcs.size());
+    for (size_t i = 0; i < srcs.size(); i++) {
+      inputs.push_back(srcs[i]);
+    }
+  }
+ private:
+  std::vector<memory::primitive_desc> srcs_pd;
+  std::vector<memory> srcs;
+  std::vector<primitive::at> inputs;
+  boost::optional<memory> dst_mem;  // TODO(mgallus): change to std::optional
+};                                  // upon introduction of C++17 to paddle
+template <typename T>
+class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    auto place = GetCpuPlace(ctx);
+    const auto& mkldnn_engine = GetMKLDNNEngine(ctx);
+    auto multi_input = ctx.MultiInput<Tensor>("X");
+    EnforceLayouts(multi_input);
+    Tensor* output = ctx.Output<Tensor>("Out");
+    int64_t concat_axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    ConcatPrimitiveFactory<T> prim_creator;
+    auto concat_pd = prim_creator.CreateConcatPrimDescriptor(
+        multi_input, output, static_cast<int>(concat_axis), mkldnn_engine);
+    auto concat = prim_creator.CreateConcatPrimitive(concat_pd, output, place);
+    stream(stream::kind::eager).submit({concat}).wait();
+    output->set_layout(DataLayout::kMKLDNN);
+    output->set_format(GetDstMemFormat(concat_pd));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace,
+                   ops::ConcatMKLDNNOpKernel<float>)
--- a/paddle/fluid/operators/concat_op.cc
+++ b/paddle/fluid/operators/concat_op.cc
@@ -13,10 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/concat_op.h"
 #include <string>
 #include <vector>
+#ifdef PADDLE_WITH_MKLDNN
+#include <paddle/fluid/platform/mkldnn_helper.h>
+#endif
 namespace paddle {
 namespace operators {
 using framework::Tensor;
@@ -59,6 +62,22 @@ class ConcatOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Out", out_dims);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    auto input_data_type =
+        framework::GetDataTypeOfVar(ctx.MultiInputVar("X")[0]);
+#ifdef PADDLE_WITH_MKLDNN
+    if (platform::CanMKLDNNBeUsed(ctx)) {
+      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                     framework::DataLayout::kMKLDNN,
+                                     framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+  }
 };
 class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -66,6 +85,10 @@ class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
  void Make() override {
    AddInput("X", "Input tensors of concat operator.").AsDuplicable();
    AddOutput("Out", "Output tensor of concat operator.");
+    AddAttr<bool>(
+        "use_mkldnn",
+        "(bool, default false) Indicates if MKL-DNN kernel will be used")
+        .SetDefault(false);
    AddAttr<int>("axis",
                 "The axis along which the input tensors will be concatenated.")
        .SetDefault(0);

--- a/paddle/fluid/operators/cos_sim_op.cu
+++ b/paddle/fluid/operators/cos_sim_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/cos_sim_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/crop_op.cu
+++ b/paddle/fluid/operators/crop_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/crop_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/distributed/brpc_client.cc
+++ b/paddle/fluid/operators/distributed/brpc_client.cc
@@ -158,7 +158,7 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) {
  for (int i = 0; i < FLAGS_brpc_channel_num; ++i) {
    std::shared_ptr<ChannelContext> c(new ChannelContext());
    if (c->channel.Init(ep.c_str(), &options) != 0) {
-      LOG(ERROR) << "Fail to initialize channel";
+      LOG(FATAL) << "Fail to initialize channel";
      return nullptr;
    }

--- a/paddle/fluid/operators/distributed/grpc_client.cc
+++ b/paddle/fluid/operators/distributed/grpc_client.cc
@@ -390,8 +390,7 @@ void GRPCClient::Proceed() {
      VLOG(3) << c->GetVarHandlePtr()->String() << " process";
      c->Process();
    } else if (c->status_.error_code() == grpc::StatusCode::DEADLINE_EXCEEDED) {
-      // FIXME(gongwb): parse error_details?
+      LOG(FATAL) << c->GetVarHandlePtr()->String()
-      LOG(ERROR) << c->GetVarHandlePtr()->String()
                 << " meets grpc error, error_code:" << c->status_.error_code()
                 << " error_message:" << c->status_.error_message()
                 << " error_details:" << c->status_.error_details();

--- a/paddle/fluid/operators/dropout_op.cu
+++ b/paddle/fluid/operators/dropout_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include <thrust/device_ptr.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/random.h>

--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
 #include "paddle/fluid/platform/float16.h"

--- a/paddle/fluid/operators/elementwise/elementwise_div_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/elementwise/elementwise_max_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_max_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/elementwise/elementwise_min_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cu
@@ -8,8 +8,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_pow_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/elementwise/elementwise_sub_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/expand_op.cu
+++ b/paddle/fluid/operators/expand_op.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/expand_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/gru_unit_op.cu
+++ b/paddle/fluid/operators/gru_unit_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/gru_unit_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/hinge_loss_op.cu
+++ b/paddle/fluid/operators/hinge_loss_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/hinge_loss_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/huber_loss_op.cu
+++ b/paddle/fluid/operators/huber_loss_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/huber_loss_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/im2sequence_op.cu
+++ b/paddle/fluid/operators/im2sequence_op.cu
@@ -11,8 +11,6 @@
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/im2sequence_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/isfinite_op.cu
+++ b/paddle/fluid/operators/isfinite_op.cu
@@ -11,8 +11,6 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/isfinite_op.h"
 #include "paddle/fluid/platform/float16.h"

--- a/paddle/fluid/operators/l1_norm_op.cu
+++ b/paddle/fluid/operators/l1_norm_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/l1_norm_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/log_loss_op.cu
+++ b/paddle/fluid/operators/log_loss_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/log_loss_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/math/context_project.cu
+++ b/paddle/fluid/operators/math/context_project.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/math/context_project.h"
 namespace paddle {

--- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
+++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc
@@ -79,16 +79,16 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
  }
 };
-#define INTRIAVX_FLOAT(isa, block)                                             \
+#define INTRIAVX_FLOAT(isa, jit_block)                                         \
  template <>                                                                  \
-  LayerNormKernelImpl<float, isa, block>::LayerNormKernelImpl(int right)       \
+  LayerNormKernelImpl<float, isa, jit_block>::LayerNormKernelImpl(int right)   \
      : LayerNormKernel<float>() {                                             \
    this->num_ = right;                                                        \
    this->rest_ = this->num_ % YMM_FLOAT_BLOCK;                                \
    this->end_ = this->num_ - this->rest_;                                     \
  }                                                                            \
  template <>                                                                  \
-  void LayerNormKernelImpl<float, platform::avx, block>::Compute(              \
+  void LayerNormKernelImpl<float, isa, jit_block>::Compute(                    \
      float* x, float* out, float* mean, float* var, const float* scale,       \
      const float* bias, int height, const float epsilon) const {              \
    __m256 sum;                                                                \
@@ -97,6 +97,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
    __m256 tmp;                                                                \
    size_t offset;                                                             \
    size_t j;                                                                  \
+    size_t block = YMM_FLOAT_BLOCK;                                            \
    __m256 reverse_num_vec =                                                   \
        _mm256_div_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(this->num_));        \
    __m256 epsilon_vec = _mm256_set1_ps(epsilon);                              \
@@ -221,12 +222,14 @@ INTRIAVX_FLOAT(platform::avx, kEQ8);
 INTRIAVX_FLOAT(platform::avx, kGT8LT16);
 INTRIAVX_FLOAT(platform::avx, kEQ16);
 INTRIAVX_FLOAT(platform::avx, kGT16);
-#endif
-#ifdef __AVX2__
 INTRIAVX_FLOAT(platform::avx2, kEQ8);
 INTRIAVX_FLOAT(platform::avx2, kGT8LT16);
 INTRIAVX_FLOAT(platform::avx2, kEQ16);
 INTRIAVX_FLOAT(platform::avx2, kGT16);
+INTRIAVX_FLOAT(platform::avx512f, kEQ8);
+INTRIAVX_FLOAT(platform::avx512f, kGT8LT16);
+INTRIAVX_FLOAT(platform::avx512f, kEQ16);
+INTRIAVX_FLOAT(platform::avx512f, kGT16);
 #endif
 #undef INTRIAVX_FLOAT

--- a/paddle/fluid/operators/math/math_function.cu
+++ b/paddle/fluid/operators/math/math_function.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include <vector>
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/operators/math/blas.h"

--- a/paddle/fluid/operators/math/sequence2batch.cu
+++ b/paddle/fluid/operators/math/sequence2batch.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/math/sequence2batch.h"
 namespace paddle {

--- a/paddle/fluid/operators/math/softmax.cu
+++ b/paddle/fluid/operators/math/softmax.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include <vector>
 #include "paddle/fluid/operators/math/math_function.h"

--- a/paddle/fluid/operators/mean_op.cu
+++ b/paddle/fluid/operators/mean_op.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/mean_op.h"
 #include "paddle/fluid/platform/float16.h"

--- a/paddle/fluid/operators/optimizers/adadelta_op.cu
+++ b/paddle/fluid/operators/optimizers/adadelta_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/adadelta_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/optimizers/adagrad_op.cu
+++ b/paddle/fluid/operators/optimizers/adagrad_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/operators/optimizers/adagrad_op.h"

--- a/paddle/fluid/operators/optimizers/adam_op.cu
+++ b/paddle/fluid/operators/optimizers/adam_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/adam_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/optimizers/adamax_op.cu
+++ b/paddle/fluid/operators/optimizers/adamax_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/adamax_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu
+++ b/paddle/fluid/operators/optimizers/decayed_adagrad_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/decayed_adagrad_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/optimizers/ftrl_op.cu
+++ b/paddle/fluid/operators/optimizers/ftrl_op.cu
@@ -10,8 +10,6 @@ Unless required by applicable law or agreed to in writing, software distributed
 under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/ftrl_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
+++ b/paddle/fluid/operators/optimizers/proximal_adagrad_op.cu
@@ -10,8 +10,6 @@ Unless required by applicable law or agreed to in writing, software distributed
 under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/proximal_adagrad_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/optimizers/proximal_gd_op.cu
+++ b/paddle/fluid/operators/optimizers/proximal_gd_op.cu
@@ -10,8 +10,6 @@ Unless required by applicable law or agreed to in writing, software distributed
 under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 CONDITIONS OF ANY KIND, either express or implied. See the License for the
 specific language governing permissions and limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/proximal_gd_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/optimizers/rmsprop_op.cu
+++ b/paddle/fluid/operators/optimizers/rmsprop_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/optimizers/rmsprop_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/pad_constant_like_op.cu
+++ b/paddle/fluid/operators/pad_constant_like_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/pad_constant_like_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/pad_op.cu
+++ b/paddle/fluid/operators/pad_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/pad_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu
+++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/sequence_ops/sequence_pool_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/smooth_l1_loss_op.cu
+++ b/paddle/fluid/operators/smooth_l1_loss_op.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/smooth_l1_loss_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include <cub/cub.cuh>
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h"

--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
@@ -72,10 +72,11 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
    for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
      auto rows_idx = outs_rows_idx[i];
      outs[i]->set_height(height_sections[i]);
+      auto dims = x->GetCompleteDims();
+      dims[0] = rows_idx.size();
+      outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
+      outs[i]->mutable_rows()->clear();
      if (rows_idx.size() > 0) {
-        auto dims = x->GetCompleteDims();
-        dims[0] = rows_idx.size();
-        outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
        for (auto idx : rows_idx) {
          outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
        }
@@ -98,6 +99,8 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
          }
        }
      }
+      PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(),
+                        "rows should has the same size with tensor dim 0");
    }
  }
 };

--- a/paddle/fluid/operators/squared_l2_distance_op.cu
+++ b/paddle/fluid/operators/squared_l2_distance_op.cu
@@ -11,9 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/squared_l2_distance_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/squared_l2_norm_op.cu
+++ b/paddle/fluid/operators/squared_l2_norm_op.cu
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/squared_l2_norm_op.h"
 namespace ops = paddle::operators;

--- a/paddle/fluid/operators/sum_op.cu
+++ b/paddle/fluid/operators/sum_op.cu
@@ -8,8 +8,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define EIGEN_USE_GPU
 #include "paddle/fluid/operators/sum_op.h"
 #include "paddle/fluid/platform/float16.h"

--- a/paddle/fluid/platform/cuda_helper_test.cu
+++ b/paddle/fluid/platform/cuda_helper_test.cu
@@ -93,7 +93,7 @@ TEST(CudaAtomic, float16) {
 // unalignment of uint8
 void TestUnalign(size_t num, const int shift_bit) {
-  PADDLE_ENFORCE(num % 2 == 0, "must be a multiple of 2");
+  ASSERT_EQ(num % 2, 0);
  float16 *in1, *in2, *out;
  float16 *d_in1, *d_in2;
  size_t size = sizeof(uint8_t) * (num + shift_bit);

--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "paddle/fluid/platform/dynload/cublas.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/gpu_info.h"
-#define EIGEN_USE_GPU
 #endif
 #ifdef PADDLE_WITH_MKLDNN

--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -62,45 +62,54 @@ inline std::string demangle(std::string name) { return name; }
 #endif
 struct EnforceNotMet : public std::exception {
-  std::exception_ptr exp_;
  std::string err_str_;
-  EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) {
+  EnforceNotMet(std::exception_ptr e, const char* f, int l) {
-    static constexpr int TRACE_STACK_LIMIT = 100;
    try {
-      std::rethrow_exception(exp_);
+      std::rethrow_exception(e);
-    } catch (const std::exception& exp) {
+    } catch (std::exception& e) {
-      std::ostringstream sout;
+      Init(e.what(), f, l);
+    }
+  }
-      sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl;
+  template <typename... ARGS>
-      sout << "PaddlePaddle Call Stacks: " << std::endl;
+  EnforceNotMet(const char* f, int l, ARGS... args) {
+    Init(string::Sprintf(args...), f, l);
+  }
+  const char* what() const noexcept override { return err_str_.c_str(); }
+ private:
+  template <typename StrType>
+  inline void Init(StrType what, const char* f, int l) {
+    static constexpr int TRACE_STACK_LIMIT = 100;
+    std::ostringstream sout;
+    sout << string::Sprintf("%s at [%s:%d]", what, f, l) << std::endl;
+    sout << "PaddlePaddle Call Stacks: " << std::endl;
 #if !defined(_WIN32)
-      void* call_stack[TRACE_STACK_LIMIT];
+    void* call_stack[TRACE_STACK_LIMIT];
-      auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
+    auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
-      auto symbols = backtrace_symbols(call_stack, size);
+    auto symbols = backtrace_symbols(call_stack, size);
+    Dl_info info;
-      Dl_info info;
+    for (int i = 0; i < size; ++i) {
-      for (int i = 0; i < size; ++i) {
+      if (dladdr(call_stack[i], &info) && info.dli_sname) {
-        if (dladdr(call_stack[i], &info) && info.dli_sname) {
+        auto demangled = demangle(info.dli_sname);
-          auto demangled = demangle(info.dli_sname);
+        auto addr_offset = static_cast<char*>(call_stack[i]) -
-          auto addr_offset = static_cast<char*>(call_stack[i]) -
+                           static_cast<char*>(info.dli_saddr);
-                             static_cast<char*>(info.dli_saddr);
+        sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,
-          sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,
+                                2 + sizeof(void*) * 2, call_stack[i], demangled,
-                                  2 + sizeof(void*) * 2, call_stack[i],
+                                addr_offset);
-                                  demangled, addr_offset);
+      } else {
-        } else {
+        sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
-          sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
+                                call_stack[i]);
-                                  call_stack[i]);
-        }
      }
-      free(symbols);
+    }
+    free(symbols);
 #else
-      sout << "Windows not support stack backtrace yet.";
+    sout << "Windows not support stack backtrace yet.";
 #endif
-      err_str_ = sout.str();
+    err_str_ = sout.str();
-    }
  }
-  const char* what() const noexcept { return err_str_.c_str(); }
 };
 struct EOFException : public std::exception {
@@ -242,13 +251,8 @@ inline void throw_on_error(T e) {
  throw_on_error(e, "");
 }
-#define PADDLE_THROW(...)                                              \
+#define PADDLE_THROW(...) \
-  do {                                                                 \
+  throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__)
-    throw ::paddle::platform::EnforceNotMet(                           \
-        std::make_exception_ptr(                                       \
-            std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
-        __FILE__, __LINE__);                                           \
-  } while (false)
 #ifndef REPLACE_ENFORCE_GLOG
 #define PADDLE_ENFORCE(...)                                             \

--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -71,9 +71,6 @@ struct float16;
 }  // namespace platform
 }  // namespace paddle
-// NOTE():
-// Do not move the eigen.h header, otherwise the eigen_vector<bool> will failed.
-#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/hostdevice.h"
 #include "unsupported/Eigen/CXX11/Tensor"

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -336,6 +336,8 @@ PYBIND11_MODULE(core, m) {
      .def("get_tensor",
           [](SelectedRows &self) { return self.mutable_value(); },
           py::return_value_policy::reference)
+      .def("numel",
+           [](SelectedRows &self) -> int64_t { return self.value().numel(); })
      .def("set_height", &SelectedRows::set_height)
      .def("height", &SelectedRows::height)
      .def("set_rows",

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -127,7 +127,8 @@ def __bootstrap__():
        'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
        'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
        'eager_delete_tensor_gb', 'allocator_strategy',
-        'reader_queue_speed_test_mode', 'print_sub_graph_dir'
+        'reader_queue_speed_test_mode', 'print_sub_graph_dir',
+        'pe_profile_fname'
    ]
    if 'Darwin' not in sysstr:
        read_env_flags.append('use_pinned_memory')

--- a/python/paddle/fluid/average.py
+++ b/python/paddle/fluid/average.py
@@ -48,6 +48,7 @@ class WeightedAverage(object):
    Examples:
        .. code-block:: python
            avg = fluid.average.WeightedAverage()
            avg.add(value=2.0, weight=1)
            avg.add(value=4.0, weight=2)

--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
@@ -93,7 +93,7 @@ class TestDistMnist2x2(TestDistRunnerBase):
        # TODO(typhoonzero): fix distributed adam optimizer
        # opt = fluid.optimizer.AdamOptimizer(
        #     learning_rate=0.001, beta1=0.9, beta2=0.999)
-        opt = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
+        opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
        # Reader
        train_reader = paddle.batch(

--- a/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_mkldnn_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import unittest
+from test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3
+class TestMKLDNNConcatOp(TestConcatOp):
+    def setUp(self):
+        super(TestMKLDNNConcatOp, self).setUp()
+        self.attrs["use_mkldnn"] = True
+        self._cpu_only = True
+    def test_check_grad(self):
+        pass
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TestMKLDNNConcatOp2(TestConcatOp2):
+    def setUp(self):
+        super(TestMKLDNNConcatOp2, self).setUp()
+        self.attrs["use_mkldnn"] = True
+        self._cpu_only = True
+    def test_check_grad(self):
+        pass
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+class TestMKLDNNConcatOp3(TestConcatOp3):
+    def setUp(self):
+        super(TestMKLDNNConcatOp3, self).setUp()
+        self.attrs["use_mkldnn"] = True
+        self._cpu_only = True
+    def test_check_grad(self):
+        pass
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
@@ -32,7 +32,7 @@ DEFAULT_BATCH_SIZE = 2
 class TestDistRunnerBase(object):
-    def get_model(self, batch_size=DEFAULT_BATCH_SIZE):
+    def get_model(self, batch_size=DEFAULT_BATCH_SIZE, lr=0.1):
        raise NotImplementedError(
            "get_model should be implemented by child classes.")
@@ -56,6 +56,7 @@ class TestDistRunnerBase(object):
        return t
    def run_pserver(self, args):
+        self.lr = args.lr
        self.get_model(batch_size=args.batch_size)
        # NOTE: pserver should not call memory optimize
        t = self.get_transpiler(args.trainer_id,
@@ -71,6 +72,7 @@ class TestDistRunnerBase(object):
        exe.run(pserver_prog)
    def run_trainer(self, args):
+        self.lr = args.lr
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size)
@@ -189,6 +191,7 @@ def runtime_main(test_class):
    parser.add_argument(
        '--use_reader_alloc', action='store_true', required=False)
    parser.add_argument('--batch_size', required=False, type=int, default=2)
+    parser.add_argument('--lr', required=False, type=float, default=0.001)
    parser.add_argument(
        '--batch_merge_repeat', required=False, type=int, default=1)
@@ -234,6 +237,7 @@ class TestDistBase(unittest.TestCase):
        self._dc_asgd = False  # must use with async mode
        self._use_reader_alloc = True
        self._nccl2_mode = False
+        self._lr = 0.001
        self._setup_config()
        self._after_setup_config()
@@ -284,7 +288,8 @@ class TestDistBase(unittest.TestCase):
                   batch_size=DEFAULT_BATCH_SIZE,
                   batch_merge_repeat=1):
-        cmd = "%s %s --role trainer" % (self._python_interp, model)
+        cmd = "%s %s --role trainer --lr %f" % (self._python_interp, model,
+                                                self._lr)
        if batch_size != DEFAULT_BATCH_SIZE:
            cmd += " --batch_size %d" % batch_size
        if batch_merge_repeat > 1:
@@ -330,13 +335,13 @@ class TestDistBase(unittest.TestCase):
        ps0_ep, ps1_ep = self._ps_endpoints.split(",")
-        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver"
+        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver --lr %f"
        tr0_cmd = tr_cmd % \
                  (self._python_interp, model, self._ps_endpoints,
-                   0, ps0_ep, self._trainers)
+                   0, ps0_ep, self._trainers, self._lr)
        tr1_cmd = tr_cmd % \
                  (self._python_interp, model, self._ps_endpoints,
-                   1, ps1_ep, self._trainers)
+                   1, ps1_ep, self._trainers, self._lr)
        if self._sync_mode:
            tr0_cmd += " --sync_mode"
@@ -425,13 +430,13 @@ class TestDistBase(unittest.TestCase):
        worker_endpoints = self._ps_endpoints.split(",")
        w0_ep, w1_ep = worker_endpoints
-        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2"
+        tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2 --lr %f"
        tr0_cmd = tr_cmd % \
                  (self._python_interp, model, self._ps_endpoints,
-                   0, w0_ep)
+                   0, w0_ep, self._lr / 2)
        tr1_cmd = tr_cmd % \
                  (self._python_interp, model, self._ps_endpoints,
-                   1, w1_ep)
+                   1, w1_ep, self._lr / 2)
        if self._mem_opt:
            tr0_cmd += " --mem_opt"

--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
@@ -36,7 +36,7 @@ class TestDistMnistNCCL2(TestDistBase):
    def test_dist_train(self):
        import paddle.fluid as fluid
        if fluid.core.is_compiled_with_cuda():
-            self.check_with_place("dist_mnist.py", delta=1)
+            self.check_with_place("dist_mnist.py", delta=1e-5)
 class TestDistMnist2x2Lars(TestDistBase):

--- a/python/paddle/fluid/tests/unittests/test_regularizer.py
+++ b/python/paddle/fluid/tests/unittests/test_regularizer.py
@@ -15,7 +15,12 @@
 from __future__ import print_function
 import unittest
+from functools import partial
+import contextlib
+import numpy as np
+import paddle
+import paddle.fluid.core as core
+import paddle.fluid as fluid
 import paddle.fluid.framework as framework
 import paddle.fluid.optimizer as optimizer
 import paddle.fluid.regularizer as regularizer
@@ -97,5 +102,134 @@ class TestL1DecayRegularizer(unittest.TestCase):
        self.assertEqual(block.ops[-3].type, 'sign')
+def bow_net(data,
+            label,
+            dict_dim,
+            is_sparse=False,
+            emb_dim=128,
+            hid_dim=128,
+            hid_dim2=96,
+            class_dim=2):
+    """
+    BOW net
+    This model is from https://github.com/PaddlePaddle/models:
+    fluid/PaddleNLP/text_classification/nets.py
+    """
+    emb = fluid.layers.embedding(
+        input=data, is_sparse=is_sparse, size=[dict_dim, emb_dim])
+    bow = fluid.layers.sequence_pool(input=emb, pool_type='sum')
+    bow_tanh = fluid.layers.tanh(bow)
+    fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh")
+    fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh")
+    prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    return avg_cost
+class TestRegularizer(unittest.TestCase):
+    def setUp(self):
+        self.word_dict = paddle.dataset.imdb.word_dict()
+        reader = paddle.batch(
+            paddle.dataset.imdb.train(self.word_dict), batch_size=8)()
+        self.train_data = [next(reader) for _ in range(5)]
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+    @contextlib.contextmanager
+    def scope_prog_guard(self, main_prog, startup_prog):
+        scope = fluid.core.Scope()
+        with fluid.unique_name.guard():
+            with fluid.scope_guard(scope):
+                with fluid.program_guard(main_prog, startup_prog):
+                    yield
+    def run_program(self, place, feed_list):
+        exe = fluid.Executor(place)
+        feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
+        exe.run(fluid.default_startup_program())
+        main_prog = fluid.default_main_program()
+        param_list = [var.name for var in main_prog.block(0).all_parameters()]
+        param_sum = []
+        for data in self.train_data:
+            out = exe.run(main_prog,
+                          feed=feeder.feed(data),
+                          fetch_list=param_list)
+            p_sum = 0
+            for v in out:
+                p_sum += np.sum(np.abs(v))
+            param_sum.append(p_sum)
+        return param_sum
+    def check_l2decay_regularizer(self, place, model):
+        main_prog = fluid.framework.Program()
+        startup_prog = fluid.framework.Program()
+        startup_prog.random_seed = 1
+        with self.scope_prog_guard(
+                main_prog=main_prog, startup_prog=startup_prog):
+            data = fluid.layers.data(
+                name="words", shape=[1], dtype="int64", lod_level=1)
+            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+            avg_cost = model(data, label, len(self.word_dict))
+            optimizer = fluid.optimizer.Adagrad(
+                learning_rate=0.1,
+                regularization=fluid.regularizer.L2Decay(1.0))
+            optimizer.minimize(avg_cost)
+            param_sum = self.run_program(place, [data, label])
+        return param_sum
+    def check_l2decay(self, place, model):
+        main_prog = fluid.framework.Program()
+        startup_prog = fluid.framework.Program()
+        startup_prog.random_seed = 1
+        with self.scope_prog_guard(
+                main_prog=main_prog, startup_prog=startup_prog):
+            data = fluid.layers.data(
+                name="words", shape=[1], dtype="int64", lod_level=1)
+            label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+            avg_cost_l2 = model(data, label, len(self.word_dict))
+            param_list = fluid.default_main_program().block(0).all_parameters()
+            para_sum = []
+            for para in param_list:
+                para_mul = fluid.layers.square(x=para)
+                para_sum.append(fluid.layers.reduce_sum(input=para_mul))
+            avg_cost_l2 += fluid.layers.sums(para_sum) * .5
+            optimizer = fluid.optimizer.Adagrad(learning_rate=0.1)
+            optimizer.minimize(avg_cost_l2)
+            param_sum = self.run_program(place, [data, label])
+        return param_sum
+    def test_l2(self):
+        for place in self.get_places():
+            dense_sparse_p_sum = []
+            for sparse in [True, False]:
+                model = partial(bow_net, is_sparse=sparse)
+                framework_l2 = self.check_l2decay_regularizer(place, model)
+                l2 = self.check_l2decay(place, model)
+                assert len(l2) == len(framework_l2)
+                for i in range(len(l2)):
+                    assert np.isclose(a=framework_l2[i], b=l2[i], rtol=5e-5)
+                dense_sparse_p_sum.append(framework_l2)
+            assert len(dense_sparse_p_sum[0]) == len(dense_sparse_p_sum[1])
+            for i in range(len(dense_sparse_p_sum[0])):
+                assert np.isclose(
+                    a=dense_sparse_p_sum[0][i],
+                    b=dense_sparse_p_sum[1][i],
+                    rtol=5e-5)
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
@@ -63,6 +63,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
        # expected output selected rows
        expected_out0_rows = [0, 4]
        expected_out1_rows = [0, 2]
+        expected_out2_rows = []
        expected_out4_rows = [0]
        op = Operator(
@@ -75,6 +76,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
        self.assertEqual(outs[0].rows(), expected_out0_rows)
        self.assertEqual(outs[1].rows(), expected_out1_rows)
+        self.assertEqual(outs[2].rows(), expected_out2_rows)
        self.assertEqual(outs[4].rows(), expected_out4_rows)
        self.assertEqual(outs[0].height(), height_sections[0])
@@ -84,6 +86,9 @@ class TestSpliteSelectedRows(unittest.TestCase):
        self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 1])
        self.assertAlmostEqual(8.0, np.array(outs[4].get_tensor())[0, 1])
+        self.assertEqual(outs[2].numel(), 0)
+        self.assertEqual(outs[3].numel(), 0)
    def check_grad_with_place(self, place):
        scope = core.Scope()
        height = 10