Commit 81520a24 authored by Yu Yang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/refine_eigen_tensor

@@ -54,7 +54,7 @@ option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF)
-option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler" OFF)
+option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)

@@ -254,6 +254,12 @@ elseif()
    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in MKL only now." FORCE)
endif()

+if (WITH_PROFILER)
+    find_package(Gperftools REQUIRED)
+    include_directories(${GPERFTOOLS_INCLUDE_DIR})
+    add_definitions(-DWITH_GPERFTOOLS)
+endif()
+
include(generic)  # simplify cmake module
include(package)  # set paddle packages
include(ccache)   # set ccache for compilation
......
# Tries to find Gperftools.
#
# Usage of this module as follows:
#
# find_package(Gperftools)
#
# Variables used by this module, they can change the default behaviour and need
# to be set before calling find_package:
#
# Gperftools_ROOT_DIR Set this variable to the root installation of
# Gperftools if the module has problems finding
# the proper installation path.
#
# Variables defined by this module:
#
# GPERFTOOLS_FOUND System has Gperftools libs/headers
# GPERFTOOLS_LIBRARIES The Gperftools libraries (tcmalloc & profiler)
# GPERFTOOLS_INCLUDE_DIR The location of Gperftools headers
find_library(GPERFTOOLS_TCMALLOC
  NAMES tcmalloc
  HINTS ${Gperftools_ROOT_DIR}/lib)

find_library(GPERFTOOLS_PROFILER
  NAMES profiler
  HINTS ${Gperftools_ROOT_DIR}/lib)

find_library(GPERFTOOLS_TCMALLOC_AND_PROFILER
  NAMES tcmalloc_and_profiler
  HINTS ${Gperftools_ROOT_DIR}/lib)

find_path(GPERFTOOLS_INCLUDE_DIR
  NAMES gperftools/heap-profiler.h
  HINTS ${Gperftools_ROOT_DIR}/include)

set(GPERFTOOLS_LIBRARIES ${GPERFTOOLS_TCMALLOC_AND_PROFILER})

include(FindPackageHandleStandardArgs)

find_package_handle_standard_args(
  Gperftools
  DEFAULT_MSG
  GPERFTOOLS_LIBRARIES
  GPERFTOOLS_INCLUDE_DIR)

mark_as_advanced(
  Gperftools_ROOT_DIR
  GPERFTOOLS_TCMALLOC
  GPERFTOOLS_PROFILER
  GPERFTOOLS_TCMALLOC_AND_PROFILER
  GPERFTOOLS_LIBRARIES
  GPERFTOOLS_INCLUDE_DIR)

# create IMPORTED targets
if (Gperftools_FOUND AND NOT TARGET gperftools::tcmalloc)
  add_library(gperftools::tcmalloc UNKNOWN IMPORTED)
  set_target_properties(gperftools::tcmalloc PROPERTIES
                        IMPORTED_LOCATION ${GPERFTOOLS_TCMALLOC}
                        INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
  add_library(gperftools::profiler UNKNOWN IMPORTED)
  set_target_properties(gperftools::profiler PROPERTIES
                        IMPORTED_LOCATION ${GPERFTOOLS_PROFILER}
                        INTERFACE_INCLUDE_DIRECTORIES "${GPERFTOOLS_INCLUDE_DIR}")
endif()
@@ -110,6 +110,14 @@ function(find_fluid_modules TARGET_NAME)
  endif()
endfunction(find_fluid_modules)

+function(common_link TARGET_NAME)
+  if (WITH_PROFILER)
+    target_link_libraries(${TARGET_NAME} gperftools::profiler)
+  endif()
+endfunction()
+
# find all third_party modules is used for paddle static library
# for reduce the dependency when building the inference libs.
set_property(GLOBAL PROPERTY FLUID_THIRD_PARTY)

@@ -274,6 +282,7 @@ function(cc_library TARGET_NAME)
    endif()
    target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
    add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+    common_link(${TARGET_NAME})
  endif()

  # cpplint code style

@@ -340,6 +349,7 @@ function(cc_binary TARGET_NAME)
  if(cc_binary_DEPS)
    target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
    add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
+    common_link(${TARGET_NAME})
  endif()
endfunction(cc_binary)

@@ -362,6 +372,7 @@ function(cc_test TARGET_NAME)
      target_link_libraries(${TARGET_NAME} ${win32_deps})
    endif(WIN32)
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
    add_test(NAME ${TARGET_NAME}
             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})

@@ -420,6 +431,7 @@ function(nv_binary TARGET_NAME)
    if(nv_binary_DEPS)
      target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
      add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
+      common_link(${TARGET_NAME})
    endif()
  endif()
endfunction(nv_binary)

@@ -433,6 +445,7 @@ function(nv_test TARGET_NAME)
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    common_link(${TARGET_NAME})
    add_test(${TARGET_NAME} ${TARGET_NAME})
    if (nv_test_SERIAL)
      set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1)

@@ -499,6 +512,7 @@ function(hip_binary TARGET_NAME)
    if(hip_binary_DEPS)
      target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
      add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+      common_link(${TARGET_NAME})
    endif()
  endif()
endfunction(hip_binary)

@@ -518,6 +532,7 @@ function(hip_test TARGET_NAME)
    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main memory gtest gflags)
+    common_link(${TARGET_NAME})
    add_test(${TARGET_NAME} ${TARGET_NAME})
  endif()
endfunction(hip_test)

@@ -560,6 +575,7 @@ function(go_library TARGET_NAME)
  endif()
  if(go_library_DEPS)
    add_dependencies(${TARGET_NAME} ${go_library_DEPS})
+    common_link(${TARGET_NAME})
  endif(go_library_DEPS)

  # The "source file" of the library is `${dummyfile}` which never
......
@@ -30,13 +30,36 @@ limitations under the License. */
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/platform/profiler.h"

+#ifdef WITH_GPERFTOOLS
+#include "gperftools/profiler.h"
+#endif
+DEFINE_string(pe_profile_fname, "",
+              "Profiler filename for PE, which is generated by gperftools. "
+              "Only valid when compiled `WITH_PROFILER=ON`. Empty if disabled.");
+
namespace paddle {
namespace framework {

+static std::once_flag gProfileOnce;
+#ifdef WITH_GPERFTOOLS
+static bool gProfileStarted = false;
+#endif
class ParallelExecutorPrivate {
 public:
  explicit ParallelExecutorPrivate(const std::vector<platform::Place> &places)
-      : places_(places) {}
+      : places_(places) {
+    if (!FLAGS_pe_profile_fname.empty()) {
+      std::call_once(gProfileOnce, [] {
+#ifdef WITH_GPERFTOOLS
+        ProfilerStart(FLAGS_pe_profile_fname.c_str());
+        gProfileStarted = true;
+#else
+        LOG(WARNING) << "Paddle is not compiled with gperftools. "
+                        "FLAGS_pe_profile_fname will be ignored";
+#endif
+      });
+    }
+  }

  ~ParallelExecutorPrivate() {
    if (own_local_scope_) {

@@ -270,6 +293,12 @@ void ParallelExecutor::BCastParamsToDevices(

void ParallelExecutor::Run(const std::vector<std::string> &fetch_tensors,
                           const std::string &fetched_var_name) {
+#ifdef WITH_GPERFTOOLS
+  if (gProfileStarted) {
+    ProfilerFlush();
+  }
+#endif
+
  platform::RecordBlock b(0);
#ifdef PADDLE_WITH_CUDA
  if (!gcs_.empty()) {
......
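For context, here is a minimal standalone sketch of the gperftools CPU-profiler calls the change above relies on (ProfilerStart, ProfilerFlush, ProfilerStop from gperftools/profiler.h). The output file name "pe.prof" and the RunManyIterations() workload are made up for illustration and are not part of the patch:

// Standalone sketch: how the gperftools CPU profiler is typically driven.
// Build with -DWITH_GPERFTOOLS and link -lprofiler to enable profiling.
#include <cstdio>
#ifdef WITH_GPERFTOOLS
#include "gperftools/profiler.h"
#endif

void RunManyIterations() {
  // Illustrative CPU-bound workload so the profiler has something to sample.
  volatile double acc = 0;
  for (int i = 0; i < 100000000; ++i) acc = acc + i * 0.5;
}

int main() {
#ifdef WITH_GPERFTOOLS
  ProfilerStart("pe.prof");  // start sampling; samples are written to pe.prof
#endif
  RunManyIterations();       // the code being profiled
#ifdef WITH_GPERFTOOLS
  ProfilerFlush();           // push buffered samples to the file
  ProfilerStop();            // stop sampling and close the file
#endif
  std::printf("done\n");
  return 0;
}

In the patch itself, ProfilerStart is called once from the ParallelExecutorPrivate constructor when FLAGS_pe_profile_fname is set, and each Run() only calls ProfilerFlush(); the resulting profile file can then be inspected offline, for example with gperftools' pprof tool.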
@@ -79,16 +79,16 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
  }
};

-#define INTRIAVX_FLOAT(isa, block) \
+#define INTRIAVX_FLOAT(isa, jit_block) \
  template <> \
-  LayerNormKernelImpl<float, isa, block>::LayerNormKernelImpl(int right) \
+  LayerNormKernelImpl<float, isa, jit_block>::LayerNormKernelImpl(int right) \
      : LayerNormKernel<float>() { \
    this->num_ = right; \
    this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \
    this->end_ = this->num_ - this->rest_; \
  } \
  template <> \
-  void LayerNormKernelImpl<float, platform::avx, block>::Compute( \
+  void LayerNormKernelImpl<float, isa, jit_block>::Compute( \
      float* x, float* out, float* mean, float* var, const float* scale, \
      const float* bias, int height, const float epsilon) const { \
    __m256 sum; \

@@ -97,6 +97,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
    __m256 tmp; \
    size_t offset; \
    size_t j; \
+    size_t block = YMM_FLOAT_BLOCK; \
    __m256 reverse_num_vec = \
        _mm256_div_ps(_mm256_set1_ps(1.0), _mm256_set1_ps(this->num_)); \
    __m256 epsilon_vec = _mm256_set1_ps(epsilon); \

@@ -221,12 +222,14 @@ INTRIAVX_FLOAT(platform::avx, kEQ8);
INTRIAVX_FLOAT(platform::avx, kGT8LT16);
INTRIAVX_FLOAT(platform::avx, kEQ16);
INTRIAVX_FLOAT(platform::avx, kGT16);
-#endif
-#ifdef __AVX2__
INTRIAVX_FLOAT(platform::avx2, kEQ8);
INTRIAVX_FLOAT(platform::avx2, kGT8LT16);
INTRIAVX_FLOAT(platform::avx2, kEQ16);
INTRIAVX_FLOAT(platform::avx2, kGT16);
+INTRIAVX_FLOAT(platform::avx512f, kEQ8);
+INTRIAVX_FLOAT(platform::avx512f, kGT8LT16);
+INTRIAVX_FLOAT(platform::avx512f, kEQ16);
+INTRIAVX_FLOAT(platform::avx512f, kGT16);
#endif

#undef INTRIAVX_FLOAT
......
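As a standalone illustration (simplified names, not Paddle code) of why the macro now has to forward its isa parameter: when the expansion hard-codes one ISA tag, every expansion specializes the same template and a second expansion is a redefinition; forwarding the parameter keeps each specialization distinct, which is what lets the avx2 and avx512f instantiations above coexist.

// Simplified stand-in for LayerNormKernelImpl and the INTRIAVX_FLOAT macro.
enum class Isa { avx, avx2, avx512f };

template <Isa isa, int block>
struct Kernel {
  void Compute();  // defined only via the explicit specializations below
};

// Correct: the macro forwards its isa argument, so each expansion
// specializes a different Kernel<isa, block>.
#define SPECIALIZE(isa_arg, block_arg) \
  template <>                          \
  void Kernel<isa_arg, block_arg>::Compute() {}

SPECIALIZE(Isa::avx, 8)
SPECIALIZE(Isa::avx2, 8)
SPECIALIZE(Isa::avx512f, 8)

// Broken variant (what hard-coding platform::avx in Compute effectively did):
// expanding it for a second ISA would redefine Kernel<Isa::avx, 8>::Compute
// and fail to compile, so it is deliberately left unused here.
#define BAD_SPECIALIZE(isa_arg, block_arg) \
  template <>                              \
  void Kernel<Isa::avx, block_arg>::Compute() {}

int main() {
  Kernel<Isa::avx512f, 8>{}.Compute();
  return 0;
}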
@@ -72,10 +72,11 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
    for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
      auto rows_idx = outs_rows_idx[i];
      outs[i]->set_height(height_sections[i]);
+      auto dims = x->GetCompleteDims();
+      dims[0] = rows_idx.size();
+      outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
+      outs[i]->mutable_rows()->clear();
      if (rows_idx.size() > 0) {
-        auto dims = x->GetCompleteDims();
-        dims[0] = rows_idx.size();
-        outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
        for (auto idx : rows_idx) {
          outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
        }

@@ -98,6 +99,8 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
          }
        }
      }
+      PADDLE_ENFORCE_EQ(rows_idx.size(), outs[i]->rows().size(),
+                        "rows should have the same size as tensor dim 0");
    }
  }
};
......
@@ -62,45 +62,54 @@ inline std::string demangle(std::string name) { return name; }
#endif

struct EnforceNotMet : public std::exception {
-  std::exception_ptr exp_;
  std::string err_str_;
-  EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) {
-    static constexpr int TRACE_STACK_LIMIT = 100;
+
+  EnforceNotMet(std::exception_ptr e, const char* f, int l) {
    try {
-      std::rethrow_exception(exp_);
-    } catch (const std::exception& exp) {
-      std::ostringstream sout;
-
-      sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl;
-      sout << "PaddlePaddle Call Stacks: " << std::endl;
+      std::rethrow_exception(e);
+    } catch (std::exception& e) {
+      Init(e.what(), f, l);
+    }
+  }
+
+  template <typename... ARGS>
+  EnforceNotMet(const char* f, int l, ARGS... args) {
+    Init(string::Sprintf(args...), f, l);
+  }
+
+  const char* what() const noexcept override { return err_str_.c_str(); }
+
+ private:
+  template <typename StrType>
+  inline void Init(StrType what, const char* f, int l) {
+    static constexpr int TRACE_STACK_LIMIT = 100;
+    std::ostringstream sout;
+
+    sout << string::Sprintf("%s at [%s:%d]", what, f, l) << std::endl;
+    sout << "PaddlePaddle Call Stacks: " << std::endl;
#if !defined(_WIN32)
    void* call_stack[TRACE_STACK_LIMIT];
    auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
    auto symbols = backtrace_symbols(call_stack, size);
    Dl_info info;
    for (int i = 0; i < size; ++i) {
      if (dladdr(call_stack[i], &info) && info.dli_sname) {
        auto demangled = demangle(info.dli_sname);
        auto addr_offset = static_cast<char*>(call_stack[i]) -
                           static_cast<char*>(info.dli_saddr);
        sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,
                                2 + sizeof(void*) * 2, call_stack[i], demangled,
                                addr_offset);
      } else {
        sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
                                call_stack[i]);
      }
    }
    free(symbols);
#else
    sout << "Windows not support stack backtrace yet.";
#endif
    err_str_ = sout.str();
  }
-
-  const char* what() const noexcept { return err_str_.c_str(); }
};

struct EOFException : public std::exception {

@@ -242,13 +251,8 @@ inline void throw_on_error(T e) {
  throw_on_error(e, "");
}

-#define PADDLE_THROW(...)                                              \
-  do {                                                                 \
-    throw ::paddle::platform::EnforceNotMet(                           \
-        std::make_exception_ptr(                                       \
-            std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
-        __FILE__, __LINE__);                                           \
-  } while (false)
+#define PADDLE_THROW(...) \
+  throw ::paddle::platform::EnforceNotMet(__FILE__, __LINE__, __VA_ARGS__)

#ifndef REPLACE_ENFORCE_GLOG
#define PADDLE_ENFORCE(...) \
......
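The net effect of the enforce.h change is that PADDLE_THROW now builds the exception directly from the format arguments instead of first wrapping a std::runtime_error in a std::exception_ptr and rethrowing it. Below is a self-contained sketch of that construction pattern; it is a simplification, not Paddle's implementation: a stream-based Format stands in for paddle::string::Sprintf, and NotMet/MY_THROW are made-up names.

#include <exception>
#include <sstream>
#include <string>

struct NotMet : public std::exception {
  std::string err_str_;

  // Variadic constructor: the message is formatted immediately and the
  // file/line suffix is appended, mirroring the Init() helper in the diff.
  template <typename... ARGS>
  NotMet(const char* f, int l, ARGS... args) {
    std::ostringstream sout;
    Format(sout, args...);
    sout << " at [" << f << ":" << l << "]";
    err_str_ = sout.str();
  }

  const char* what() const noexcept override { return err_str_.c_str(); }

 private:
  static void Format(std::ostringstream&) {}
  template <typename T, typename... Rest>
  static void Format(std::ostringstream& sout, const T& v, const Rest&... rest) {
    sout << v;
    Format(sout, rest...);
  }
};

// Single-statement throw, like the new PADDLE_THROW.
#define MY_THROW(...) throw NotMet(__FILE__, __LINE__, __VA_ARGS__)

int main() {
  try {
    MY_THROW("tensor rank mismatch: ", 3, " vs ", 4);
  } catch (const NotMet& e) {
    // e.what() reads: tensor rank mismatch: 3 vs 4 at [<file>:<line>]
    return 0;
  }
  return 1;
}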
@@ -336,6 +336,8 @@ PYBIND11_MODULE(core, m) {
      .def("get_tensor",
           [](SelectedRows &self) { return self.mutable_value(); },
           py::return_value_policy::reference)
+      .def("numel",
+           [](SelectedRows &self) -> int64_t { return self.value().numel(); })
      .def("set_height", &SelectedRows::set_height)
      .def("height", &SelectedRows::height)
      .def("set_rows",
......
@@ -127,7 +127,8 @@ def __bootstrap__():
        'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem',
        'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size",
        'eager_delete_tensor_gb', 'allocator_strategy',
-        'reader_queue_speed_test_mode', 'print_sub_graph_dir'
+        'reader_queue_speed_test_mode', 'print_sub_graph_dir',
+        'pe_profile_fname'
    ]
    if 'Darwin' not in sysstr:
        read_env_flags.append('use_pinned_memory')
......
@@ -48,6 +48,7 @@ class WeightedAverage(object):
    Examples:
      .. code-block:: python
+
        avg = fluid.average.WeightedAverage()
        avg.add(value=2.0, weight=1)
        avg.add(value=4.0, weight=2)
......
@@ -63,6 +63,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
        # expected output selected rows
        expected_out0_rows = [0, 4]
        expected_out1_rows = [0, 2]
+        expected_out2_rows = []
        expected_out4_rows = [0]

        op = Operator(

@@ -75,6 +76,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
        self.assertEqual(outs[0].rows(), expected_out0_rows)
        self.assertEqual(outs[1].rows(), expected_out1_rows)
+        self.assertEqual(outs[2].rows(), expected_out2_rows)
        self.assertEqual(outs[4].rows(), expected_out4_rows)

        self.assertEqual(outs[0].height(), height_sections[0])

@@ -84,6 +86,9 @@ class TestSpliteSelectedRows(unittest.TestCase):
        self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 1])
        self.assertAlmostEqual(8.0, np.array(outs[4].get_tensor())[0, 1])

+        self.assertEqual(outs[2].numel(), 0)
+        self.assertEqual(outs[3].numel(), 0)
+
    def check_grad_with_place(self, place):
        scope = core.Scope()
        height = 10
......