add profiler, parallel_executor back

703b26e6 · peizhilin · 935387f3 · 703b26e6 · 703b26e6 · 703b26e6
14 changed file
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -31,9 +31,7 @@ function(windows_symbolic TARGET)
 endfunction()

 add_subdirectory(ir)
-if (NOT WIN32)
 add_subdirectory(details)
-endif (NOT WIN32)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)

@@ -118,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)

-if (NOT WIN32)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
    shape_inference data_transform lod_tensor profiler)
-else()
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor)
-endif(NOT WIN32)

 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)

@@ -179,12 +172,10 @@ else()
  cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()

-if (NOT WIN32)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
        graph build_strategy
        fast_threaded_ssa_graph_executor)
-endif() # NOT WIN32

 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -13,9 +13,9 @@
 // limitations under the License.

 #pragma once
+#include <ThreadPool.h>
 #include <string>
 #include <vector>
-#include "ThreadPool.h"
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"

--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -17,7 +17,8 @@

 #ifdef _WIN32
 #define posix_memalign_free _aligned_free
-#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
+#define posix_memalign(p, a, s) \
+  (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
 #endif

 namespace paddle {

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
-if (NOT WIN32)
 proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
 py_proto_compile(profiler_py_proto SRCS profiler.proto)

@@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _

 add_dependencies(profiler_py_proto profiler_py_proto_init)

+if (NOT WIN32)
 add_custom_command(TARGET profiler_py_proto POST_BUILD
        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
        COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
        COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+else(NOT WIN32)
+string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/")
+add_custom_command(TARGET profiler_py_proto POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
+        COMMAND copy /Y *.py ${proto_dstpath}
+        COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif(NOT WIN32)

 if(WITH_GPU)
@@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)

-
-if (NOT WIN32)
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-endif(NOT WIN32)

 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)

--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -13,17 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once

-#if !defined(_WIN32)
-#include <sys/time.h>
-#else
-#include <windows.h>
-#endif  // !_WIN32
-
-#include <time.h>
 #include <chrono>  // NOLINT
 #include <string>

 #include "paddle/fluid/platform/dynload/cupti.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.pb.h"

 namespace paddle {
@@ -32,15 +26,11 @@ namespace platform {
 ///////////////////////
 // WARN: Under Development. Don't depend on it yet.
 //////////////////////
-#if !defined(_WIN32)
 inline uint64_t PosixInNsec() {
  struct timeval tv;
  gettimeofday(&tv, nullptr);
  return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
 }
-#else
-inline uint64_t PosixInNsec() { return static_cast<uint64_t>(0); }
-#endif  // !_WIN32

 // DeviceTracer performs the following tasks:
 // 1. Register cuda callbacks for various events: kernel, memcpy, etc.

--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -134,7 +134,7 @@ struct EOFException : public std::exception {
 #define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
 #else
 // there is no equivalent intrinsics in msvc.
-#define LIKELY(condition) !(condition)
+#define LIKELY(condition) (condition)
 #endif

 template <typename... Args>

--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -17,6 +17,7 @@
 #include <cstdio>
 #include <stdexcept>

+#include <time.h>
 #include <memory>
 #include <string>

@@ -27,6 +28,7 @@
 #include <dlfcn.h>     //  dladdr
 #include <execinfo.h>  // backtrace
 #include <sys/stat.h>
+#include <sys/time.h>
 #include <algorithm>  // std::accumulate
 #else
 #include <io.h>  // _popen, _pclose
@@ -57,6 +59,25 @@ static void *dlopen(const char *filename, int flag) {
  return reinterpret_cast<void *>(hModule);
 }

+static int gettimeofday(struct timeval *tp, void *tzp) {
+  time_t clock;
+  struct tm tm;
+  SYSTEMTIME wtm;
+
+  GetLocalTime(&wtm);
+  tm.tm_year = wtm.wYear - 1900;
+  tm.tm_mon = wtm.wMonth - 1;
+  tm.tm_mday = wtm.wDay;
+  tm.tm_hour = wtm.wHour;
+  tm.tm_min = wtm.wMinute;
+  tm.tm_sec = wtm.wSecond;
+  tm.tm_isdst = -1;
+  clock = mktime(&tm);
+  tp->tv_sec = clock;
+  tp->tv_usec = wtm.wMilliseconds * 1000;
+
+  return (0);
+}
 #endif  // !_WIN32

 static void ExecShellCommand(const std::string &cmd, std::string *message) {

--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/port.h"

-#include <sys/time.h>
 #include <algorithm>
 #include <iomanip>
 #include <limits>
@@ -438,10 +438,10 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
            event_items[index].total_time += event_time;
            // min time
            event_items[index].min_time =
-                std::min(event_time, event_items[index].min_time);
+                (std::min)(event_time, event_items[index].min_time);
            // max time
            event_items[index].max_time =
-                std::max(event_time, event_items[index].max_time);
+                (std::max)(event_time, event_items[index].max_time);
          }

          // remove the push marker from the list

--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -69,7 +69,6 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);

 void PopEvent(const std::string& name, const DeviceContext* dev_ctx);

-#if !defined(_WIN32)
 struct RecordEvent {
  // dev_ctx can be set to nullptr if device is cpu.
  RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
@@ -106,15 +105,6 @@ struct RecordBlock {
  std::string name_;
  uint64_t start_ns_;
 };
-#else
-// windows do not support profiler temporarily.
-struct RecordEvent {
-  RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {}
-};
-struct RecordBlock {
-  explicit RecordBlock(int block_id) {}
-};
-#endif

 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.

--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -45,16 +45,15 @@ class StreamCallbackManager {
  inline void AddCallback(Callback &&callback) const {
    auto *stream_callback_context =
        new StreamCallbackContext(this, std::forward<Callback>(callback));
-    PADDLE_ENFORCE(
 #if CUDA_VERSION >= 10000
-        cudaLaunchHostFunc(stream_, StreamCallbackManager::StreamCallbackFunc,
-                           stream_callback_context)
-#else
-        cudaStreamAddCallback(stream_,
+    PADDLE_ENFORCE(cudaLaunchHostFunc(stream_,
                                      StreamCallbackManager::StreamCallbackFunc,
-                              stream_callback_context, 0)
+                                      stream_callback_context));  // NOLINT
+#else
+    PADDLE_ENFORCE(cudaStreamAddCallback(
+        stream_, StreamCallbackManager::StreamCallbackFunc,
+        stream_callback_context, 0));  // NOLINT
 #endif
-            );  // NOLINT
  }

  void Wait() const { thread_pool_.reset(new ThreadPool(1)); }

--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt

-set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder)
+set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler)
 set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc)
-if(NOT WIN32)
-  list(APPEND PYBIND_DEPS parallel_executor profiler)
-endif(NOT WIN32)
 if(WITH_PYTHON)
  if(WITH_AMD_GPU)
    hip_library(paddle_pybind SHARED

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -36,9 +36,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
-#ifndef _WIN32
 #include "paddle/fluid/framework/parallel_executor.h"
-#endif
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -637,7 +635,6 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
 #endif

-#ifndef _WIN32
  py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
      .value("kDisabled", platform::ProfilerState::kDisabled)
      .value("kCPU", platform::ProfilerState::kCPU)
@@ -658,7 +655,6 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("disable_profiler", platform::DisableProfiler);
  m.def("is_profiler_enabled", platform::IsProfileEnabled);
  m.def("reset_profiler", platform::ResetProfiler);
-#endif

  py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
  pass.def(py::init())
@@ -687,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle.
      .def("remove_pass",
           [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });

-#ifndef _WIN32
  // -- python binds for parallel executor.
  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
@@ -913,7 +908,6 @@ All parameter, weight, gradient are variables in Paddle.
        pybind11::gil_scoped_release release;
        self.Run(fetch_tensors, fetched_var_name);
      });
-#endif

  BindRecordIOWriter(&m);
  return m.ptr();

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -47,8 +47,7 @@ from . import profiler
 from . import unique_name
 from . import recordio_writer
 from . import parallel_executor
-if os.name != 'nt':
-    from .parallel_executor import *
+from .parallel_executor import *
 from paddle.fluid.layers.math_op_patch import monkey_patch_variable

 Tensor = LoDTensor

--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -25,11 +25,11 @@ import os

 __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']

-if os.name != 'nt':
-    ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
-    BuildStrategy = core.ParallelExecutor.BuildStrategy
+ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
+BuildStrategy = core.ParallelExecutor.BuildStrategy

-    class ParallelExecutor(object):
+
+class ParallelExecutor(object):
    """
    ParallelExecutor is designed for data parallelism, which focuses on distributing
    the data across different nodes and every node operates on the data in parallel.
@@ -160,8 +160,7 @@ if os.name != 'nt':
                for p in main.global_block().iter_parameters()
                if not p.stop_gradient
            ]),
-                set(cpt.to_text(var)
-                    for var in self.persistable_vars), main.desc,
+            set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
            cpt.to_text(loss_name)
            if loss_name else six.u(''), scope, local_scopes, exec_strategy,
            build_strategy, num_trainers, trainer_id)