提交 703b26e6 编写于 作者: P peizhilin

add profiler, parallel_executor back

上级 935387f3
...@@ -31,9 +31,7 @@ function(windows_symbolic TARGET) ...@@ -31,9 +31,7 @@ function(windows_symbolic TARGET)
endfunction() endfunction()
add_subdirectory(ir) add_subdirectory(ir)
if (NOT WIN32)
add_subdirectory(details) add_subdirectory(details)
endif (NOT WIN32)
# ddim lib # ddim lib
proto_library(framework_proto SRCS framework.proto) proto_library(framework_proto SRCS framework.proto)
...@@ -118,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) ...@@ -118,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
if (NOT WIN32)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor profiler) shape_inference data_transform lod_tensor profiler)
else()
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform lod_tensor)
endif(NOT WIN32)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
...@@ -179,12 +172,10 @@ else() ...@@ -179,12 +172,10 @@ else()
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
endif() endif()
if (NOT WIN32)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS cc_library(parallel_executor SRCS parallel_executor.cc DEPS
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
graph build_strategy graph build_strategy
fast_threaded_ssa_graph_executor) fast_threaded_ssa_graph_executor)
endif() # NOT WIN32
cc_library(prune SRCS prune.cc DEPS framework_proto) cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
......
...@@ -13,9 +13,9 @@ ...@@ -13,9 +13,9 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include <ThreadPool.h>
#include <string> #include <string>
#include <vector> #include <vector>
#include "ThreadPool.h"
#include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h"
......
...@@ -17,7 +17,8 @@ ...@@ -17,7 +17,8 @@
#ifdef _WIN32 #ifdef _WIN32
#define posix_memalign_free _aligned_free #define posix_memalign_free _aligned_free
#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) #define posix_memalign(p, a, s) \
(((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#endif #endif
namespace paddle { namespace paddle {
......
if (NOT WIN32)
proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
py_proto_compile(profiler_py_proto SRCS profiler.proto) py_proto_compile(profiler_py_proto SRCS profiler.proto)
...@@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _ ...@@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _
add_dependencies(profiler_py_proto profiler_py_proto_init) add_dependencies(profiler_py_proto profiler_py_proto_init)
if (NOT WIN32)
add_custom_command(TARGET profiler_py_proto POST_BUILD add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
else(NOT WIN32)
# Windows branch: build a backslash-separated copy of the destination directory
# path for use with the `copy` command below.
# NOTE(review): presumably needed because cmd.exe's built-in `copy` does not
# accept forward-slash paths -- confirm.
string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/")
# After profiler_py_proto is built, create the destination directory and copy
# every *.py file from the current binary directory into it (the same steps the
# non-Windows branch performs with `cp`).
add_custom_command(TARGET profiler_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
COMMAND copy /Y *.py ${proto_dstpath}
COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
endif(NOT WIN32) endif(NOT WIN32)
if(WITH_GPU) if(WITH_GPU)
...@@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) ...@@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
if (NOT WIN32)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
endif(NOT WIN32)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
......
...@@ -13,17 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,17 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#if !defined(_WIN32)
#include <sys/time.h>
#else
#include <windows.h>
#endif // !_WIN32
#include <time.h>
#include <chrono> // NOLINT #include <chrono> // NOLINT
#include <string> #include <string>
#include "paddle/fluid/platform/dynload/cupti.h" #include "paddle/fluid/platform/dynload/cupti.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.pb.h" #include "paddle/fluid/platform/profiler.pb.h"
namespace paddle { namespace paddle {
...@@ -32,15 +26,11 @@ namespace platform { ...@@ -32,15 +26,11 @@ namespace platform {
/////////////////////// ///////////////////////
// WARN: Under Development. Don't depend on it yet. // WARN: Under Development. Don't depend on it yet.
////////////////////// //////////////////////
#if !defined(_WIN32)
inline uint64_t PosixInNsec() { inline uint64_t PosixInNsec() {
struct timeval tv; struct timeval tv;
gettimeofday(&tv, nullptr); gettimeofday(&tv, nullptr);
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec); return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
} }
#else
inline uint64_t PosixInNsec() { return static_cast<uint64_t>(0); }
#endif // !_WIN32
// DeviceTracer performs the following tasks: // DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc. // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
......
...@@ -134,7 +134,7 @@ struct EOFException : public std::exception { ...@@ -134,7 +134,7 @@ struct EOFException : public std::exception {
#define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1) #define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
#else #else
// there is no equivalent intrinsics in msvc. // there is no equivalent intrinsics in msvc.
#define LIKELY(condition) !(condition) #define LIKELY(condition) (condition)
#endif #endif
template <typename... Args> template <typename... Args>
......
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
#include <cstdio> #include <cstdio>
#include <stdexcept> #include <stdexcept>
#include <time.h>
#include <memory> #include <memory>
#include <string> #include <string>
...@@ -27,6 +28,7 @@ ...@@ -27,6 +28,7 @@
#include <dlfcn.h> // dladdr #include <dlfcn.h> // dladdr
#include <execinfo.h> // backtrace #include <execinfo.h> // backtrace
#include <sys/stat.h> #include <sys/stat.h>
#include <sys/time.h>
#include <algorithm> // std::accumulate #include <algorithm> // std::accumulate
#else #else
#include <io.h> // _popen, _pclose #include <io.h> // _popen, _pclose
...@@ -57,6 +59,25 @@ static void *dlopen(const char *filename, int flag) { ...@@ -57,6 +59,25 @@ static void *dlopen(const char *filename, int flag) {
return reinterpret_cast<void *>(hModule); return reinterpret_cast<void *>(hModule);
} }
static int gettimeofday(struct timeval *tp, void *tzp) {
time_t clock;
struct tm tm;
SYSTEMTIME wtm;
GetLocalTime(&wtm);
tm.tm_year = wtm.wYear - 1900;
tm.tm_mon = wtm.wMonth - 1;
tm.tm_mday = wtm.wDay;
tm.tm_hour = wtm.wHour;
tm.tm_min = wtm.wMinute;
tm.tm_sec = wtm.wSecond;
tm.tm_isdst = -1;
clock = mktime(&tm);
tp->tv_sec = clock;
tp->tv_usec = wtm.wMilliseconds * 1000;
return (0);
}
#endif // !_WIN32 #endif // !_WIN32
static void ExecShellCommand(const std::string &cmd, std::string *message) { static void ExecShellCommand(const std::string &cmd, std::string *message) {
......
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/port.h"
#include <sys/time.h>
#include <algorithm> #include <algorithm>
#include <iomanip> #include <iomanip>
#include <limits> #include <limits>
...@@ -438,10 +438,10 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -438,10 +438,10 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
event_items[index].total_time += event_time; event_items[index].total_time += event_time;
// min time // min time
event_items[index].min_time = event_items[index].min_time =
std::min(event_time, event_items[index].min_time); (std::min)(event_time, event_items[index].min_time);
// max time // max time
event_items[index].max_time = event_items[index].max_time =
std::max(event_time, event_items[index].max_time); (std::max)(event_time, event_items[index].max_time);
} }
// remove the push marker from the list // remove the push marker from the list
......
...@@ -69,7 +69,6 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); ...@@ -69,7 +69,6 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
#if !defined(_WIN32)
struct RecordEvent { struct RecordEvent {
// dev_ctx can be set to nullptr if device is cpu. // dev_ctx can be set to nullptr if device is cpu.
RecordEvent(const std::string& name, const DeviceContext* dev_ctx); RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
...@@ -106,15 +105,6 @@ struct RecordBlock { ...@@ -106,15 +105,6 @@ struct RecordBlock {
std::string name_; std::string name_;
uint64_t start_ns_; uint64_t start_ns_;
}; };
#else
// windows do not support profiler temporarily.
struct RecordEvent {
RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {}
};
struct RecordBlock {
explicit RecordBlock(int block_id) {}
};
#endif
// Return the event list of all threads. Assumed the returned value calls // Return the event list of all threads. Assumed the returned value calls
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
......
...@@ -45,16 +45,15 @@ class StreamCallbackManager { ...@@ -45,16 +45,15 @@ class StreamCallbackManager {
inline void AddCallback(Callback &&callback) const { inline void AddCallback(Callback &&callback) const {
auto *stream_callback_context = auto *stream_callback_context =
new StreamCallbackContext(this, std::forward<Callback>(callback)); new StreamCallbackContext(this, std::forward<Callback>(callback));
PADDLE_ENFORCE(
#if CUDA_VERSION >= 10000 #if CUDA_VERSION >= 10000
cudaLaunchHostFunc(stream_, StreamCallbackManager::StreamCallbackFunc, PADDLE_ENFORCE(cudaLaunchHostFunc(stream_,
stream_callback_context)
#else
cudaStreamAddCallback(stream_,
StreamCallbackManager::StreamCallbackFunc, StreamCallbackManager::StreamCallbackFunc,
stream_callback_context, 0) stream_callback_context)); // NOLINT
#else
PADDLE_ENFORCE(cudaStreamAddCallback(
stream_, StreamCallbackManager::StreamCallbackFunc,
stream_callback_context, 0)); // NOLINT
#endif #endif
); // NOLINT
} }
void Wait() const { thread_pool_.reset(new ThreadPool(1)); } void Wait() const { thread_pool_.reset(new ThreadPool(1)); }
......
set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler)
set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc)
if(NOT WIN32)
list(APPEND PYBIND_DEPS parallel_executor profiler)
endif(NOT WIN32)
if(WITH_PYTHON) if(WITH_PYTHON)
if(WITH_AMD_GPU) if(WITH_AMD_GPU)
hip_library(paddle_pybind SHARED hip_library(paddle_pybind SHARED
......
...@@ -36,9 +36,7 @@ limitations under the License. */ ...@@ -36,9 +36,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/lod_tensor_array.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#ifndef _WIN32
#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/parallel_executor.h"
#endif
#include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/prune.h"
#include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/reader.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
...@@ -637,7 +635,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -637,7 +635,6 @@ All parameter, weight, gradient are variables in Paddle.
#endif #endif
#endif #endif
#ifndef _WIN32
py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic()) py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
.value("kDisabled", platform::ProfilerState::kDisabled) .value("kDisabled", platform::ProfilerState::kDisabled)
.value("kCPU", platform::ProfilerState::kCPU) .value("kCPU", platform::ProfilerState::kCPU)
...@@ -658,7 +655,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -658,7 +655,6 @@ All parameter, weight, gradient are variables in Paddle.
m.def("disable_profiler", platform::DisableProfiler); m.def("disable_profiler", platform::DisableProfiler);
m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("is_profiler_enabled", platform::IsProfileEnabled);
m.def("reset_profiler", platform::ResetProfiler); m.def("reset_profiler", platform::ResetProfiler);
#endif
py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass"); py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
pass.def(py::init()) pass.def(py::init())
...@@ -687,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -687,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle.
.def("remove_pass", .def("remove_pass",
[](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
#ifndef _WIN32
// -- python binds for parallel executor. // -- python binds for parallel executor.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor"); py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC( py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
...@@ -913,7 +908,6 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -913,7 +908,6 @@ All parameter, weight, gradient are variables in Paddle.
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
self.Run(fetch_tensors, fetched_var_name); self.Run(fetch_tensors, fetched_var_name);
}); });
#endif
BindRecordIOWriter(&m); BindRecordIOWriter(&m);
return m.ptr(); return m.ptr();
......
...@@ -47,8 +47,7 @@ from . import profiler ...@@ -47,8 +47,7 @@ from . import profiler
from . import unique_name from . import unique_name
from . import recordio_writer from . import recordio_writer
from . import parallel_executor from . import parallel_executor
if os.name != 'nt': from .parallel_executor import *
from .parallel_executor import *
from paddle.fluid.layers.math_op_patch import monkey_patch_variable from paddle.fluid.layers.math_op_patch import monkey_patch_variable
Tensor = LoDTensor Tensor = LoDTensor
......
...@@ -25,11 +25,11 @@ import os ...@@ -25,11 +25,11 @@ import os
__all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy'] __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']
if os.name != 'nt': ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy
BuildStrategy = core.ParallelExecutor.BuildStrategy
class ParallelExecutor(object):
class ParallelExecutor(object):
""" """
ParallelExecutor is designed for data parallelism, which focuses on distributing ParallelExecutor is designed for data parallelism, which focuses on distributing
the data across different nodes and every node operates on the data in parallel. the data across different nodes and every node operates on the data in parallel.
...@@ -160,8 +160,7 @@ if os.name != 'nt': ...@@ -160,8 +160,7 @@ if os.name != 'nt':
for p in main.global_block().iter_parameters() for p in main.global_block().iter_parameters()
if not p.stop_gradient if not p.stop_gradient
]), ]),
set(cpt.to_text(var) set(cpt.to_text(var) for var in self.persistable_vars), main.desc,
for var in self.persistable_vars), main.desc,
cpt.to_text(loss_name) cpt.to_text(loss_name)
if loss_name else six.u(''), scope, local_scopes, exec_strategy, if loss_name else six.u(''), scope, local_scopes, exec_strategy,
build_strategy, num_trainers, trainer_id) build_strategy, num_trainers, trainer_id)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册