Unverified commit 35d5b1b9, authored by Xin Pan, committed by GitHub

Merge pull request #11036 from panyx0718/dist_timeline

better profiler and benchmark
@@ -98,6 +98,8 @@ def parse_args():
         '--use_fake_data',
         action='store_true',
         help='If set, omit the actual read data operators.')
+    parser.add_argument(
+        '--profile', action='store_true', help='If set, profile a few steps.')
     parser.add_argument(
         '--update_method',
         type=str,
@@ -108,8 +110,8 @@ def parse_args():
     return args
 
 
-def append_nccl2_prepare():
-    if os.getenv("PADDLE_TRAINER_ID", None) != None:
+def append_nccl2_prepare(trainer_id):
+    if trainer_id >= 0:
         # append gen_nccl_id at the end of startup program
-        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
         port = os.getenv("PADDLE_PSERVER_PORT")
@@ -136,12 +138,12 @@ def append_nccl2_prepare():
         })
         return nccl_id_var, num_trainers, trainer_id
     else:
-        raise Exception(
-            "must set PADDLE_TRAINER_ID env variables for dist train.")
+        raise Exception("must set positive PADDLE_TRAINER_ID env variable for "
+                        "nccl-based dist train.")
 
 
-def dist_transpile():
-    if "PADDLE_TRAINING_ROLE" not in os.environ:
+def dist_transpile(trainer_id):
+    if trainer_id < 0:
         return None, None
     # the port of all pservers, needed by both trainer and pserver
@@ -158,9 +160,6 @@ def dist_transpile():
     trainers = int(os.getenv("PADDLE_TRAINERS"))
     # the IP of the local machine, needed by pserver only
     current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
-    # the unique trainer id, starting from 0, needed by trainer
-    # only
-    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
     # the role, should be either PSERVER or TRAINER
     training_role = os.getenv("PADDLE_TRAINING_ROLE")
@@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
     iters = 0
     start_time = time.time()
     for batch_id, data in enumerate(train_reader()):
+        if args.profile and pass_id == 0 and batch_id == 5:
+            profiler.start_profiler("All")
+        elif args.profile and pass_id == 0 and batch_id == 10:
+            profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
         if iters == args.skip_batch_num:
             start_time = time.time()
             num_samples = 0
@@ -334,7 +338,11 @@ def print_arguments(args):
 def main():
     args = parse_args()
     print_arguments(args)
-    nccl_id_var, num_trainers, trainer_id = None, 1, 0
+    # the unique trainer id, starting from 0, needed by trainer
+    # only
+    nccl_id_var, num_trainers, trainer_id = (
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))
     if args.use_cprof:
         pr = cProfile.Profile()
@@ -348,7 +356,7 @@ def main():
     fluid.memory_optimize(fluid.default_main_program())
 
     if args.update_method == "pserver":
-        train_prog, startup_prog = dist_transpile()
+        train_prog, startup_prog = dist_transpile(trainer_id)
         if not train_prog:
             raise Exception(
                 "Must configure correct environments to run dist train.")
@@ -364,7 +372,7 @@ def main():
         train_args.append(fluid.default_startup_program())
 
         if args.update_method == "nccl2":
-            nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare()
+            nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
         if args.gpus == 1:
             # NOTE: parallel executor uses profiler internally
             if args.use_nvprof and args.device == 'GPU':
......
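The `--profile` flag added above opens a fixed profiling window inside the training loop: profiling starts at batch 5 of the first pass and stops before batch 10, so the trace covers warmed-up steps rather than startup overhead, and each trainer writes its own output file. A minimal standalone sketch of that pattern, where `train_one_batch` is a hypothetical stand-in for the real forward/backward/update step:

import paddle.fluid.profiler as profiler

def run_first_pass(train_reader, args, trainer_id, pass_id=0):
    for batch_id, data in enumerate(train_reader()):
        # Open the window at batch 5 of pass 0; 'All' records CPU and GPU
        # events and allows timeline generation.
        if args.profile and pass_id == 0 and batch_id == 5:
            profiler.start_profiler("All")
        # Close it at batch 10: sort the report by total time and write a
        # per-trainer file so distributed trainers do not overwrite each other.
        elif args.profile and pass_id == 0 and batch_id == 10:
            profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
        train_one_batch(data)  # hypothetical training step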
@@ -38,6 +38,7 @@ struct EventList;
 static int64_t profiler_lister_id = 0;
 static bool should_send_profile_state = false;
+std::mutex profiler_mu;
 
 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
@@ -228,6 +229,8 @@ void EnableProfiler(ProfilerState state) {
   PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                  "Can't enable profiling, since the input state is ",
                  "ProfilerState::kDisabled");
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (state == g_state) {
     return;
   }
@@ -295,7 +298,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
   } else if (g_state == ProfilerState::kAll) {
     place = "All";
   } else {
-    PADDLE_THROW("Invalid profiler state");
+    PADDLE_THROW("Invalid profiler state", g_state);
   }
   std::cout << "Place: " << place << std::endl;
@@ -443,6 +446,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
 void DisableProfiler(EventSortingKey sorted_key,
                      const std::string& profile_path) {
+  std::lock_guard<std::mutex> l(profiler_mu);
   if (g_state == ProfilerState::kDisabled) return;
   // Mark the profiling stop.
   Mark("_stop_profiler_", nullptr);
@@ -466,7 +470,7 @@ void SetProfileListener() {
   std::mt19937 rng;
   rng.seed(std::random_device()());
   std::uniform_int_distribution<std::mt19937::result_type> dist6(
-      1, std::numeric_limits<std::mt19937::result_type>::max());
+      1, std::numeric_limits<int>::max());
   profiler_lister_id = dist6(rng);
 }
 
 int64_t ListenerId() { return profiler_lister_id; }
......
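The new `profiler_mu` serializes `EnableProfiler` and `DisableProfiler`, so concurrent attempts to toggle profiling (for example from the benchmark loop and a distributed profiling listener) cannot race on `g_state`. Combined with the `is_profiler_enabled` binding below, the Python-level start/stop calls become idempotent. A hedged sketch of the resulting behavior, assuming the bindings in this PR are built:

import paddle.fluid.profiler as profiler

profiler.start_profiler("All")                    # enables profiling
profiler.start_profiler("All")                    # no-op: already enabled
profiler.stop_profiler("total", "/tmp/profile")   # disables, prints and dumps
profiler.stop_profiler("total", "/tmp/profile")   # no-op: already disabled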
@@ -495,6 +495,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("enable_profiler", platform::EnableProfiler);
   m.def("disable_profiler", platform::DisableProfiler);
+  m.def("is_profiler_enabled", platform::IsProfileEnabled);
   m.def("reset_profiler", platform::ResetProfiler);
 
   // -- python binds for parallel executor.
......
@@ -16,7 +16,10 @@ import core
 from contextlib import contextmanager
 import os
 
-__all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
+__all__ = [
+    'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
+    'stop_profiler'
+]
 
 NVPROF_CONFIG = [
     "gpustarttimestamp",
@@ -72,20 +75,31 @@ def reset_profiler():
     core.reset_profiler()
 
 
-@contextmanager
-def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
-    """The profiler interface.
-    Different from cuda_profiler, this profiler can be used to profile both CPU
-    and GPU program. By default, it records the CPU and GPU operator kernels;
-    if you want to profile other program, you can refer to the profiling
-    tutorial to add more records.
+def start_profiler(state):
+    """Enable the profiler.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU', 'GPU'
+            or 'All'. 'CPU' means only profile CPU. 'GPU' means profiling
+            GPU as well. 'All' also generates timeline.
+    """
+    if core.is_profiler_enabled():
+        return
+    if state not in ['CPU', 'GPU', "All"]:
+        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
+    if state == "GPU":
+        prof_state = core.ProfilerState.kCUDA
+    elif state == "CPU":
+        prof_state = core.ProfilerState.kCPU
+    else:
+        prof_state = core.ProfilerState.kAll
+    core.enable_profiler(prof_state)
+
+
+def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
+    """Stop the profiler.
 
     Args:
-        state (string) : The profiling state, which should be 'CPU' or 'GPU',
-            telling the profiler to use CPU timer or GPU timer for profiling.
-            Although users may have already specified the execution place
-            (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler
-            would not inherit this place.
         sorted_key (string) : If None, the profiling results will be printed
             in the order of first end time of events. Otherwise, the profiling
             results will be sorted by this flag. This flag should be one
@@ -98,17 +112,8 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
         profile_path (string) : If state == 'All', it will write a profile
            proto output file.
     """
-    if state not in ['CPU', 'GPU', "All"]:
-        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
-    if state == "GPU":
-        prof_state = core.ProfilerState.kCUDA
-    elif state == "CPU":
-        prof_state = core.ProfilerState.kCPU
-    else:
-        prof_state = core.ProfilerState.kAll
-    core.enable_profiler(prof_state)
-    yield
+    if not core.is_profiler_enabled():
+        return
     sorted_key = 'default' if sorted_key is None else sorted_key
     if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
         raise ValueError("The sorted_key must be None or in 'calls', 'total', "
@@ -124,3 +129,34 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
     # TODO(qingqing) : redirect C++ ostream to Python stream.
     # with core.ostream_redirect(stdout=True, stderr=True):
     core.disable_profiler(key_map[sorted_key], profile_path)
+
+
+@contextmanager
+def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
+    """The profiler interface.
+    Different from cuda_profiler, this profiler can be used to profile both CPU
+    and GPU program. By default, it records the CPU and GPU operator kernels;
+    if you want to profile other program, you can refer to the profiling
+    tutorial to add more records.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU' or 'GPU',
+            telling the profiler to use CPU timer or GPU timer for profiling.
+            Although users may have already specified the execution place
+            (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler
+            would not inherit this place.
+        sorted_key (string) : If None, the profiling results will be printed
+            in the order of first end time of events. Otherwise, the profiling
+            results will be sorted by this flag. This flag should be one
+            of 'calls', 'total', 'max', 'min' or 'ave'.
+            The `calls` means sorting by the number of calls.
+            The `total` means sorting by the total execution time.
+            The `max` means sorting by the maximum execution time.
+            The `min` means sorting by the minimum execution time.
+            The `ave` means sorting by the average execution time.
+        profile_path (string) : If state == 'All', it will write a profile
+            proto output file.
+    """
+    start_profiler(state)
+    yield
+    stop_profiler(sorted_key, profile_path)
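In short, the old context manager is split into `start_profiler`/`stop_profiler` so a profiling window can be opened and closed at arbitrary points (as the benchmark script now does), while `profiler` remains a thin wrapper for existing with-block callers. A usage sketch, where `run_some_batches` is a hypothetical workload:

import paddle.fluid.profiler as profiler

# Explicit window: start and stop may live in different functions.
profiler.start_profiler("All")
run_some_batches()
profiler.stop_profiler(sorted_key="total", profile_path="/tmp/profile")

# Context-manager style, unchanged for existing callers.
with profiler.profiler("GPU", sorted_key="ave"):
    run_some_batches()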