提交 3cb63956 编写于 作者: X Xin Pan

better profiler and benchmark

上级 38af7bca
...@@ -98,6 +98,8 @@ def parse_args(): ...@@ -98,6 +98,8 @@ def parse_args():
'--use_fake_data', '--use_fake_data',
action='store_true', action='store_true',
help='If set ommit the actual read data operators.') help='If set ommit the actual read data operators.')
parser.add_argument(
'--profile', action='store_true', help='If set, profile a few steps.')
parser.add_argument( parser.add_argument(
'--update_method', '--update_method',
type=str, type=str,
...@@ -108,8 +110,8 @@ def parse_args(): ...@@ -108,8 +110,8 @@ def parse_args():
return args return args
def append_nccl2_prepare(): def append_nccl2_prepare(trainer_id):
if os.getenv("PADDLE_TRAINER_ID", None) != None: if trainer_id >= 0:
# append gen_nccl_id at the end of startup program # append gen_nccl_id at the end of startup program
trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
port = os.getenv("PADDLE_PSERVER_PORT") port = os.getenv("PADDLE_PSERVER_PORT")
...@@ -136,12 +138,12 @@ def append_nccl2_prepare(): ...@@ -136,12 +138,12 @@ def append_nccl2_prepare():
}) })
return nccl_id_var, num_trainers, trainer_id return nccl_id_var, num_trainers, trainer_id
else: else:
raise Exception( raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
"must set PADDLE_TRAINER_ID env variables for dist train.") "nccl-based dist train.")
def dist_transpile(): def dist_transpile(trainer_id):
if "PADDLE_TRAINING_ROLE" not in os.environ: if trainer_id < 0:
return None, None return None, None
# the port of all pservers, needed by both trainer and pserver # the port of all pservers, needed by both trainer and pserver
...@@ -158,9 +160,6 @@ def dist_transpile(): ...@@ -158,9 +160,6 @@ def dist_transpile():
trainers = int(os.getenv("PADDLE_TRAINERS")) trainers = int(os.getenv("PADDLE_TRAINERS"))
# the IP of the local machine, needed by pserver only # the IP of the local machine, needed by pserver only
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
# the unique trainer id, starting from 0, needed by trainer
# only
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
# the role, should be either PSERVER or TRAINER # the role, should be either PSERVER or TRAINER
training_role = os.getenv("PADDLE_TRAINING_ROLE") training_role = os.getenv("PADDLE_TRAINING_ROLE")
...@@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, ...@@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
iters = 0 iters = 0
start_time = time.time() start_time = time.time()
for batch_id, data in enumerate(train_reader()): for batch_id, data in enumerate(train_reader()):
if args.profile and pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
elif args.profile and pass_id == 0 and batch_id == 10:
profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
if iters == args.skip_batch_num: if iters == args.skip_batch_num:
start_time = time.time() start_time = time.time()
num_samples = 0 num_samples = 0
...@@ -334,7 +338,11 @@ def print_arguments(args): ...@@ -334,7 +338,11 @@ def print_arguments(args):
def main(): def main():
args = parse_args() args = parse_args()
print_arguments(args) print_arguments(args)
nccl_id_var, num_trainers, trainer_id = None, 1, 0
# the unique trainer id, starting from 0, needed by trainer
# only
nccl_id_var, num_trainers, trainer_id = (
None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))
if args.use_cprof: if args.use_cprof:
pr = cProfile.Profile() pr = cProfile.Profile()
...@@ -348,7 +356,7 @@ def main(): ...@@ -348,7 +356,7 @@ def main():
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program())
if args.update_method == "pserver": if args.update_method == "pserver":
train_prog, startup_prog = dist_transpile() train_prog, startup_prog = dist_transpile(trainer_id)
if not train_prog: if not train_prog:
raise Exception( raise Exception(
"Must configure correct environments to run dist train.") "Must configure correct environments to run dist train.")
...@@ -364,7 +372,7 @@ def main(): ...@@ -364,7 +372,7 @@ def main():
train_args.append(fluid.default_startup_program()) train_args.append(fluid.default_startup_program())
if args.update_method == "nccl2": if args.update_method == "nccl2":
nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare() nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
if args.gpus == 1: if args.gpus == 1:
# NOTE: parallel executor use profiler interanlly # NOTE: parallel executor use profiler interanlly
if args.use_nvprof and args.device == 'GPU': if args.use_nvprof and args.device == 'GPU':
......
...@@ -272,6 +272,8 @@ if(NOT WITH_MKLDNN) ...@@ -272,6 +272,8 @@ if(NOT WITH_MKLDNN)
list(REMOVE_ITEM GENERAL_OPS fc_op) list(REMOVE_ITEM GENERAL_OPS fc_op)
endif(NOT WITH_MKLDNN) endif(NOT WITH_MKLDNN)
list(REMOVE_ITEM GENERAL_OPS reduce_op)
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
op_library(${src}) op_library(${src})
endforeach() endforeach()
......
...@@ -38,6 +38,7 @@ struct EventList; ...@@ -38,6 +38,7 @@ struct EventList;
static int64_t profiler_lister_id = 0; static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false; static bool should_send_profile_state = false;
std::mutex profiler_mu;
// The profiler state, the initial value is ProfilerState::kDisabled // The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled; static ProfilerState g_state = ProfilerState::kDisabled;
...@@ -228,11 +229,13 @@ void EnableProfiler(ProfilerState state) { ...@@ -228,11 +229,13 @@ void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled, PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ", "Can't enbale profling, since the input state is ",
"ProfilerState::kDisabled"); "ProfilerState::kDisabled");
std::lock_guard<std::mutex> l(profiler_mu);
if (state == g_state) { if (state == g_state) {
return; return;
} }
g_state = state; g_state = state;
should_send_profile_state = true; { should_send_profile_state = true; }
GetDeviceTracer()->Enable(); GetDeviceTracer()->Enable();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) { if (g_state == ProfilerState::kCUDA) {
...@@ -295,7 +298,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -295,7 +298,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
} else if (g_state == ProfilerState::kAll) { } else if (g_state == ProfilerState::kAll) {
place = "All"; place = "All";
} else { } else {
PADDLE_THROW("Invalid profiler state"); PADDLE_THROW("Invalid profiler state", g_state);
} }
std::cout << "Place: " << place << std::endl; std::cout << "Place: " << place << std::endl;
...@@ -443,6 +446,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -443,6 +446,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) { const std::string& profile_path) {
std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop. // Mark the profiling stop.
Mark("_stop_profiler_", nullptr); Mark("_stop_profiler_", nullptr);
...@@ -456,7 +460,7 @@ void DisableProfiler(EventSortingKey sorted_key, ...@@ -456,7 +460,7 @@ void DisableProfiler(EventSortingKey sorted_key,
tracer->GenProfile(profile_path); tracer->GenProfile(profile_path);
} }
g_state = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
should_send_profile_state = true; { should_send_profile_state = true; }
} }
bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; } bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
...@@ -466,7 +470,7 @@ void SetProfileListener() { ...@@ -466,7 +470,7 @@ void SetProfileListener() {
std::mt19937 rng; std::mt19937 rng;
rng.seed(std::random_device()()); rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist6( std::uniform_int_distribution<std::mt19937::result_type> dist6(
1, std::numeric_limits<std::mt19937::result_type>::max()); 1, std::numeric_limits<int>::max());
profiler_lister_id = dist6(rng); profiler_lister_id = dist6(rng);
} }
int64_t ListenerId() { return profiler_lister_id; } int64_t ListenerId() { return profiler_lister_id; }
......
...@@ -16,7 +16,10 @@ import core ...@@ -16,7 +16,10 @@ import core
from contextlib import contextmanager from contextlib import contextmanager
import os import os
__all__ = ['cuda_profiler', 'reset_profiler', 'profiler'] __all__ = [
'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
'stop_profiler'
]
NVPROF_CONFIG = [ NVPROF_CONFIG = [
"gpustarttimestamp", "gpustarttimestamp",
...@@ -72,6 +75,36 @@ def reset_profiler(): ...@@ -72,6 +75,36 @@ def reset_profiler():
core.reset_profiler() core.reset_profiler()
def start_profiler(state):
    """Enable the profiler in the requested state.

    Args:
        state (string) : Must be one of 'CPU', 'GPU' or 'All'. 'GPU' selects
            ProfilerState.kCUDA, 'CPU' selects ProfilerState.kCPU and 'All'
            selects ProfilerState.kAll.

    Raises:
        ValueError: If state is not 'CPU', 'GPU' or 'All'.
    """
    if state not in ('CPU', 'GPU', 'All'):
        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")

    # Resolve only the enum member that is actually needed; anything that
    # passed validation and is neither 'GPU' nor 'CPU' must be 'All'.
    member_name = {'GPU': 'kCUDA', 'CPU': 'kCPU'}.get(state, 'kAll')
    core.enable_profiler(getattr(core.ProfilerState, member_name))
def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
    """Disable the profiler, passing on the sorting key and output path.

    Args:
        sorted_key (string) : How to sort the profiling results. None is
            treated as 'default'; otherwise it must be one of 'default',
            'calls', 'total', 'max', 'min' or 'ave'.
        profile_path (string) : Path forwarded to core.disable_profiler.

    Raises:
        ValueError: If sorted_key is not None and not one of the accepted
            keys.
    """
    if sorted_key is None:
        sorted_key = 'default'

    # Accepted keys paired with the EventSortingKey member each one selects.
    accepted_keys = ('default', 'calls', 'total', 'max', 'min', 'ave')
    member_names = ('kDefault', 'kCalls', 'kTotal', 'kMax', 'kMin', 'kAve')
    if sorted_key not in accepted_keys:
        raise ValueError("The sorted_key must be None or in 'calls', 'total', "
                         "'max', 'min' and 'ave'")

    member_name = dict(zip(accepted_keys, member_names))[sorted_key]
    # TODO(qingqing) : redirect C++ ostream to Python stream.
    # with core.ostream_redirect(stdout=True, stderr=True):
    core.disable_profiler(
        getattr(core.EventSortingKey, member_name), profile_path)
@contextmanager @contextmanager
def profiler(state, sorted_key=None, profile_path='/tmp/profile'): def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
"""The profiler interface. """The profiler interface.
...@@ -98,29 +131,6 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): ...@@ -98,29 +131,6 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
profile_path (string) : If state == 'All', it will write a profile profile_path (string) : If state == 'All', it will write a profile
proto output file. proto output file.
""" """
if state not in ['CPU', 'GPU', "All"]: start_profiler(state)
raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
if state == "GPU":
prof_state = core.ProfilerState.kCUDA
elif state == "CPU":
prof_state = core.ProfilerState.kCPU
else:
prof_state = core.ProfilerState.kAll
core.enable_profiler(prof_state)
yield yield
stop_profiler(sorted_key, profile_path)
sorted_key = 'default' if sorted_key is None else sorted_key
if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
raise ValueError("The sorted_key must be None or in 'calls', 'total', "
"'max', 'min' and 'ave'")
key_map = {
'default': core.EventSortingKey.kDefault,
'calls': core.EventSortingKey.kCalls,
'total': core.EventSortingKey.kTotal,
'max': core.EventSortingKey.kMax,
'min': core.EventSortingKey.kMin,
'ave': core.EventSortingKey.kAve,
}
# TODO(qingqing) : redirect C++ ostream to Python stream.
# with core.ostream_redirect(stdout=True, stderr=True):
core.disable_profiler(key_map[sorted_key], profile_path)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册