提交 3cb63956 编写于 作者: X Xin Pan

better profiler and benchmark

上级 38af7bca
......@@ -98,6 +98,8 @@ def parse_args():
'--use_fake_data',
action='store_true',
help='If set ommit the actual read data operators.')
parser.add_argument(
'--profile', action='store_true', help='If set, profile a few steps.')
parser.add_argument(
'--update_method',
type=str,
......@@ -108,8 +110,8 @@ def parse_args():
return args
def append_nccl2_prepare():
if os.getenv("PADDLE_TRAINER_ID", None) != None:
def append_nccl2_prepare(trainer_id):
if trainer_id >= 0:
# append gen_nccl_id at the end of startup program
trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
port = os.getenv("PADDLE_PSERVER_PORT")
......@@ -136,12 +138,12 @@ def append_nccl2_prepare():
})
return nccl_id_var, num_trainers, trainer_id
else:
raise Exception(
"must set PADDLE_TRAINER_ID env variables for dist train.")
raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
"nccl-based dist train.")
def dist_transpile():
if "PADDLE_TRAINING_ROLE" not in os.environ:
def dist_transpile(trainer_id):
if trainer_id < 0:
return None, None
# the port of all pservers, needed by both trainer and pserver
......@@ -158,9 +160,6 @@ def dist_transpile():
trainers = int(os.getenv("PADDLE_TRAINERS"))
# the IP of the local machine, needed by pserver only
current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
# the unique trainer id, starting from 0, needed by trainer
# only
trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
# the role, should be either PSERVER or TRAINER
training_role = os.getenv("PADDLE_TRAINING_ROLE")
......@@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
iters = 0
start_time = time.time()
for batch_id, data in enumerate(train_reader()):
if args.profile and pass_id == 0 and batch_id == 5:
profiler.start_profiler("All")
elif args.profile and pass_id == 0 and batch_id == 10:
profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
if iters == args.skip_batch_num:
start_time = time.time()
num_samples = 0
......@@ -334,7 +338,11 @@ def print_arguments(args):
def main():
args = parse_args()
print_arguments(args)
nccl_id_var, num_trainers, trainer_id = None, 1, 0
# the unique trainer id, starting from 0, needed by trainer
# only
nccl_id_var, num_trainers, trainer_id = (
None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))
if args.use_cprof:
pr = cProfile.Profile()
......@@ -348,7 +356,7 @@ def main():
fluid.memory_optimize(fluid.default_main_program())
if args.update_method == "pserver":
train_prog, startup_prog = dist_transpile()
train_prog, startup_prog = dist_transpile(trainer_id)
if not train_prog:
raise Exception(
"Must configure correct environments to run dist train.")
......@@ -364,7 +372,7 @@ def main():
train_args.append(fluid.default_startup_program())
if args.update_method == "nccl2":
nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare()
nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
if args.gpus == 1:
# NOTE: parallel executor use profiler interanlly
if args.use_nvprof and args.device == 'GPU':
......
......@@ -272,6 +272,8 @@ if(NOT WITH_MKLDNN)
list(REMOVE_ITEM GENERAL_OPS fc_op)
endif(NOT WITH_MKLDNN)
list(REMOVE_ITEM GENERAL_OPS reduce_op)
foreach(src ${GENERAL_OPS})
op_library(${src})
endforeach()
......
......@@ -38,6 +38,7 @@ struct EventList;
static int64_t profiler_lister_id = 0;
static bool should_send_profile_state = false;
std::mutex profiler_mu;
// The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled;
......@@ -228,11 +229,13 @@ void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enbale profling, since the input state is ",
"ProfilerState::kDisabled");
std::lock_guard<std::mutex> l(profiler_mu);
if (state == g_state) {
return;
}
g_state = state;
should_send_profile_state = true;
{ should_send_profile_state = true; }
GetDeviceTracer()->Enable();
#ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) {
......@@ -295,7 +298,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
} else if (g_state == ProfilerState::kAll) {
place = "All";
} else {
PADDLE_THROW("Invalid profiler state");
PADDLE_THROW("Invalid profiler state", g_state);
}
std::cout << "Place: " << place << std::endl;
......@@ -443,6 +446,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) {
std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop.
Mark("_stop_profiler_", nullptr);
......@@ -456,7 +460,7 @@ void DisableProfiler(EventSortingKey sorted_key,
tracer->GenProfile(profile_path);
}
g_state = ProfilerState::kDisabled;
should_send_profile_state = true;
{ should_send_profile_state = true; }
}
bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
......@@ -466,7 +470,7 @@ void SetProfileListener() {
std::mt19937 rng;
rng.seed(std::random_device()());
std::uniform_int_distribution<std::mt19937::result_type> dist6(
1, std::numeric_limits<std::mt19937::result_type>::max());
1, std::numeric_limits<int>::max());
profiler_lister_id = dist6(rng);
}
int64_t ListenerId() { return profiler_lister_id; }
......
......@@ -16,7 +16,10 @@ import core
from contextlib import contextmanager
import os
__all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
__all__ = [
'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
'stop_profiler'
]
NVPROF_CONFIG = [
"gpustarttimestamp",
......@@ -72,6 +75,36 @@ def reset_profiler():
core.reset_profiler()
def start_profiler(state):
if state not in ['CPU', 'GPU', "All"]:
raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
if state == "GPU":
prof_state = core.ProfilerState.kCUDA
elif state == "CPU":
prof_state = core.ProfilerState.kCPU
else:
prof_state = core.ProfilerState.kAll
core.enable_profiler(prof_state)
def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
sorted_key = 'default' if sorted_key is None else sorted_key
if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
raise ValueError("The sorted_key must be None or in 'calls', 'total', "
"'max', 'min' and 'ave'")
key_map = {
'default': core.EventSortingKey.kDefault,
'calls': core.EventSortingKey.kCalls,
'total': core.EventSortingKey.kTotal,
'max': core.EventSortingKey.kMax,
'min': core.EventSortingKey.kMin,
'ave': core.EventSortingKey.kAve,
}
# TODO(qingqing) : redirect C++ ostream to Python stream.
# with core.ostream_redirect(stdout=True, stderr=True):
core.disable_profiler(key_map[sorted_key], profile_path)
@contextmanager
def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
"""The profiler interface.
......@@ -98,29 +131,6 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
profile_path (string) : If state == 'All', it will write a profile
proto output file.
"""
if state not in ['CPU', 'GPU', "All"]:
raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
if state == "GPU":
prof_state = core.ProfilerState.kCUDA
elif state == "CPU":
prof_state = core.ProfilerState.kCPU
else:
prof_state = core.ProfilerState.kAll
core.enable_profiler(prof_state)
start_profiler(state)
yield
sorted_key = 'default' if sorted_key is None else sorted_key
if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
raise ValueError("The sorted_key must be None or in 'calls', 'total', "
"'max', 'min' and 'ave'")
key_map = {
'default': core.EventSortingKey.kDefault,
'calls': core.EventSortingKey.kCalls,
'total': core.EventSortingKey.kTotal,
'max': core.EventSortingKey.kMax,
'min': core.EventSortingKey.kMin,
'ave': core.EventSortingKey.kAve,
}
# TODO(qingqing) : redirect C++ ostream to Python stream.
# with core.ostream_redirect(stdout=True, stderr=True):
core.disable_profiler(key_map[sorted_key], profile_path)
stop_profiler(sorted_key, profile_path)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册