diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 30b070e4acac60caa97a4e8ffd07462cb347ee93..c1d458970a58bfac2a3369e8964eb100568b28f2 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -98,6 +98,8 @@ def parse_args(): '--use_fake_data', action='store_true', help='If set ommit the actual read data operators.') + parser.add_argument( + '--profile', action='store_true', help='If set, profile a few steps.') parser.add_argument( '--update_method', type=str, @@ -108,8 +110,8 @@ def parse_args(): return args -def append_nccl2_prepare(): - if os.getenv("PADDLE_TRAINER_ID", None) != None: +def append_nccl2_prepare(trainer_id): + if trainer_id >= 0: # append gen_nccl_id at the end of startup program trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) port = os.getenv("PADDLE_PSERVER_PORT") @@ -136,12 +138,12 @@ def append_nccl2_prepare(): }) return nccl_id_var, num_trainers, trainer_id else: - raise Exception( - "must set PADDLE_TRAINER_ID env variables for dist train.") + raise Exception("must set positive PADDLE_TRAINER_ID env variables for " + "nccl-based dist train.") -def dist_transpile(): - if "PADDLE_TRAINING_ROLE" not in os.environ: +def dist_transpile(trainer_id): + if trainer_id < 0: return None, None # the port of all pservers, needed by both trainer and pserver @@ -158,9 +160,6 @@ def dist_transpile(): trainers = int(os.getenv("PADDLE_TRAINERS")) # the IP of the local machine, needed by pserver only current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port - # the unique trainer id, starting from 0, needed by trainer - # only - trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") @@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, iters = 0 start_time = time.time() for batch_id, data in enumerate(train_reader()): + if args.profile and pass_id == 0 and batch_id == 5: + profiler.start_profiler("All") + elif args.profile and pass_id == 0 and batch_id == 10: + profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id) + if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 @@ -334,7 +338,11 @@ def print_arguments(args): def main(): args = parse_args() print_arguments(args) - nccl_id_var, num_trainers, trainer_id = None, 1, 0 + + # the unique trainer id, starting from 0, needed by trainer + # only + nccl_id_var, num_trainers, trainer_id = ( + None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1"))) if args.use_cprof: pr = cProfile.Profile() @@ -348,7 +356,7 @@ def main(): fluid.memory_optimize(fluid.default_main_program()) if args.update_method == "pserver": - train_prog, startup_prog = dist_transpile() + train_prog, startup_prog = dist_transpile(trainer_id) if not train_prog: raise Exception( "Must configure correct environments to run dist train.") @@ -364,7 +372,7 @@ def main(): train_args.append(fluid.default_startup_program()) if args.update_method == "nccl2": - nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare() + nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id) if args.gpus == 1: # NOTE: parallel executor use profiler interanlly if args.use_nvprof and args.device == 'GPU': diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index e00cc73565fc98615090367606b6ba4f58feacfd..3f455b791b4e1ac32f0070fb5352994f1ab6777d 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -272,6 +272,8 @@ if(NOT WITH_MKLDNN) list(REMOVE_ITEM GENERAL_OPS fc_op) endif(NOT WITH_MKLDNN) +list(REMOVE_ITEM GENERAL_OPS reduce_op) + foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 2fb5c6dc6b8ad25fa1ad5fcf7c2acfedd5be4a83..04f450aa3ec9df7137f865b0a2eef07c1cb9a229 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -38,6 +38,7 @@ struct EventList; static int64_t profiler_lister_id = 0; static bool should_send_profile_state = false; +std::mutex profiler_mu; // The profiler state, the initial value is ProfilerState::kDisabled static ProfilerState g_state = ProfilerState::kDisabled; @@ -228,11 +229,13 @@ void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE(state != ProfilerState::kDisabled, "Can't enbale profling, since the input state is ", "ProfilerState::kDisabled"); + + std::lock_guard l(profiler_mu); if (state == g_state) { return; } g_state = state; - should_send_profile_state = true; + { should_send_profile_state = true; } GetDeviceTracer()->Enable(); #ifdef PADDLE_WITH_CUDA if (g_state == ProfilerState::kCUDA) { @@ -295,7 +298,7 @@ void PrintProfiler(const std::vector>& events_table, } else if (g_state == ProfilerState::kAll) { place = "All"; } else { - PADDLE_THROW("Invalid profiler state"); + PADDLE_THROW("Invalid profiler state", g_state); } std::cout << "Place: " << place << std::endl; @@ -443,6 +446,7 @@ void ParseEvents(const std::vector>& events, void DisableProfiler(EventSortingKey sorted_key, const std::string& profile_path) { + std::lock_guard l(profiler_mu); if (g_state == ProfilerState::kDisabled) return; // Mark the profiling stop. Mark("_stop_profiler_", nullptr); @@ -456,7 +460,7 @@ void DisableProfiler(EventSortingKey sorted_key, tracer->GenProfile(profile_path); } g_state = ProfilerState::kDisabled; - should_send_profile_state = true; + { should_send_profile_state = true; } } bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; } @@ -466,7 +470,7 @@ void SetProfileListener() { std::mt19937 rng; rng.seed(std::random_device()()); std::uniform_int_distribution dist6( - 1, std::numeric_limits::max()); + 1, std::numeric_limits::max()); profiler_lister_id = dist6(rng); } int64_t ListenerId() { return profiler_lister_id; } diff --git a/python/paddle/fluid/profiler.py b/python/paddle/fluid/profiler.py index 04fd05cc33cff3d720be75923d4af3767942669f..2e87cab88eca668b240ec44937a336caa0f9406f 100644 --- a/python/paddle/fluid/profiler.py +++ b/python/paddle/fluid/profiler.py @@ -16,7 +16,10 @@ import core from contextlib import contextmanager import os -__all__ = ['cuda_profiler', 'reset_profiler', 'profiler'] +__all__ = [ + 'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler', + 'stop_profiler' +] NVPROF_CONFIG = [ "gpustarttimestamp", @@ -72,6 +75,36 @@ def reset_profiler(): core.reset_profiler() +def start_profiler(state): + if state not in ['CPU', 'GPU', "All"]: + raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.") + if state == "GPU": + prof_state = core.ProfilerState.kCUDA + elif state == "CPU": + prof_state = core.ProfilerState.kCPU + else: + prof_state = core.ProfilerState.kAll + core.enable_profiler(prof_state) + + +def stop_profiler(sorted_key=None, profile_path='/tmp/profile'): + sorted_key = 'default' if sorted_key is None else sorted_key + if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']: + raise ValueError("The sorted_key must be None or in 'calls', 'total', " + "'max', 'min' and 'ave'") + key_map = { + 'default': core.EventSortingKey.kDefault, + 'calls': core.EventSortingKey.kCalls, + 'total': core.EventSortingKey.kTotal, + 'max': core.EventSortingKey.kMax, + 'min': core.EventSortingKey.kMin, + 'ave': core.EventSortingKey.kAve, + } + # TODO(qingqing) : redirect C++ ostream to Python stream. + # with core.ostream_redirect(stdout=True, stderr=True): + core.disable_profiler(key_map[sorted_key], profile_path) + + @contextmanager def profiler(state, sorted_key=None, profile_path='/tmp/profile'): """The profiler interface. @@ -98,29 +131,6 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'): profile_path (string) : If state == 'All', it will write a profile proto output file. """ - if state not in ['CPU', 'GPU', "All"]: - raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.") - if state == "GPU": - prof_state = core.ProfilerState.kCUDA - elif state == "CPU": - prof_state = core.ProfilerState.kCPU - else: - prof_state = core.ProfilerState.kAll - core.enable_profiler(prof_state) + start_profiler(state) yield - - sorted_key = 'default' if sorted_key is None else sorted_key - if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']: - raise ValueError("The sorted_key must be None or in 'calls', 'total', " - "'max', 'min' and 'ave'") - key_map = { - 'default': core.EventSortingKey.kDefault, - 'calls': core.EventSortingKey.kCalls, - 'total': core.EventSortingKey.kTotal, - 'max': core.EventSortingKey.kMax, - 'min': core.EventSortingKey.kMin, - 'ave': core.EventSortingKey.kAve, - } - # TODO(qingqing) : redirect C++ ostream to Python stream. - # with core.ostream_redirect(stdout=True, stderr=True): - core.disable_profiler(key_map[sorted_key], profile_path) + stop_profiler(sorted_key, profile_path)