better profiler and benchmark

3cb63956 · Xin Pan · 38af7bca · 3cb63956 · 3cb63956 · 3cb63956
4 changed files
--- a/benchmark/fluid/fluid_benchmark.py
+++ b/benchmark/fluid/fluid_benchmark.py
@@ -98,6 +98,8 @@ def parse_args():
        '--use_fake_data',
        action='store_true',
        help='If set ommit the actual read data operators.')
+    parser.add_argument(
+        '--profile', action='store_true', help='If set, profile a few steps.')
    parser.add_argument(
        '--update_method',
        type=str,
@@ -108,8 +110,8 @@ def parse_args():
    return args


-def append_nccl2_prepare():
-    if os.getenv("PADDLE_TRAINER_ID", None) != None:
+def append_nccl2_prepare(trainer_id):
+    if trainer_id >= 0:
        # append gen_nccl_id at the end of startup program
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        port = os.getenv("PADDLE_PSERVER_PORT")
@@ -136,12 +138,12 @@ def append_nccl2_prepare():
            })
        return nccl_id_var, num_trainers, trainer_id
    else:
-        raise Exception(
-            "must set PADDLE_TRAINER_ID env variables for dist train.")
+        raise Exception("must set positive PADDLE_TRAINER_ID env variables for "
+                        "nccl-based dist train.")


-def dist_transpile():
-    if "PADDLE_TRAINING_ROLE" not in os.environ:
+def dist_transpile(trainer_id):
+    if trainer_id < 0:
        return None, None

    # the port of all pservers, needed by both trainer and pserver
@@ -158,9 +160,6 @@ def dist_transpile():
    trainers = int(os.getenv("PADDLE_TRAINERS"))
    # the IP of the local machine, needed by pserver only
    current_endpoint = os.getenv("PADDLE_CURRENT_IP", "") + ":" + port
-    # the unique trainer id, starting from 0, needed by trainer
-    # only
-    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
    # the role, should be either PSERVER or TRAINER
    training_role = os.getenv("PADDLE_TRAINING_ROLE")

@@ -295,6 +294,11 @@ def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
        iters = 0
        start_time = time.time()
        for batch_id, data in enumerate(train_reader()):
+            if args.profile and pass_id == 0 and batch_id == 5:
+                profiler.start_profiler("All")
+            elif args.profile and pass_id == 0 and batch_id == 10:
+                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)
+
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
@@ -334,7 +338,11 @@ def print_arguments(args):
 def main():
    args = parse_args()
    print_arguments(args)
-    nccl_id_var, num_trainers, trainer_id = None, 1, 0
+
+    # the unique trainer id, starting from 0, needed by trainer only;
+    # defaults to -1 when PADDLE_TRAINER_ID is not set
+    nccl_id_var, num_trainers, trainer_id = (
+        None, 1, int(os.getenv("PADDLE_TRAINER_ID", "-1")))

    if args.use_cprof:
        pr = cProfile.Profile()
@@ -348,7 +356,7 @@ def main():
        fluid.memory_optimize(fluid.default_main_program())

    if args.update_method == "pserver":
-        train_prog, startup_prog = dist_transpile()
+        train_prog, startup_prog = dist_transpile(trainer_id)
        if not train_prog:
            raise Exception(
                "Must configure correct environments to run dist train.")
@@ -364,7 +372,7 @@ def main():
    train_args.append(fluid.default_startup_program())

    if args.update_method == "nccl2":
-        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare()
+        nccl_id_var, num_trainers, trainer_id = append_nccl2_prepare(trainer_id)
    if args.gpus == 1:
        # NOTE: parallel executor use profiler interanlly
        if args.use_nvprof and args.device == 'GPU':

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -272,6 +272,8 @@ if(NOT WITH_MKLDNN)
    list(REMOVE_ITEM GENERAL_OPS fc_op)
 endif(NOT WITH_MKLDNN)

+list(REMOVE_ITEM GENERAL_OPS reduce_op)
+
 foreach(src ${GENERAL_OPS})
    op_library(${src})
 endforeach()

--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -38,6 +38,7 @@ struct EventList;

 static int64_t profiler_lister_id = 0;
 static bool should_send_profile_state = false;
+std::mutex profiler_mu;

 // The profiler state, the initial value is ProfilerState::kDisabled
 static ProfilerState g_state = ProfilerState::kDisabled;
@@ -228,11 +229,13 @@ void EnableProfiler(ProfilerState state) {
  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
                 "Can't enbale profling, since the input state is ",
                 "ProfilerState::kDisabled");
+
+  std::lock_guard<std::mutex> l(profiler_mu);
  if (state == g_state) {
    return;
  }
  g_state = state;
-  should_send_profile_state = true;
+  { should_send_profile_state = true; }
  GetDeviceTracer()->Enable();
 #ifdef PADDLE_WITH_CUDA
  if (g_state == ProfilerState::kCUDA) {
@@ -295,7 +298,7 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
  } else if (g_state == ProfilerState::kAll) {
    place = "All";
  } else {
-    PADDLE_THROW("Invalid profiler state");
+    PADDLE_THROW("Invalid profiler state", g_state);
  }

  std::cout << "Place: " << place << std::endl;
@@ -443,6 +446,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,

 void DisableProfiler(EventSortingKey sorted_key,
                     const std::string& profile_path) {
+  std::lock_guard<std::mutex> l(profiler_mu);
  if (g_state == ProfilerState::kDisabled) return;
  // Mark the profiling stop.
  Mark("_stop_profiler_", nullptr);
@@ -456,7 +460,7 @@ void DisableProfiler(EventSortingKey sorted_key,
    tracer->GenProfile(profile_path);
  }
  g_state = ProfilerState::kDisabled;
-  should_send_profile_state = true;
+  { should_send_profile_state = true; }
 }

 bool IsProfileEnabled() { return g_state != ProfilerState::kDisabled; }
@@ -466,7 +470,7 @@ void SetProfileListener() {
  std::mt19937 rng;
  rng.seed(std::random_device()());
  std::uniform_int_distribution<std::mt19937::result_type> dist6(
-      1, std::numeric_limits<std::mt19937::result_type>::max());
+      1, std::numeric_limits<int>::max());
  profiler_lister_id = dist6(rng);
 }
 int64_t ListenerId() { return profiler_lister_id; }

--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
@@ -16,7 +16,10 @@ import core
 from contextlib import contextmanager
 import os

-__all__ = ['cuda_profiler', 'reset_profiler', 'profiler']
+__all__ = [
+    'cuda_profiler', 'reset_profiler', 'profiler', 'start_profiler',
+    'stop_profiler'
+]

 NVPROF_CONFIG = [
    "gpustarttimestamp",
@@ -72,6 +75,36 @@ def reset_profiler():
    core.reset_profiler()


+def start_profiler(state):
+    """Enable the profiler in the requested state.
+
+    Args:
+        state (string) : 'CPU', 'GPU' or 'All'. 'GPU' is mapped to
+            core.ProfilerState.kCUDA, 'CPU' to core.ProfilerState.kCPU
+            and 'All' to core.ProfilerState.kAll before being passed to
+            core.enable_profiler.
+
+    Raises:
+        ValueError: If state is not 'CPU', 'GPU' or 'All'.
+    """
+    if state not in ['CPU', 'GPU', "All"]:
+        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
+    if state == "GPU":
+        prof_state = core.ProfilerState.kCUDA
+    elif state == "CPU":
+        prof_state = core.ProfilerState.kCPU
+    else:
+        prof_state = core.ProfilerState.kAll
+    core.enable_profiler(prof_state)
+
+
+def stop_profiler(sorted_key=None, profile_path='/tmp/profile'):
+    """Disable the profiler through core.disable_profiler.
+
+    Args:
+        sorted_key (string) : None (treated as 'default') or one of
+            'calls', 'total', 'max', 'min', 'ave'. It is translated to the
+            matching core.EventSortingKey value.
+        profile_path (string) : Forwarded unchanged to
+            core.disable_profiler; presumably the path of the profile
+            output file -- confirm against the C++ DisableProfiler.
+
+    Raises:
+        ValueError: If sorted_key is not None and not one of the keys
+            listed above.
+    """
+    sorted_key = 'default' if sorted_key is None else sorted_key
+    if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
+        raise ValueError("The sorted_key must be None or in 'calls', 'total', "
+                         "'max', 'min' and 'ave'")
+    key_map = {
+        'default': core.EventSortingKey.kDefault,
+        'calls': core.EventSortingKey.kCalls,
+        'total': core.EventSortingKey.kTotal,
+        'max': core.EventSortingKey.kMax,
+        'min': core.EventSortingKey.kMin,
+        'ave': core.EventSortingKey.kAve,
+    }
+    # TODO(qingqing) : redirect C++ ostream to Python stream.
+    # with core.ostream_redirect(stdout=True, stderr=True):
+    core.disable_profiler(key_map[sorted_key], profile_path)
+
+
 @contextmanager
 def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
    """The profiler interface.
@@ -98,29 +131,6 @@ def profiler(state, sorted_key=None, profile_path='/tmp/profile'):
        profile_path (string) : If state == 'All', it will write a profile
            proto output file.
    """
-    if state not in ['CPU', 'GPU', "All"]:
-        raise ValueError("The state must be 'CPU' or 'GPU' or 'All'.")
-    if state == "GPU":
-        prof_state = core.ProfilerState.kCUDA
-    elif state == "CPU":
-        prof_state = core.ProfilerState.kCPU
-    else:
-        prof_state = core.ProfilerState.kAll
-    core.enable_profiler(prof_state)
+    start_profiler(state)
    yield
-
-    sorted_key = 'default' if sorted_key is None else sorted_key
-    if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
-        raise ValueError("The sorted_key must be None or in 'calls', 'total', "
-                         "'max', 'min' and 'ave'")
-    key_map = {
-        'default': core.EventSortingKey.kDefault,
-        'calls': core.EventSortingKey.kCalls,
-        'total': core.EventSortingKey.kTotal,
-        'max': core.EventSortingKey.kMax,
-        'min': core.EventSortingKey.kMin,
-        'ave': core.EventSortingKey.kAve,
-    }
-    # TODO(qingqing) : redirect C++ ostream to Python stream.
-    # with core.ostream_redirect(stdout=True, stderr=True):
-    core.disable_profiler(key_map[sorted_key], profile_path)
+    stop_profiler(sorted_key, profile_path)