add TrainFilesWithTimer in async_executor

2dee8f6c · dongdaxiang · cf6188a8 · 2dee8f6c · 2dee8f6c · 2dee8f6c
6 changed file
--- a/paddle/fluid/framework/async_executor.cc
+++ b/paddle/fluid/framework/async_executor.cc
@@ -304,8 +304,13 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program,
  // start executing ops in multiple threads
  for (int thidx = 0; thidx < actual_thread_num; ++thidx) {
-    threads.push_back(
+    if (debug) {
-        std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+      threads.push_back(std::thread(&ExecutorThreadWorker::TrainFilesWithTimer,
+                                    workers[thidx].get()));
+    } else {
+      threads.push_back(
+          std::thread(&ExecutorThreadWorker::TrainFiles, workers[thidx].get()));
+    }
  }
  for (auto& th : threads) {

--- a/paddle/fluid/framework/executor_thread_worker.cc
+++ b/paddle/fluid/framework/executor_thread_worker.cc
@@ -29,6 +29,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/io.h"
 #include "paddle/fluid/platform/cpu_helper.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/fluid/platform/timer.h"
 #include "paddle/fluid/pybind/pybind.h"
 namespace paddle {
 namespace framework {
@@ -180,6 +181,7 @@ void ExecutorThreadWorker::SetDevice() {
  return;
 #else
  static unsigned concurrency_cap = std::thread::hardware_concurrency();
+  LOG(WARNING) << "concurrency capacity " << concurrency_cap;
  int thread_id = this->thread_id_;
  if (static_cast<unsigned>(thread_id) < concurrency_cap) {
@@ -238,6 +240,49 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) {
  VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type();
 }
+void ExecutorThreadWorker::TrainFilesWithTimer() {
+  platform::SetNumThreads(1);
+  SetDevice();
+  thread_reader_->Start();
+  std::vector<double> op_total_time;
+  std::vector<std::string> op_name;
+  for (auto& op : ops_) {
+    op_name.push_back(op->Type());
+  }
+  op_total_time.resize(ops_.size());
+  for (size_t i = 0; i < op_total_time.size(); ++i) {
+    op_total_time[i] = 0.0;
+  }
+  platform::Timer timeline;
+  double total_time = 0.0;
+  double read_time = 0.0;
+  int cur_batch;
+  int batch_cnt = 0;
+  timeline.Start();
+  while ((cur_batch = thread_reader_->Next()) > 0) {
+    timeline.Pause();
+    read_time += timeline.ElapsedSec();
+    total_time += timeline.ElapsedSec();
+    for (size_t i = 0; i < ops_.size(); ++i) {
+      timeline.Start();
+      ops_[i]->Run(*thread_scope_, place_);
+      timeline.Pause();
+      op_total_time[i] += timeline.ElapsedSec();
+      total_time += timeline.ElapsedSec();
+    }
+    ++batch_cnt;
+    thread_scope_->DropKids();
+    if (batch_cnt > 0 && batch_cnt % 1000 == 0) {
+      for (size_t i = 0; i < ops_.size(); ++i) {
+        fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
+                op_name[i].c_str(), op_total_time[i] / batch_cnt);
+      }
+      fprintf(stderr, "mean read time: %fs\n", read_time / batch_cnt);
+    }
+    timeline.Start();
+  }
+}
 void ExecutorThreadWorker::TrainFiles() {
  platform::SetNumThreads(1);

--- a/paddle/fluid/framework/executor_thread_worker.h
+++ b/paddle/fluid/framework/executor_thread_worker.h
@@ -155,6 +155,8 @@ class ExecutorThreadWorker {
  void SetDataFeed(const std::shared_ptr<DataFeed>& datafeed);
  // A multi-thread training function
  virtual void TrainFiles();
+  // with timer log
+  virtual void TrainFilesWithTimer();
  // set fetch variable names from python interface assigned by users
  void SetFetchVarNames(const std::vector<std::string>& fetch_var_names);
 #ifdef PADDLE_WITH_PSLIB

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -84,6 +84,8 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
+cc_library(timer SRCS timer.cc)
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)

--- a/paddle/fluid/platform/timer.cc
+++ b/paddle/fluid/platform/timer.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/platform/timer.h"
+namespace paddle {
+namespace platform {
+void Timer::Reset() {
+  _start.tv_sec = 0;
+  _start.tv_usec = 0;
+  _count = 0;
+  _elapsed = 0;
+  _paused = true;
+}
+void Timer::Start() {
+  Reset();
+  Resume();
+}
+void Timer::Pause() {
+  if (_paused) {
+    return;
+  }
+  _elapsed += Tickus();
+  ++_count;
+  _paused = true;
+}
+void Timer::Resume() {
+  gettimeofday(&_start, NULL);
+  _paused = false;
+}
+int Timer::Count() { return _count; }
+double Timer::ElapsedUS() { return static_cast<double>(_elapsed); }
+double Timer::ElapsedMS() { return _elapsed / 1000.0; }
+double Timer::ElapsedSec() { return _elapsed / 1000000.0; }
+int64_t Timer::Tickus() {
+  gettimeofday(&_now, NULL);
+  return (_now.tv_sec - _start.tv_sec) * 1000 * 1000L +
+         (_now.tv_usec - _start.tv_usec);
+}
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/fluid/platform/timer.h
+++ b/paddle/fluid/platform/timer.h
@@ -14,74 +14,33 @@ limitations under the License. */
 #pragma once
 #include <sys/time.h>
-#include <pmmintrin.h>
+#include "paddle/fluid/framework/scope.h"
-#include <string>
-#include <vector>
 namespace paddle {
 namespace platform {
 // A Standard Timer implementation for debugging
 class Timer {
 public:
-    Timer() {
+  Timer() { Reset(); }
-        reset();
+  void Reset();
-    }
+  void Start();
+  void Pause();
-    inline void reset() {
+  void Resume();
-        _start.tv_sec = 0;
+  int Count();
-        _start.tv_usec = 0;
+  double ElapsedUS();
+  double ElapsedMS();
-        _count = 0;
+  double ElapsedSec();
-        _elapsed = 0;
-        _paused = true;
-    }
-    inline void start() {
-        reset();
-        resume();
-    }
-    inline void pause() {
-        if (_paused) {
-            return;
-        }
-        _elapsed += tickus();
-        ++_count;
-        _paused = true;
-    }
-    inline void resume() {
-        gettimeofday(&_start, NULL);
-        _paused = false;
-    }
-    inline int count() const {
-        return _count;
-    }
-    inline double elapsed_us() const {
-        return static_cast<double>(_elapsed);
-    }
-    inline double elapsed_ms() const {
-        return _elapsed / 1000.0;
-    }
-    inline double elapsed_sec() const {
-        return _elapsed / 1000000.0;
-    }
 private:
-    struct timeval _start;
+  struct timeval _start;
-    struct timeval _now;
+  struct timeval _now;
+  int _count;
+  int _elapsed;
+  bool _paused;
-    int32_t _count;
+  int64_t Tickus();
-    int64_t _elapsed;
-    bool _paused;
-    inline int64_t tickus() {
-        gettimeofday(&_now, NULL);
-        return (_now.tv_sec - _start.tv_sec) * 1000 * 1000L +
-            (_now.tv_usec - _start.tv_usec);
-    }
 };
+}  // namespace platform
+}  // namespace paddle