From 8a634b716ad35865722bb3f48c1ce6d2c798566a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com>
Date: Wed, 4 Dec 2019 12:55:11 +0800
Subject: [PATCH] refactor profile tools, test=develop (#2536)

---
 lite/api/model_test.cc                        |  16 +-
 lite/core/CMakeLists.txt                      |   4 +-
 lite/core/arena/framework.cc                  |   3 +
 lite/core/kernel.h                            |  18 ++-
 lite/core/profile/CMakeLists.txt              |   3 +-
 lite/core/profile/profiler.cc                 | 117 ++++++++++++++++++
 lite/core/profile/profiler.h                  |  59 +++++++++
 lite/core/profile/test_timer.cc               |  81 ++++++++++++
 lite/core/profile/timer.h                     | 114 +++++++++++++++++
 lite/core/program.cc                          |   8 +-
 lite/core/program.h                           |  48 ++++---
 lite/tests/cv/image_convert_test.cc           |  12 +-
 lite/tests/math/conv_compute_test.cc          |  16 +--
 lite/tests/math/conv_int8_compute_test.cc     |  30 ++---
 .../tests/math/conv_transpose_compute_test.cc |  16 +--
 lite/tests/math/gemm_int8_compute_test.cc     |  30 ++---
 lite/tests/math/gemv_int8_compute_test.cc     |  30 ++---
 lite/tests/math/layout_compute_test.cc        |  52 ++++----
 lite/tests/math/pool_compute_test.cc          |  16 +--
 lite/tests/math/sgemm_c4_compute_test.cc      |  16 +--
 lite/tests/math/sgemm_compute_test.cc         |  16 +--
 lite/tests/math/sgemv_compute_test.cc         |  18 +--
 lite/tests/utils/timer.h                      | 105 ----------------
 23 files changed, 555 insertions(+), 273 deletions(-)
 create mode 100644 lite/core/profile/profiler.cc
 create mode 100644 lite/core/profile/profiler.h
 create mode 100644 lite/core/profile/test_timer.cc
 create mode 100644 lite/core/profile/timer.h
 delete mode 100644 lite/tests/utils/timer.h

diff --git a/lite/api/model_test.cc b/lite/api/model_test.cc
index 1358267000..a04e86b7d2 100644
--- a/lite/api/model_test.cc
+++ b/lite/api/model_test.cc
@@ -21,14 +21,14 @@
 #include "lite/api/paddle_use_passes.h"
 #include "lite/api/test_helper.h"
 #include "lite/core/device_info.h"
-#include "lite/tests/utils/timer.h"
+#include "lite/core/profile/timer.h"
 #include "lite/utils/cp_logging.h"
 #include "lite/utils/string.h"
 #ifdef LITE_WITH_PROFILE
 #include "lite/core/profile/basic_profiler.h"
 #endif  // LITE_WITH_PROFILE

-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;

 DEFINE_string(input_shape,
               "1,3,224,224",
@@ -102,20 +102,20 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,

   Timer ti;
   for (int j = 0; j < repeat; ++j) {
-    ti.start();
+    ti.Start();
     predictor->Run();
-    ti.end();
-    LOG(INFO) << "iter: " << j << ", time: " << ti.latest_time() << " ms";
+    float t = ti.Stop();
+    LOG(INFO) << "iter: " << j << ", time: " << t << " ms";
   }

   LOG(INFO) << "================== Speed Report ===================";
   LOG(INFO) << "Model: " << model_dir
             << ", power_mode: " << static_cast<int>(power_mode)
             << ", threads num " << thread_num << ", warmup: " << warmup_times
-            << ", repeats: " << repeat << ", avg time: " << ti.get_average_ms()
+            << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg()
             << " ms"
-            << ", min time: " << ti.get_min_time() << " ms"
-            << ", max time: " << ti.get_max_time() << " ms.";
+            << ", min time: " << ti.LapTimes().Min() << " ms"
+            << ", max time: " << ti.LapTimes().Max() << " ms.";

   auto output = predictor->GetOutput(0);
   auto out = output->data<float>();
diff --git a/lite/core/CMakeLists.txt b/lite/core/CMakeLists.txt
index 5eecf1d815..a93b962a47 100644
--- a/lite/core/CMakeLists.txt
+++ b/lite/core/CMakeLists.txt
@@ -99,7 +99,7 @@ add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc)

 #----------------------------------------------- NOT CHANGE -----------------------------------------------
 lite_cc_library(kernel SRCS kernel.cc
                 DEPS context type_system target_wrapper any op_params tensor
-                PROFILE_DEPS basic_profiler
+                PROFILE_DEPS lite_profiler
                 )
 lite_cc_library(op SRCS op_lite.cc
                 DEPS scope op_registry target_wrapper kernel cpp_op_desc tensor
@@ -113,7 +113,7 @@ lite_cc_library(type_system SRCS type_system.cc DEPS tensor target_wrapper)

 lite_cc_library(program SRCS program.cc
                 DEPS op kernel model_parser ${ops} ${cpp_wrapper}
-                PROFILE_DEPS basic_profiler)
+                PROFILE_DEPS lite_profiler)

 if (NOT LITE_ON_TINY_PUBLISH)
   lite_cc_library(optimizer SRCS optimizer.cc DEPS mir_pass_manager model_parser program)
diff --git a/lite/core/arena/framework.cc b/lite/core/arena/framework.cc
index c59c078787..561a508d20 100644
--- a/lite/core/arena/framework.cc
+++ b/lite/core/arena/framework.cc
@@ -37,6 +37,9 @@ void TestCase::CreateInstruction() {
   // prepare context
   (*it)->SetContext(std::move(ctx_));
   instruction_.reset(new Instruction(op, std::move(*it)));
+#ifdef LITE_WITH_PROFILE
+  instruction_->set_profiler(new profile::Profiler());
+#endif
 }

 void TestCase::PrepareInputsForInstruction() {
diff --git a/lite/core/kernel.h b/lite/core/kernel.h
index 05d7a6b333..86193235a2 100644
--- a/lite/core/kernel.h
+++ b/lite/core/kernel.h
@@ -31,7 +31,7 @@
 #include "lite/utils/replace_stl/stream.h"

 #ifdef LITE_WITH_PROFILE
-#include "lite/core/profile/basic_profiler.h"
+#include "lite/core/profile/profiler.h"
 #endif  // LITE_WITH_PROFILE

 namespace paddle {
@@ -58,7 +58,10 @@ class KernelBase {
   virtual void Run() = 0;

 #ifdef LITE_WITH_PROFILE
-  void SetProfileID(uint32_t id) { profile_id_ = id; }
+  void SetProfiler(profile::Profiler* profiler, int id) {
+    profiler_ = profiler;
+    profile_id_ = id;
+  }
 #endif

   void Launch() {
@@ -82,10 +85,12 @@
 #endif

 #ifdef LITE_WITH_PROFILE
-    if (profile_id_ >= 0) {
-      profile::ProfileBlock x(profile_id_, "kernel");
-      Run();
-    }
+    CHECK(profiler_) << "The kernel's Profiler pointer must not be nullptr. "
+                        "When LITE_WITH_PROFILE is defined, please set a "
+                        "Profiler on the Instruction.";
+    profiler_->StartTiming(profile_id_, ctx_.get());
+    Run();
+    profiler_->StopTiming(profile_id_, ctx_.get());
 #else
     Run();
 #endif
@@ -175,6 +180,7 @@ class KernelBase {
   bool is_first_epoch_{true};

 #ifdef LITE_WITH_PROFILE
+  profile::Profiler* profiler_{nullptr};
   int profile_id_{-1};
 #endif
 };
diff --git a/lite/core/profile/CMakeLists.txt b/lite/core/profile/CMakeLists.txt
index 54a2390244..b7ddd810af 100644
--- a/lite/core/profile/CMakeLists.txt
+++ b/lite/core/profile/CMakeLists.txt
@@ -5,4 +5,5 @@ endif()

 lite_cc_library(basic_profiler SRCS basic_profiler.cc DEPS gflags)
 lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler)
-
+lite_cc_library(lite_profiler SRCS profiler.cc DEPS context)
+lite_cc_test(test_lite_timer SRCS test_timer.cc DEPS lite_profiler)
diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc
new file mode 100644
index 0000000000..a51b769c8f
--- /dev/null
+++ b/lite/core/profile/profiler.cc
@@ -0,0 +1,117 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/core/profile/profiler.h"
+#include <map>
+#include <string>
+#include <utility>
+
+namespace paddle {
+namespace lite {
+namespace profile {
+
+int Profiler::NewTimer(const OpCharacter& ch) {
+  StatisUnit unit;
+  unit.character = ch;
+  if (ch.target == TargetType::kCUDA) {
+#ifdef LITE_WITH_CUDA
+    unit.timer.reset(new DeviceTimer<TargetType::kCUDA>());
+#else
+    LOG(ERROR) << "A CUDA timer was requested, but the library was built "
+                  "without LITE_WITH_CUDA; the default host timer is used "
+                  "instead.";
+#endif
+  } else {
+    unit.timer.reset(new DeviceTimer<TargetType::kHost>());
+  }
+  units_.push_back(std::move(unit));
+  return units_.size() - 1;
+}
+
+void Profiler::StartTiming(const int index, KernelContext* ctx) {
+  CHECK_LT(index, units_.size())
+      << "The timer index in the profiler is out of range.";
+  units_[index].timer->Start(ctx);
+}
+
+float Profiler::StopTiming(const int index, KernelContext* ctx) {
+  CHECK_LT(index, units_.size())
+      << "The timer index in the profiler is out of range.";
+  return units_[index].timer->Stop(ctx);
+}
+
+std::string Profiler::Summary(bool concise) {
+  STL::stringstream ss;
+  auto cout_title = [&ss](const std::string& title, const std::string& name) {
+    // clang-format off
+    ss << "===== " << title << ": " << name << " =====" << std::endl;
+    ss << std::setw(25) << std::left << "Operator Type" \
+       << std::setw(40) << std::left << "Kernel Name" \
+       << std::setw(10) << std::left << "Remark" \
+       << std::setw(10) << std::left << "Avg (ms)" \
+       << std::setw(10) << std::left << "Min (ms)" \
+       << std::setw(10) << std::left << "Max (ms)" \
+       << std::endl;
+    // clang-format on
+  };
+  if (concise) {
+    // The comparator must impose a strict weak ordering for std::map, so
+    // compare field by field instead of OR-ing the comparisons together.
+    auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) {
+      if (c1.target != c2.target) return c1.target < c2.target;
+      if (c1.op_type != c2.op_type) return c1.op_type < c2.op_type;
+      if (c1.kernel_name != c2.kernel_name) {
+        return c1.kernel_name < c2.kernel_name;
+      }
+      return c1.remark < c2.remark;
+    };
+    std::map<OpCharacter, TimeInfo, decltype(op_comp)> summary(op_comp);
+    for (auto& unit : units_) {
+      auto ch = summary.find(unit.character);
+      if (ch != summary.end()) {
+        ch->second.avg += unit.timer->LapTimes().Avg();
+        ch->second.min += unit.timer->LapTimes().Min();
+        ch->second.max += unit.timer->LapTimes().Max();
+      } else {
+        TimeInfo info({unit.timer->LapTimes().Avg(),
+                       unit.timer->LapTimes().Min(),
+                       unit.timer->LapTimes().Max()});
+        summary.insert({unit.character, info});
+      }
+    }
+    cout_title("Concise Profiler Summary", name_);
+    for (const auto& item : summary) {
+      // clang-format off
+      ss << std::setw(25) << std::left << item.first.op_type \
+         << std::setw(40) << std::left << item.first.kernel_name \
+         << std::setw(10) << std::left << item.first.remark \
+         << std::setw(10) << std::left << item.second.avg \
+         << std::setw(10) << std::left << item.second.min \
+         << std::setw(10) << std::left << item.second.max \
+         << std::endl;
+      // clang-format on
+    }
+  } else {
+    cout_title("Detailed Profiler Summary", name_);
+    for (auto& unit : units_) {
+      // clang-format off
+      ss << std::setw(25) << std::left << unit.character.op_type \
+         << std::setw(40) << std::left << unit.character.kernel_name \
+         << std::setw(10) << std::left << unit.character.remark \
+         << std::setw(10) << std::left << unit.timer->LapTimes().Avg() \
+         << std::setw(10) << std::left << unit.timer->LapTimes().Min() \
+         << std::setw(10) << std::left << unit.timer->LapTimes().Max() \
+         << std::endl;
+      // clang-format on
+    }
+  }
+  return ss.str();
+}
+
+}  // namespace profile
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h
new file mode 100644
index 0000000000..0fce8167cd
--- /dev/null
+++ b/lite/core/profile/profiler.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <string>
+#include <vector>
+#include "lite/core/profile/timer.h"
+
+namespace paddle {
+namespace lite {
+namespace profile {
+
+struct TimeInfo {
+  float avg;
+  float min;
+  float max;
+};
+
+struct OpCharacter {
+  TargetType target;
+  std::string op_type{std::string("N/A")};
+  std::string kernel_name{std::string("N/A")};
+  std::string remark{std::string("N/A")};
+};
+
+struct StatisUnit {
+  std::unique_ptr<Timer> timer;
+  OpCharacter character;
+};
+
+class Profiler final {
+ public:
+  Profiler() = default;
+  explicit Profiler(const std::string& name) : name_(name) {}
+  int NewTimer(const OpCharacter& ch);
+  void StartTiming(const int index, KernelContext* ctx);
+  float StopTiming(const int index, KernelContext* ctx);
+  std::string Summary(bool concise = true);
+
+ private:
+  std::string name_{std::string("N/A")};
+  std::vector<StatisUnit> units_;
+};
+
+}  // namespace profile
+}  // namespace lite
+}  // namespace paddle
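For orientation before the tests below: the profiler is normally driven by KernelBase::Launch, but the API above can also be exercised by hand. A minimal host-side sketch (a hypothetical harness, not part of the patch; it assumes a non-CUDA target, so the KernelContext argument may be null because the base Timer::Start(KernelContext*) ignores it):

    #include "lite/core/profile/profiler.h"
    #include "lite/utils/cp_logging.h"

    // Hypothetical harness: register one timed section and print the report.
    void ProfileHostSection() {
      paddle::lite::profile::Profiler profiler("demo");
      paddle::lite::profile::OpCharacter ch;
      ch.target = paddle::lite::TargetType::kHost;  // non-CUDA -> DeviceTimer<kHost>
      ch.op_type = "demo_op";
      ch.kernel_name = "demo_kernel";
      const int id = profiler.NewTimer(ch);
      profiler.StartTiming(id, /*ctx=*/nullptr);  // host timer ignores the context
      // ... workload under measurement ...
      profiler.StopTiming(id, /*ctx=*/nullptr);
      LOG(INFO) << "\n" << profiler.Summary(/*concise=*/false);
    }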
diff --git a/lite/core/profile/test_timer.cc b/lite/core/profile/test_timer.cc
new file mode 100644
index 0000000000..6f49698ef4
--- /dev/null
+++ b/lite/core/profile/test_timer.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <chrono>  // NOLINT
+#include <thread>  // NOLINT
+#include "lite/core/context.h"
+#include "lite/core/profile/profiler.h"
+#include "lite/core/profile/timer.h"
+#include "lite/utils/cp_logging.h"
+
+namespace paddle {
+namespace lite {
+namespace profile {
+
+TEST(timer, real_latency) {
+  Timer timer;
+
+  timer.Start();
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  timer.Stop();
+
+  timer.Start();
+  std::this_thread::sleep_for(std::chrono::milliseconds(50));
+  timer.Stop();
+
+  LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg();
+}
+
+#ifdef LITE_WITH_CUDA
+TEST(gpu_timer, real_latency) {
+  DeviceTimer<TargetType::kCUDA> timer;
+  KernelContext ctx;
+  cudaStream_t exec_stream;
+  cudaStreamCreate(&exec_stream);
+  (&ctx.As<CUDAContext>())->SetExecStream(exec_stream);
+
+  timer.Start(&ctx);
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  timer.Stop(&ctx);
+
+  timer.Start(&ctx);
+  std::this_thread::sleep_for(std::chrono::milliseconds(50));
+  timer.Stop(&ctx);
+
+  LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg();
+}
+
+TEST(profiler, real_latency) {
+  KernelContext ctx;
+  cudaStream_t exec_stream;
+  cudaStreamCreate(&exec_stream);
+  (&ctx.As<CUDAContext>())->SetExecStream(exec_stream);
+
+  Profiler profiler("name");
+  profile::OpCharacter ch;
+  ch.target = TargetType::kCUDA;
+  ch.op_type = "operator/1";
+  ch.kernel_name = "kernel/1";
+  int idx = profiler.NewTimer(ch);
+  profiler.StartTiming(idx, &ctx);
+  std::this_thread::sleep_for(std::chrono::milliseconds(10));
+  profiler.StopTiming(idx, &ctx);
+  std::cout << profiler.Summary();
+}
+#endif
+
+}  // namespace profile
+}  // namespace lite
+}  // namespace paddle
diff --git a/lite/core/profile/timer.h b/lite/core/profile/timer.h
new file mode 100644
index 0000000000..1e86f0d7b9
--- /dev/null
+++ b/lite/core/profile/timer.h
@@ -0,0 +1,114 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <algorithm>
+#include <chrono>  // NOLINT
+#include <list>
+#include <numeric>
+#ifdef LITE_WITH_CUDA
+#include "lite/backends/cuda/cuda_utils.h"
+#endif
+#include "lite/core/context.h"
+
+namespace paddle {
+namespace lite {
+namespace profile {
+
+template <typename T>
+class TimeList {
+ public:
+  void Clear() { laps_t_.clear(); }
+  void Add(T t) { laps_t_.push_back(t); }
+  T Max() const { return *std::max_element(laps_t_.begin(), laps_t_.end()); }
+  T Min() const { return *std::min_element(laps_t_.begin(), laps_t_.end()); }
+  T Sum() const { return std::accumulate(laps_t_.begin(), laps_t_.end(), 0.0); }
+  size_t Size() const { return laps_t_.size(); }
+  T Avg() const {
+    if (!Size()) {
+      return 0;
+    }
+    return Sum() / Size();
+  }
+  const std::list<T>& Raw() const { return laps_t_; }
+
+ private:
+  std::list<T> laps_t_;
+};
+
+class Timer {
+ public:
+  Timer() = default;
+  virtual ~Timer() = default;
+
+  void Reset() { laps_t_.Clear(); }
+  void Start() { t_start_ = std::chrono::system_clock::now(); }
+  float Stop() {
+    t_stop_ = std::chrono::system_clock::now();
+    auto ts = std::chrono::duration_cast<std::chrono::microseconds>(t_stop_ -
+                                                                    t_start_);
+    float elapse_ms = 1000.f * static_cast<float>(ts.count()) *
+                      std::chrono::microseconds::period::num /
+                      std::chrono::microseconds::period::den;
+    this->laps_t_.Add(elapse_ms);
+    return elapse_ms;
+  }
+  virtual void Start(KernelContext* ctx) { return Start(); }
+  virtual float Stop(KernelContext* ctx) { return Stop(); }
+  float AvgLapTimeMs() const { return laps_t_.Avg(); }
+  const TimeList<float>& LapTimes() const { return laps_t_; }
+
+ protected:
+  std::chrono::time_point<std::chrono::system_clock> t_start_, t_stop_;
+  TimeList<float> laps_t_;
+};
+
+template <TargetType Target>
+class DeviceTimer final : public Timer {};
+
+#ifdef LITE_WITH_CUDA
+template <>
+class DeviceTimer<TargetType::kCUDA> final : public Timer {
+ public:
+  DeviceTimer() {
+    CUDA_CALL(cudaEventCreate(&e_start_));
+    CUDA_CALL(cudaEventCreate(&e_stop_));
+  }
+  ~DeviceTimer() {
+    CUDA_CALL(cudaEventDestroy(e_start_));
+    CUDA_CALL(cudaEventDestroy(e_stop_));
+  }
+  void Start(KernelContext* ctx) {
+    cudaStream_t stream;
+    stream = ctx->As<CUDAContext>().exec_stream();
+    CUDA_CALL(cudaEventRecord(e_start_, stream));
+  }
+  float Stop(KernelContext* ctx) {
+    cudaStream_t stream;
+    stream = ctx->As<CUDAContext>().exec_stream();
+    CUDA_CALL(cudaEventRecord(e_stop_, stream));
+    CUDA_CALL(cudaEventSynchronize(e_stop_));
+    float elapse_ms = 1.f;
+    CUDA_CALL(cudaEventElapsedTime(&elapse_ms, e_start_, e_stop_));
+    this->laps_t_.Add(elapse_ms);
+    return elapse_ms;
+  }
+
+ private:
+  cudaEvent_t e_start_, e_stop_;
+};
+#endif
+
+}  // namespace profile
+}  // namespace lite
+}  // namespace paddle
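The timer above keeps one lap per Start()/Stop() pair, so repeated runs can be summarized without external bookkeeping. A minimal sketch (a hypothetical snippet, not part of the patch):

    #include "lite/core/profile/timer.h"
    #include "lite/utils/cp_logging.h"

    // Each Start()/Stop() pair appends one lap (in ms) to the TimeList<float>.
    void MeasureThreeLaps() {
      paddle::lite::profile::Timer t;
      for (int i = 0; i < 3; ++i) {
        t.Start();
        // ... workload under measurement ...
        t.Stop();  // returns this lap in ms and records it
      }
      LOG(INFO) << "avg: " << t.LapTimes().Avg()
                << " ms, min: " << t.LapTimes().Min()
                << " ms, max: " << t.LapTimes().Max() << " ms";
    }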
"lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/model_parser/cpp/program_desc.h" -#ifdef LITE_WITH_PROFILE -#include "lite/core/profile/basic_profiler.h" -#endif // LITE_WITH_PROFILE namespace paddle { namespace lite { @@ -87,22 +84,7 @@ struct Program { struct Instruction { Instruction(const std::shared_ptr& op, std::unique_ptr&& kernel) - : op_(op), kernel_(std::move(kernel)) { -#ifdef LITE_WITH_PROFILE - if (op_->Type() != "feed" && op_->Type() != "fetch") { - profile_id_ = profile::BasicProfiler::Global() - .NewRcd(kernel_->SerializedKernelType()) - .id(); - kernel_->SetProfileID(profile_id_); - // Set profile custom info - auto& profiler = - *profile::BasicProfiler::Global().mutable_record( - profile_id_); - profiler.SetCustomInfo("op_type", op_->Type()); - profiler.SetCustomInfo("op_info", op_->SerializedOpInfo()); - } -#endif // LITE_WITH_PROFILE - } + : op_(op), kernel_(std::move(kernel)) {} // Run the instruction. void Run(); @@ -113,6 +95,20 @@ struct Instruction { const KernelBase* kernel() const { return kernel_.get(); } KernelBase* mutable_kernel() { return kernel_.get(); } +#ifdef LITE_WITH_PROFILE + void set_profiler(profile::Profiler* profiler) { + profiler_ = profiler; + if (op_->Type() != "feed" && op_->Type() != "fetch") { + profile::OpCharacter ch; + ch.target = kernel()->target(); + ch.op_type = op_->Type(); + ch.kernel_name = kernel()->name(); + profile_id_ = profiler->NewTimer(ch); + kernel_->SetProfiler(profiler_, profile_id_); + } + } +#endif + private: std::shared_ptr op_; std::unique_ptr kernel_; @@ -120,7 +116,7 @@ struct Instruction { bool has_run_{false}; #ifdef LITE_WITH_PROFILE - // for profiler + profile::Profiler* profiler_; int profile_id_{-1}; #endif // LITE_WITH_PROFILE }; @@ -135,6 +131,9 @@ class LITE_API RuntimeProgram { if (instructions_.empty()) { LOG(FATAL) << "no instructions"; } +#ifdef LITE_WITH_PROFILE + set_profiler(); +#endif } void Run(); @@ -159,6 +158,15 @@ class LITE_API RuntimeProgram { RuntimeProgram(const RuntimeProgram&) = delete; std::vector instructions_; lite::Scope* exec_scope_{}; + +#ifdef LITE_WITH_PROFILE + profile::Profiler profiler_; + void set_profiler() { + for (auto i = instructions_.begin(); i != instructions_.end(); ++i) { + i->set_profiler(&profiler_); + } + } +#endif }; } // namespace lite diff --git a/lite/tests/cv/image_convert_test.cc b/lite/tests/cv/image_convert_test.cc index 7c0f867fae..eefd30f74f 100644 --- a/lite/tests/cv/image_convert_test.cc +++ b/lite/tests/cv/image_convert_test.cc @@ -17,8 +17,8 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/tests/cv/cv_basic.h" -#include "lite/tests/utils/timer.h" #include "lite/utils/cv/paddle_image_preprocess.h" DEFINE_int32(cluster, 3, "cluster id"); @@ -46,7 +46,7 @@ typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess; typedef paddle::lite_api::Tensor Tensor_api; typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; void fill_tensor_host_rand(uint8_t* dio, int64_t size) { uint seed = 256; @@ -285,8 +285,8 @@ void test_img(const std::vector& cluster_id, ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam); for (int i = 0; i < test_iter; ++i) { - t1.clear(); - t1.start(); + t1.Reset(); + t1.Start(); LOG(INFO) << "image convert saber compute"; // 方法一: image_preprocess.imageCovert(src, lite_dst); @@ -329,8 +329,8 @@ void test_img(const std::vector& cluster_id, means, scales); - t1.end(); - double tdiff = 
diff --git a/lite/tests/cv/image_convert_test.cc b/lite/tests/cv/image_convert_test.cc
index 7c0f867fae..eefd30f74f 100644
--- a/lite/tests/cv/image_convert_test.cc
+++ b/lite/tests/cv/image_convert_test.cc
@@ -17,8 +17,8 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/tests/cv/cv_basic.h"
-#include "lite/tests/utils/timer.h"
 #include "lite/utils/cv/paddle_image_preprocess.h"

 DEFINE_int32(cluster, 3, "cluster id");
@@ -46,7 +46,7 @@
 typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess;
 typedef paddle::lite_api::Tensor Tensor_api;
 typedef paddle::lite::Tensor Tensor;

-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;

 void fill_tensor_host_rand(uint8_t* dio, int64_t size) {
   uint seed = 256;
@@ -285,8 +285,8 @@ void test_img(const std::vector<int>& cluster_id,
   ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);

   for (int i = 0; i < test_iter; ++i) {
-    t1.clear();
-    t1.start();
+    t1.Reset();
+    t1.Start();
     LOG(INFO) << "image convert saber compute";
     // method 1: image_preprocess.imageCovert(src, lite_dst);
@@ -329,8 +329,8 @@ void test_img(const std::vector<int>& cluster_id,
                                   means,
                                   scales);
-    t1.end();
-    double tdiff = t1.get_average_ms();
+    t1.Stop();
+    double tdiff = t1.LapTimes().Avg();
     to += tdiff;
     if (tdiff < min_time) {
       min_time = tdiff;
diff --git a/lite/tests/math/conv_compute_test.cc b/lite/tests/math/conv_compute_test.cc
index 194d7ab1c3..bda50d3563 100644
--- a/lite/tests/math/conv_compute_test.cc
+++ b/lite/tests/math/conv_compute_test.cc
@@ -15,10 +15,10 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/operators/op_params.h"
 #include "lite/tests/utils/naive_math_impl.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"

 #ifdef LITE_WITH_ARM
 #include "lite/kernels/arm/conv_compute.h"
@@ -59,7 +59,7 @@ DEFINE_bool(flag_bias, true, "with bias");
 typedef paddle::lite::DDim DDim;
 typedef paddle::lite::Tensor Tensor;
 typedef paddle::lite::operators::ConvParam ConvParam;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;

 DDim compute_out_dim(const DDim& dim_in,
                      const paddle::lite::operators::ConvParam& param) {
@@ -205,19 +205,19 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
   /// compute
   Timer t0;
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     conv.Launch();
-    t0.end();
+    t0.Stop();
   }

   double gops = 2.0 * dim_out.production() * dim_in[1] * weight_dim[2] *
                 weight_dim[3] / param.groups;
   LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape"
-            << dim_out << ",running time, avg: " << t0.get_average_ms()
-            << ", min time: " << t0.get_min_time()
+            << dim_out << ", running time, avg: " << t0.LapTimes().Avg()
+            << ", min time: " << t0.LapTimes().Min()
             << ", total GOPS: " << 1e-9 * gops
-            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-            << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();

   if (FLAGS_check_result) {
     double max_ratio = 0;
diff --git a/lite/tests/math/conv_int8_compute_test.cc b/lite/tests/math/conv_int8_compute_test.cc
index 6af9bbd431..27c186d7ce 100644
--- a/lite/tests/math/conv_int8_compute_test.cc
+++ b/lite/tests/math/conv_int8_compute_test.cc
@@ -15,10 +15,10 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/operators/op_params.h"
 #include "lite/tests/utils/naive_math_impl.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"

 #ifdef LITE_WITH_ARM
 #include "lite/kernels/arm/conv_compute.h"
@@ -59,7 +59,7 @@ DEFINE_bool(flag_bias, true, "with bias");
 typedef paddle::lite::DDim DDim;
 typedef paddle::lite::Tensor Tensor;
 typedef paddle::lite::operators::ConvParam ConvParam;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;

 DDim compute_out_dim(const DDim& dim_in,
                      const paddle::lite::operators::ConvParam& param) {
@@ -309,30 +309,30 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
   /// compute fp32 output
   Timer t0;
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     conv_int8_fp32.Launch();
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "int8 conv, fp32 output: output shape" << dim_out
-            << ",running time, avg: " << t0.get_average_ms()
-            << ", min time: " << t0.get_min_time()
+            << ", running time, avg: " << t0.LapTimes().Avg()
+            << ", min time: " << t0.LapTimes().Min()
             << ", total GOPS: " << 1e-9 * gops
-            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-            << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); /// compute int8 output - t0.clear(); + t0.Reset(); for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_int8_int8.Launch(); - t0.end(); + t0.Stop(); } LOG(INFO) << "int8 conv, int8 output: output shape" << dim_out - << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); /// compare result fp32 output if (FLAGS_check_result) { diff --git a/lite/tests/math/conv_transpose_compute_test.cc b/lite/tests/math/conv_transpose_compute_test.cc index 7a56b5836b..398e745d94 100644 --- a/lite/tests/math/conv_transpose_compute_test.cc +++ b/lite/tests/math/conv_transpose_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/conv_transpose_compute.h" @@ -59,7 +59,7 @@ DEFINE_bool(flag_bias, false, "with bias"); typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::ConvParam ConvParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DDim compute_out_dim(const DDim& dim_in, const paddle::lite::operators::ConvParam& param) { @@ -187,19 +187,19 @@ void test_conv_transpose_fp32(const std::vector& input_dims, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); conv_t.Launch(); - t0.end(); + t0.Stop(); } float gops = 2.f * tmp_weights.numel() * dim_in[0] * dim_in[2] * dim_in[3]; LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; diff --git a/lite/tests/math/gemm_int8_compute_test.cc b/lite/tests/math/gemm_int8_compute_test.cc index 06a1a0a65e..fde5aacb1c 100644 --- a/lite/tests/math/gemm_int8_compute_test.cc +++ b/lite/tests/math/gemm_int8_compute_test.cc @@ -20,12 +20,12 @@ #include "lite/backends/arm/math/funcs.h" #endif // LITE_WITH_ARM #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/core/tensor.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" typedef paddle::lite::Tensor Tensor; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; DEFINE_int32(power_mode, 3, @@ -193,7 +193,7 @@ bool test_gemm_int8(bool tra, dbias_int8[l] = dbias[l] / scale_c[0]; } for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); paddle::lite::arm::math::gemm_prepack_int8(tpackedA.data(), db, dbias_int8, @@ -206,21 +206,21 @@ bool 
                                                trb,
                                                scale_merge_int8.data(),
                                                &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "gemm_int8_int8 output: M: " << m << ", N: " << n
             << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths
             << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
             << " GOPs";

   /// fp32 output compute
-  t0.clear();
+  t0.Reset();
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     paddle::lite::arm::math::gemm_prepack_int8(tpackedA.data<int8_t>(),
                                                db,
                                                dbias,
@@ -233,15 +233,15 @@ bool test_gemm_int8(bool tra,
                                                trb,
                                                scale_merge_fp32.data(),
                                                &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "gemm_int8_fp32 output: M: " << m << ", N: " << n
             << ", K: " << k << ", power_mode: " << cls << ", threads: " << ths
             << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
             << " GOPs";

   if (FLAGS_check_result) {
diff --git a/lite/tests/math/gemv_int8_compute_test.cc b/lite/tests/math/gemv_int8_compute_test.cc
index c64e78d66a..623615c8da 100644
--- a/lite/tests/math/gemv_int8_compute_test.cc
+++ b/lite/tests/math/gemv_int8_compute_test.cc
@@ -20,12 +20,12 @@
 #include "lite/backends/arm/math/funcs.h"
 #endif  // LITE_WITH_ARM
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/core/tensor.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"

 typedef paddle::lite::Tensor Tensor;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;

 DEFINE_int32(power_mode,
              3,
@@ -165,7 +165,7 @@ bool test_gemv_int8(
     dbias_int8[l] = dbias[l] / scale_c[0];
   }
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     paddle::lite::arm::math::gemv_int8(da,
                                        db,
                                        dc_fp32,
@@ -177,21 +177,21 @@ bool test_gemv_int8(
                                        dbias,
                                        has_relu,
                                        &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "gemv_int8_int8 output: M: " << m << ", N: " << n
             << ", power_mode: " << cls << ", threads: " << ths
             << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
             << " GOPs";

   /// fp32 output compute
-  t0.clear();
+  t0.Reset();
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     paddle::lite::arm::math::gemv_int8(da,
                                        db,
                                        dc_int8,
@@ -203,15 +203,15 @@ bool test_gemv_int8(
                                        dbias_int8,
                                        has_relu,
                                        &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "gemm_int8_fp32 output: M: " << m << ", N: " << n
             << ", power_mode: " << cls << ", threads: " << ths
             << ", GOPS: " << ops * 1e-9f
<< " GOPS, avg time: " << t0.get_average_ms() - << " ms, min time: " << t0.get_min_time() - << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms() - << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time() + << " GOPS, avg time: " << t0.LapTimes().Avg() + << " ms, min time: " << t0.LapTimes().Min() + << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min() << " GOPs"; if (FLAGS_check_result) { diff --git a/lite/tests/math/layout_compute_test.cc b/lite/tests/math/layout_compute_test.cc index 29f8f749db..a566924548 100644 --- a/lite/tests/math/layout_compute_test.cc +++ b/lite/tests/math/layout_compute_test.cc @@ -15,10 +15,10 @@ #include #include #include "lite/core/context.h" +#include "lite/core/profile/timer.h" #include "lite/operators/op_params.h" #include "lite/tests/utils/naive_math_impl.h" #include "lite/tests/utils/tensor_utils.h" -#include "lite/tests/utils/timer.h" #ifdef LITE_WITH_ARM #include "lite/kernels/arm/layout_compute.h" @@ -48,7 +48,7 @@ typedef paddle::lite::DDim DDim; typedef paddle::lite::Tensor Tensor; typedef paddle::lite::operators::LayoutParam LayoutParam; -using paddle::lite::Timer; +using paddle::lite::profile::Timer; #define IN(n, c, h, w) \ input_data[w + h * input_w + c * input_h * input_w + \ @@ -165,17 +165,17 @@ void test_layout_fp32_nchw(DDim dim_in, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); layout.Run(); - t0.end(); + t0.Stop(); } double gops = 2.0 * dim_out.production(); LOG(INFO) << "layout fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -268,17 +268,17 @@ void test_layout_fp32_nhwc(DDim dim_in, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); layout.Run(); - t0.end(); + t0.Stop(); } double gops = 2.0 * dim_out.production(); LOG(INFO) << "layout fp32: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << t0.LapTimes().Min() << ", total GOPS: " << 1e-9 * gops - << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms() - << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time(); + << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg() + << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min(); if (FLAGS_check_result) { double max_ratio = 0; @@ -370,18 +370,18 @@ void test_layout_int8_nchw(DDim dim_in, /// compute Timer t0; for (int i = 0; i < FLAGS_repeats; ++i) { - t0.start(); + t0.Start(); layout.Run(); - t0.end(); + t0.Stop(); } LOG(INFO) << "saber compute end"; double gops = 2.0 * dim_out.production(); LOG(INFO) << "layout int8: input shape: " << dim_in << ", output shape" - << dim_out << ",running time, avg: " << t0.get_average_ms() - << ", min time: " << t0.get_min_time() + << dim_out << ",running time, avg: " << t0.LapTimes().Avg() + << ", min time: " << 
+            << ", min time: " << t0.LapTimes().Min()
             << ", total GOPS: " << 1e-9 * gops
-            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-            << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();

   if (FLAGS_check_result) {
     double max_ratio = 0;
@@ -474,18 +474,18 @@ void test_layout_int8_nhwc(DDim dim_in,
   /// compute
   Timer t0;
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     layout.Run();
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "run";
   double gops = 2.0 * dim_out.production();
   LOG(INFO) << "layout int8: input shape: " << dim_in << ", output shape"
-            << dim_out << ",running time, avg: " << t0.get_average_ms()
-            << ", min time: " << t0.get_min_time()
+            << dim_out << ", running time, avg: " << t0.LapTimes().Avg()
+            << ", min time: " << t0.LapTimes().Min()
             << ", total GOPS: " << 1e-9 * gops
-            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-            << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();

   if (FLAGS_check_result) {
     double max_ratio = 0;
diff --git a/lite/tests/math/pool_compute_test.cc b/lite/tests/math/pool_compute_test.cc
index 2d6a0be628..73a5ba5606 100644
--- a/lite/tests/math/pool_compute_test.cc
+++ b/lite/tests/math/pool_compute_test.cc
@@ -15,10 +15,10 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/operators/op_params.h"
 #include "lite/tests/utils/naive_math_impl.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"

 #ifdef LITE_WITH_ARM
 #include "lite/kernels/arm/pool_compute.h"
@@ -60,7 +60,7 @@ DEFINE_string(pooling_type, "max", "do max pooling");
 typedef paddle::lite::DDim DDim;
 typedef paddle::lite::Tensor Tensor;
 typedef paddle::lite::operators::PoolParam PoolParam;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;

 DDim compute_out_dim(const DDim& dim_in,
                      const paddle::lite::operators::PoolParam& param) {
@@ -320,18 +320,18 @@ void test_pool_fp32(const std::vector<DDim>& input_dims,
   /// compute
   Timer t0;
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     pool.Launch();
-    t0.end();
+    t0.Stop();
   }

   double gops = 2.0 * dim_out.production() * ksize[0] * ksize[1];
   LOG(INFO) << "pool fp32: input shape: " << dim_in << ", output shape"
-            << dim_out << ", running time, avg: " << t0.get_average_ms()
-            << ", min time: " << t0.get_min_time()
+            << dim_out << ", running time, avg: " << t0.LapTimes().Avg()
+            << ", min time: " << t0.LapTimes().Min()
             << ", total GOPS: " << 1e-9 * gops
-            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-            << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();

   if (FLAGS_check_result) {
     double max_ratio = 0;
diff --git a/lite/tests/math/sgemm_c4_compute_test.cc b/lite/tests/math/sgemm_c4_compute_test.cc
index 5fcc54f338..886dba6ac5 100644
--- a/lite/tests/math/sgemm_c4_compute_test.cc
+++ b/lite/tests/math/sgemm_c4_compute_test.cc
@@ -20,12 +20,12 @@
 #include "lite/backends/arm/math/funcs.h"
 #endif  // LITE_WITH_ARM
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/core/tensor.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"

 typedef paddle::lite::Tensor Tensor;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;

 DEFINE_int32(power_mode,
              3,
@@ -134,18 +134,18 @@ bool test_sgemm_c4(
   }

   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     paddle::lite::arm::math::sgemm_prepack_c4(
         m, n, k, da_c4, db_c4, dc, dbias, has_bias, has_relu, &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k
             << ", power_mode: " << cls << ", threads: " << ths
             << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
             << " GOPs";

   if (FLAGS_check_result) {
diff --git a/lite/tests/math/sgemm_compute_test.cc b/lite/tests/math/sgemm_compute_test.cc
index 1621ceb904..6df5e671fe 100644
--- a/lite/tests/math/sgemm_compute_test.cc
+++ b/lite/tests/math/sgemm_compute_test.cc
@@ -20,12 +20,12 @@
 #include "lite/backends/arm/math/funcs.h"
 #endif  // LITE_WITH_ARM
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/core/tensor.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"

 typedef paddle::lite::Tensor Tensor;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;

 DEFINE_int32(power_mode,
              3,
@@ -171,7 +171,7 @@ bool test_sgemm(bool tra,
       if (i == FLAGS_repeats - 1) {
         memcpy(dc, dc_backup, sizeof(float) * m * ldc);
       }
-    t0.start();
+    t0.Start();
     paddle::lite::arm::math::sgemm_prepack(trb,
                                            m,
                                            n,
@@ -186,15 +186,15 @@ bool test_sgemm(bool tra,
                                            has_bias,
                                            has_relu,
                                            &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k
             << ", power_mode: " << cls << ", threads: " << ths
             << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
             << " GOPs";

   if (FLAGS_check_result) {
diff --git a/lite/tests/math/sgemv_compute_test.cc b/lite/tests/math/sgemv_compute_test.cc
index 3c8965cb2c..5dd2d32295 100644
--- a/lite/tests/math/sgemv_compute_test.cc
+++ b/lite/tests/math/sgemv_compute_test.cc
@@ -20,9 +20,9 @@
 #include "lite/backends/arm/math/funcs.h"
 #endif  // LITE_WITH_ARM
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/core/tensor.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"

 typedef paddle::lite::Tensor Tensor;

@@ -83,7 +83,7 @@ bool test_sgemv(
     basic_gemv(
         m, k, da, db, dbias, dc_basic, 1.f, 0.f, tra, has_bias, has_relu);
   }
-  paddle::lite::Timer t0;
+  paddle::lite::profile::Timer t0;
   //! compute
   double ops = 2.0 * m * k;
   std::unique_ptr<paddle::lite::KernelContext> ctx1(
       new paddle::lite::KernelContext);
@@ -96,19 +96,19 @@ bool test_sgemv(
         da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx);
   }
-  t0.clear();
+  t0.Reset();
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     paddle::lite::arm::math::sgemv(
         da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "gemv output: M: " << m << ", K: " << k << ", cluster: " << cls
             << ", threads: " << ths << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
             << " GOPs";

   if (FLAGS_check_result) {
diff --git a/lite/tests/utils/timer.h b/lite/tests/utils/timer.h
deleted file mode 100644
index 095f32046e..0000000000
--- a/lite/tests/utils/timer.h
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <chrono>  // NOLINT
-#include <list>
-
-namespace paddle {
-namespace lite {
-
-class Timer final {
- public:
-  Timer() {}
-
-  ~Timer() {}
-
-  void clear() { ms_time_.clear(); }
-
-  void start() { tstart_ = std::chrono::system_clock::now(); }
-
-  void end() {
-    tend_ = std::chrono::system_clock::now();
-    auto ts =
-        std::chrono::duration_cast<std::chrono::microseconds>(tend_ - tstart_);
-    latest_time_ = 1000.f * static_cast<float>(ts.count()) *
-                   std::chrono::microseconds::period::num /
-                   std::chrono::microseconds::period::den;
-    ms_time_.push_back(latest_time_);
-  }
-
-  float latest_time() const { return latest_time_; }
-
-  float get_average_ms() {
-    if (ms_time_.size() == 0) {
-      return 0.f;
-    }
-    float sum = 0.f;
-    for (auto i : ms_time_) {
-      sum += i;
-    }
-    return sum / ms_time_.size();
-  }
-
-  float get_sum_ms() {
-    if (ms_time_.size() == 0) {
-      return 0.f;
-    }
-    float sum = 0.f;
-    for (auto i : ms_time_) {
-      sum += i;
-    }
-    return sum;
-  }
-
-  // return tile (0-99) time.
-  float get_tile_time(float tile) {
-    if (tile < 0 || tile > 100) {
-      return -1.f;
-    }
-    int total_items = static_cast<int>(ms_time_.size());
-    if (total_items <= 0) {
-      return -2.f;
-    }
-    ms_time_.sort();
-    int pos = static_cast<int>(tile * total_items / 100);
-    auto it = ms_time_.begin();
-    for (int i = 0; i < pos; ++i) {
-      ++it;
-    }
-    return *it;
-  }
-
-  std::list<float> get_time_stat() { return ms_time_; }
-
-  float get_min_time() {
-    ms_time_.sort();
-    return *ms_time_.begin();
-  }
-
-  float get_max_time() {
-    ms_time_.sort([](int a, int b) { return a > b; });
-    return *ms_time_.begin();
-  }
-
- private:
-  std::chrono::time_point<std::chrono::system_clock> tstart_;
-  std::chrono::time_point<std::chrono::system_clock> tend_;
-  std::list<float> ms_time_;
-  float latest_time_;
-};
-
-}  // namespace lite
-}  // namespace paddle
-- 
GitLab