Unverified commit 8a634b71, authored by 石晓伟, committed by GitHub

refactor profile tools, test=develop (#2536)

Parent c8b51f82
@@ -21,14 +21,14 @@
 #include "lite/api/paddle_use_passes.h"
 #include "lite/api/test_helper.h"
 #include "lite/core/device_info.h"
-#include "lite/tests/utils/timer.h"
+#include "lite/core/profile/timer.h"
 #include "lite/utils/cp_logging.h"
 #include "lite/utils/string.h"
 #ifdef LITE_WITH_PROFILE
 #include "lite/core/profile/basic_profiler.h"
 #endif  // LITE_WITH_PROFILE
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;
 DEFINE_string(input_shape,
               "1,3,224,224",
@@ -102,20 +102,20 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
   Timer ti;
   for (int j = 0; j < repeat; ++j) {
-    ti.start();
+    ti.Start();
     predictor->Run();
-    ti.end();
-    LOG(INFO) << "iter: " << j << ", time: " << ti.latest_time() << " ms";
+    float t = ti.Stop();
+    LOG(INFO) << "iter: " << j << ", time: " << t << " ms";
   }
   LOG(INFO) << "================== Speed Report ===================";
   LOG(INFO) << "Model: " << model_dir
             << ", power_mode: " << static_cast<int>(power_mode)
             << ", threads num " << thread_num << ", warmup: " << warmup_times
-            << ", repeats: " << repeat << ", avg time: " << ti.get_average_ms()
+            << ", repeats: " << repeat << ", avg time: " << ti.LapTimes().Avg()
             << " ms"
-            << ", min time: " << ti.get_min_time() << " ms"
-            << ", max time: " << ti.get_max_time() << " ms.";
+            << ", min time: " << ti.LapTimes().Min() << " ms"
+            << ", max time: " << ti.LapTimes().Max() << " ms.";
   auto output = predictor->GetOutput(0);
   auto out = output->data<float>();
...
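This hunk is the template for the timer-API migration repeated in every timed test below: start()/end()/get_*_ms() become Start()/Stop()/LapTimes(). A minimal sketch of the new pattern, using only the lite::profile::Timer API added by this commit (the helper name and workload are illustrative):

#include "lite/core/profile/timer.h"
#include "lite/utils/cp_logging.h"

using paddle::lite::profile::Timer;

void BenchSketch(int repeats) {
  Timer t;
  for (int i = 0; i < repeats; ++i) {
    t.Start();
    // ... workload under test ...
    float lap_ms = t.Stop();  // Stop() records and returns the lap in ms
    LOG(INFO) << "iter " << i << ": " << lap_ms << " ms";
  }
  LOG(INFO) << "avg: " << t.LapTimes().Avg()
            << " ms, min: " << t.LapTimes().Min()
            << " ms, max: " << t.LapTimes().Max() << " ms";
  t.Reset();  // drops the recorded laps (replaces the old clear())
}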
@@ -99,7 +99,7 @@ add_custom_target(all_kernel_faked_cc DEPENDS all_kernel_faked.cc)
 #----------------------------------------------- NOT CHANGE -----------------------------------------------
 lite_cc_library(kernel SRCS kernel.cc
         DEPS context type_system target_wrapper any op_params tensor
-        PROFILE_DEPS basic_profiler
+        PROFILE_DEPS lite_profiler
   )
 lite_cc_library(op SRCS op_lite.cc DEPS scope op_registry target_wrapper kernel
     cpp_op_desc tensor
@@ -113,7 +113,7 @@ lite_cc_library(type_system SRCS type_system.cc DEPS tensor target_wrapper)
 lite_cc_library(program SRCS program.cc
         DEPS op kernel model_parser ${ops} ${cpp_wrapper}
-        PROFILE_DEPS basic_profiler)
+        PROFILE_DEPS lite_profiler)
 if (NOT LITE_ON_TINY_PUBLISH)
   lite_cc_library(optimizer SRCS optimizer.cc DEPS mir_pass_manager model_parser program)
...
@@ -37,6 +37,9 @@ void TestCase::CreateInstruction() {
   // prepare context
   (*it)->SetContext(std::move(ctx_));
   instruction_.reset(new Instruction(op, std::move(*it)));
+#ifdef LITE_WITH_PROFILE
+  instruction_->set_profiler(new profile::Profiler());
+#endif
 }
 void TestCase::PrepareInputsForInstruction() {
...
@@ -31,7 +31,7 @@
 #include "lite/utils/replace_stl/stream.h"
 #ifdef LITE_WITH_PROFILE
-#include "lite/core/profile/basic_profiler.h"
+#include "lite/core/profile/profiler.h"
 #endif  // LITE_WITH_PROFILE
 namespace paddle {
@@ -58,7 +58,10 @@ class KernelBase {
   virtual void Run() = 0;
 #ifdef LITE_WITH_PROFILE
-  void SetProfileID(uint32_t id) { profile_id_ = id; }
+  void SetProfiler(profile::Profiler* profiler, int id) {
+    profiler_ = profiler;
+    profile_id_ = id;
+  }
 #endif
   void Launch() {
@@ -82,10 +85,12 @@
 #endif
 #ifdef LITE_WITH_PROFILE
-    if (profile_id_ >= 0) {
-      profile::ProfileBlock x(profile_id_, "kernel");
+    CHECK(profiler_) << "Profiler pointer of kernel can not be nullptr. "
+                        "When LITE_WITH_PROFILE is defined, please set a "
+                        "Profiler for Instruction.";
+    profiler_->StartTiming(profile_id_, ctx_.get());
     Run();
-    }
+    profiler_->StopTiming(profile_id_, ctx_.get());
 #else
     Run();
 #endif
@@ -175,6 +180,7 @@ class KernelBase {
   bool is_first_epoch_{true};
 #ifdef LITE_WITH_PROFILE
+  profile::Profiler* profiler_{nullptr};
   int profile_id_{-1};
 #endif
 };
...
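The consequence of the hunk above is that, with LITE_WITH_PROFILE defined, Launch() now aborts unless a Profiler and a timer slot were registered first. A hedged sketch of the caller's side (the real call site is Instruction::set_profiler in program.h later in this diff; the op type here is illustrative):

paddle::lite::profile::Profiler profiler("run");
paddle::lite::profile::OpCharacter ch;
ch.target = kernel->target();       // kernel: a configured KernelBase*
ch.op_type = "conv2d";              // illustrative
ch.kernel_name = kernel->name();
int id = profiler.NewTimer(ch);     // one timing slot per kernel
kernel->SetProfiler(&profiler, id);
kernel->Launch();                   // Run() is bracketed by StartTiming(id, ...)
                                    // and StopTiming(id, ...)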
@@ -5,4 +5,5 @@ endif()
 lite_cc_library(basic_profiler SRCS basic_profiler.cc DEPS gflags)
 lite_cc_test(test_basic_profiler SRCS basic_profiler_test.cc DEPS basic_profiler)
+lite_cc_library(lite_profiler SRCS profiler.cc DEPS context)
+lite_cc_test(test_lite_timer SRCS test_timer.cc DEPS lite_profiler)
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/core/profile/profiler.h"
#include <map>
#include <string>
#include <utility>
namespace paddle {
namespace lite {
namespace profile {
int Profiler::NewTimer(const OpCharacter& ch) {
StatisUnit unit;
unit.character = ch;
if (ch.target == TargetType::kCUDA) {
#ifdef LITE_WITH_CUDA
unit.timer.reset(new DeviceTimer<TargetType::kCUDA>());
#else
LOG(ERROR) << "The timer type specified as cuda is uninitialized, so the "
"default x86 timer is used instead.";
#endif
} else {
unit.timer.reset(new DeviceTimer<TargetType::kHost>());
}
units_.push_back(std::move(unit));
return units_.size() - 1;
}
void Profiler::StartTiming(const int index, KernelContext* ctx) {
CHECK_LT(index, units_.size())
<< "The timer index in the profiler is out of range.";
units_[index].timer->Start(ctx);
}
float Profiler::StopTiming(const int index, KernelContext* ctx) {
CHECK_LT(index, units_.size())
<< "The timer index in the profiler is out of range.";
return units_[index].timer->Stop(ctx);
}
std::string Profiler::Summary(bool concise) {
STL::stringstream ss;
auto cout_title = [&ss](const std::string& title, const std::string& name) {
// clang-format off
ss << "===== " << title << ": " << name << " =====" << std::endl;
ss << std::setw(25) << std::left << "Operator Type" \
<< std::setw(40) << std::left << "Kernel Name" \
<< std::setw(10) << std::left << "Remark" \
<< std::setw(10) << std::left << "Avg (ms)" \
<< std::setw(10) << std::left << "Min (ms)" \
<< std::setw(10) << std::left << "Max (ms)" \
<< std::endl;
// clang-format on
};
if (concise) {
auto op_comp = [](const OpCharacter& c1, const OpCharacter& c2) {
return (c1.target < c2.target) || (c1.op_type < c2.op_type) ||
(c1.kernel_name < c2.kernel_name) || (c1.remark < c2.remark);
};
std::map<OpCharacter, TimeInfo, decltype(op_comp)> summary(op_comp);
for (auto& unit : units_) {
auto ch = summary.find(unit.character);
if (ch != summary.end()) {
ch->second.avg += unit.timer->LapTimes().Avg();
ch->second.min += unit.timer->LapTimes().Min();
ch->second.max += unit.timer->LapTimes().Max();
} else {
TimeInfo info({unit.timer->LapTimes().Avg(),
unit.timer->LapTimes().Min(),
unit.timer->LapTimes().Max()});
summary.insert({unit.character, info});
}
}
cout_title("Concise Profiler Summary", name_);
for (const auto& item : summary) {
// clang-format off
ss << std::setw(25) << std::left << item.first.op_type \
<< std::setw(40) << std::left << item.first.kernel_name \
<< std::setw(10) << std::left << item.first.remark \
<< std::setw(10) << std::left << item.second.avg \
<< std::setw(10) << std::left << item.second.min \
<< std::setw(10) << std::left << item.second.max \
<< std::endl;
// clang-format on
}
} else {
cout_title("Detailed Profiler Summary", name_);
for (auto& unit : units_) {
// clang-format off
ss << std::setw(25) << std::left << unit.character.op_type \
<< std::setw(40) << std::left << unit.character.kernel_name \
<< std::setw(10) << std::left << unit.character.remark \
<< std::setw(10) << std::left << unit.timer->LapTimes().Avg() \
<< std::setw(10) << std::left << unit.timer->LapTimes().Min() \
<< std::setw(10) << std::left << unit.timer->LapTimes().Max() \
<< std::endl;
// clang-format on
}
}
return ss.str();
}
} // namespace profile
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <string>
#include <vector>
#include "lite/core/profile/timer.h"
namespace paddle {
namespace lite {
namespace profile {
struct TimeInfo {
float avg;
float min;
float max;
};
struct OpCharacter {
TargetType target;
std::string op_type{std::string("N/A")};
std::string kernel_name{std::string("N/A")};
std::string remark{std::string("N/A")};
};
struct StatisUnit {
std::unique_ptr<Timer> timer;
OpCharacter character;
};
class Profiler final {
public:
Profiler() = default;
explicit Profiler(const std::string& name) : name_(name) {}
int NewTimer(const OpCharacter& ch);
void StartTiming(const int index, KernelContext* ctx);
float StopTiming(const int index, KernelContext* ctx);
std::string Summary(bool concise = true);
private:
std::string name_{std::string("N/A")};
std::vector<StatisUnit> units_;
};
} // namespace profile
} // namespace lite
} // namespace paddle
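A minimal host-side sketch of this API (the CUDA path is exercised in test_timer.cc below; for the host timer the KernelContext argument is ignored by Timer::Start/Stop, so nullptr is acceptable; the op names are illustrative):

#include "lite/core/profile/profiler.h"
#include "lite/utils/cp_logging.h"

using paddle::lite::profile::OpCharacter;
using paddle::lite::profile::Profiler;

void ProfilerSketch() {
  Profiler profiler("demo");
  OpCharacter ch;
  ch.target = paddle::lite::TargetType::kHost;
  ch.op_type = "conv2d";               // illustrative
  ch.kernel_name = "conv2d/host/def";  // illustrative
  int idx = profiler.NewTimer(ch);     // index into the profiler's units_
  for (int i = 0; i < 10; ++i) {
    profiler.StartTiming(idx, nullptr);
    // ... kernel work ...
    profiler.StopTiming(idx, nullptr);
  }
  LOG(INFO) << profiler.Summary();       // concise (grouped) table
  LOG(INFO) << profiler.Summary(false);  // detailed, one row per unit
}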
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <chrono> // NOLINT
#include <thread> // NOLINT
#include "lite/core/context.h"
#include "lite/core/profile/profiler.h"
#include "lite/core/profile/timer.h"
#include "lite/utils/cp_logging.h"
namespace paddle {
namespace lite {
namespace profile {
TEST(timer, real_latency) {
Timer timer;
timer.Start();
std::this_thread::sleep_for(std::chrono::milliseconds(10));
timer.Stop();
timer.Start();
std::this_thread::sleep_for(std::chrono::milliseconds(50));
timer.Stop();
LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg();
}
#ifdef LITE_WITH_CUDA
TEST(gpu_timer, real_latency) {
DeviceTimer<TargetType::kCUDA> timer;
KernelContext ctx;
cudaStream_t exec_stream;
cudaStreamCreate(&exec_stream);
(&ctx.As<CUDAContext>())->SetExecStream(exec_stream);
timer.Start(&ctx);
std::this_thread::sleep_for(std::chrono::milliseconds(10));
timer.Stop(&ctx);
(&timer)->Start(&ctx);
std::this_thread::sleep_for(std::chrono::milliseconds(50));
timer.Stop(&ctx);
LOG(INFO) << "LapTimes().Avg() = " << timer.LapTimes().Avg();
}
TEST(profiler, real_latency) {
KernelContext ctx;
cudaStream_t exec_stream;
cudaStreamCreate(&exec_stream);
(&ctx.As<CUDAContext>())->SetExecStream(exec_stream);
Profiler profiler("name");
profile::OpCharacter ch;
ch.target = TargetType::kCUDA;
ch.op_type = "operator/1";
ch.kernel_name = "kernel/1";
int idx = profiler.NewTimer(ch);
profiler.StartTiming(idx, &ctx);
std::this_thread::sleep_for(std::chrono::milliseconds(10));
profiler.StopTiming(idx, &ctx);
std::cout << profiler.Summary();
}
#endif
} // namespace profile
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <chrono>  // NOLINT
#include <list>
#include <numeric>  // std::accumulate, used by TimeList::Sum()
#ifdef LITE_WITH_CUDA
#include "lite/backends/cuda/cuda_utils.h"
#endif
#include "lite/core/context.h"
namespace paddle {
namespace lite {
namespace profile {
template <typename T>
class TimeList {
public:
void Clear() { laps_t_.clear(); }
void Add(T t) { laps_t_.push_back(t); }
T Max() const { return *std::max_element(laps_t_.begin(), laps_t_.end()); }
T Min() const { return *std::min_element(laps_t_.begin(), laps_t_.end()); }
T Sum() const { return std::accumulate(laps_t_.begin(), laps_t_.end(), 0.0); }
size_t Size() const { return laps_t_.size(); }
T Avg() const {
if (!Size()) {
return 0;
}
return Sum() / Size();
}
const std::list<T>& Raw() const { return laps_t_; }
private:
std::list<T> laps_t_;
};
class Timer {
public:
Timer() = default;
virtual ~Timer() = default;
void Reset() { laps_t_.Clear(); }
void Start() { t_start_ = std::chrono::system_clock::now(); }
float Stop() {
t_stop_ = std::chrono::system_clock::now();
auto ts = std::chrono::duration_cast<std::chrono::microseconds>(t_stop_ -
t_start_);
float elapse_ms = 1000.f * static_cast<float>(ts.count()) *
std::chrono::microseconds::period::num /
std::chrono::microseconds::period::den;
this->laps_t_.Add(elapse_ms);
return elapse_ms;
}
virtual void Start(KernelContext* ctx) { return Start(); }
virtual float Stop(KernelContext* ctx) { return Stop(); }
float AvgLapTimeMs() const { return laps_t_.Avg(); }
const TimeList<float>& LapTimes() const { return laps_t_; }
protected:
std::chrono::time_point<std::chrono::system_clock> t_start_, t_stop_;
TimeList<float> laps_t_;
};
template <TargetType Target>
class DeviceTimer final : public Timer {};
#ifdef LITE_WITH_CUDA
template <>
class DeviceTimer<TargetType::kCUDA> final : public Timer {
public:
DeviceTimer() {
CUDA_CALL(cudaEventCreate(&e_start_));
CUDA_CALL(cudaEventCreate(&e_stop_));
}
~DeviceTimer() {
CUDA_CALL(cudaEventDestroy(e_start_));
CUDA_CALL(cudaEventDestroy(e_stop_));
}
void Start(KernelContext* ctx) {
cudaStream_t stream;
stream = ctx->As<CUDAContext>().exec_stream();
CUDA_CALL(cudaEventRecord(e_start_, stream));
}
float Stop(KernelContext* ctx) {
cudaStream_t stream;
stream = ctx->As<CUDAContext>().exec_stream();
CUDA_CALL(cudaEventRecord(e_stop_, stream));
CUDA_CALL(cudaEventSynchronize(e_stop_));
float elapse_ms = 1.f;
CUDA_CALL(cudaEventElapsedTime(&elapse_ms, e_start_, e_stop_));
this->laps_t_.Add(elapse_ms);
return elapse_ms;
}
private:
cudaEvent_t e_start_, e_stop_;
};
#endif
} // namespace profile
} // namespace lite
} // namespace paddle
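DeviceTimer<Target> specializes Start/Stop per target while the lap statistics live in the shared Timer base, so callers can time through a plain Timer pointer. A sketch of the selection logic, assuming only the types above (it mirrors Profiler::NewTimer, including the fall-back to the host timer that the LOG(ERROR) message there describes):

#include <memory>
#include "lite/core/profile/timer.h"

using paddle::lite::TargetType;

// Returns a timer appropriate for `target`; callers use the base class,
// which is how Profiler stores the timers in its StatisUnits.
std::unique_ptr<paddle::lite::profile::Timer> MakeTimer(TargetType target) {
#ifdef LITE_WITH_CUDA
  if (target == TargetType::kCUDA) {
    // Laps are measured with cudaEvent pairs recorded on the exec stream.
    return std::unique_ptr<paddle::lite::profile::Timer>(
        new paddle::lite::profile::DeviceTimer<TargetType::kCUDA>());
  }
#endif
  // Host laps fall back to std::chrono in the base Timer.
  return std::unique_ptr<paddle::lite::profile::Timer>(
      new paddle::lite::profile::DeviceTimer<TargetType::kHost>());
}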
@@ -122,6 +122,9 @@ void RuntimeProgram::Run() {
 #endif  // LITE_WITH_PRECISION_PROFILE
 #endif  // LITE_WITH_PROFILE
   }
+#ifdef LITE_WITH_PROFILE
+  LOG(INFO) << "\n" << profiler_.Summary();
+#endif  // LITE_WITH_PROFILE
 }
 void Program::Build(const cpp::ProgramDesc& prog) {
@@ -183,11 +186,6 @@ void Program::PrepareWorkspace(const cpp::ProgramDesc& prog) {
 void Instruction::Run() {
   CHECK(op_) << "op null";
   CHECK(kernel_) << "kernel null";
-#ifdef LITE_WITH_PROFILE
-  if (profile_id_ >= 0) {
-    profile::ProfileBlock x(profile_id_, "instruction");
-  }
-#endif  // LITE_WITH_PROFILE
   if (first_epoch_) {
     first_epoch_ = false;
     CHECK(op_->CheckShape());
...
@@ -22,9 +22,6 @@
 #include "lite/core/op_lite.h"
 #include "lite/core/op_registry.h"
 #include "lite/model_parser/cpp/program_desc.h"
-#ifdef LITE_WITH_PROFILE
-#include "lite/core/profile/basic_profiler.h"
-#endif  // LITE_WITH_PROFILE
 namespace paddle {
 namespace lite {
@@ -87,22 +84,7 @@ struct Program {
 struct Instruction {
   Instruction(const std::shared_ptr<OpLite>& op,
               std::unique_ptr<KernelBase>&& kernel)
-      : op_(op), kernel_(std::move(kernel)) {
-#ifdef LITE_WITH_PROFILE
-    if (op_->Type() != "feed" && op_->Type() != "fetch") {
-      profile_id_ = profile::BasicProfiler<profile::BasicTimer>::Global()
-                        .NewRcd(kernel_->SerializedKernelType())
-                        .id();
-      kernel_->SetProfileID(profile_id_);
-      // Set profile custom info
-      auto& profiler =
-          *profile::BasicProfiler<profile::BasicTimer>::Global().mutable_record(
-              profile_id_);
-      profiler.SetCustomInfo("op_type", op_->Type());
-      profiler.SetCustomInfo("op_info", op_->SerializedOpInfo());
-    }
-#endif  // LITE_WITH_PROFILE
-  }
+      : op_(op), kernel_(std::move(kernel)) {}
   // Run the instruction.
   void Run();
@@ -113,6 +95,20 @@ struct Instruction {
   const KernelBase* kernel() const { return kernel_.get(); }
   KernelBase* mutable_kernel() { return kernel_.get(); }
+#ifdef LITE_WITH_PROFILE
+  void set_profiler(profile::Profiler* profiler) {
+    profiler_ = profiler;
+    if (op_->Type() != "feed" && op_->Type() != "fetch") {
+      profile::OpCharacter ch;
+      ch.target = kernel()->target();
+      ch.op_type = op_->Type();
+      ch.kernel_name = kernel()->name();
+      profile_id_ = profiler->NewTimer(ch);
+      kernel_->SetProfiler(profiler_, profile_id_);
+    }
+  }
+#endif
  private:
   std::shared_ptr<OpLite> op_;
   std::unique_ptr<KernelBase> kernel_;
@@ -120,7 +116,7 @@ struct Instruction {
   bool has_run_{false};
 #ifdef LITE_WITH_PROFILE
-  // for profiler
+  profile::Profiler* profiler_;
   int profile_id_{-1};
 #endif  // LITE_WITH_PROFILE
 };
@@ -135,6 +131,9 @@ class LITE_API RuntimeProgram {
     if (instructions_.empty()) {
      LOG(FATAL) << "no instructions";
     }
+#ifdef LITE_WITH_PROFILE
+    set_profiler();
+#endif
   }
   void Run();
@@ -159,6 +158,15 @@ class LITE_API RuntimeProgram {
   RuntimeProgram(const RuntimeProgram&) = delete;
   std::vector<Instruction> instructions_;
   lite::Scope* exec_scope_{};
+#ifdef LITE_WITH_PROFILE
+  profile::Profiler profiler_;
+  void set_profiler() {
+    for (auto i = instructions_.begin(); i != instructions_.end(); ++i) {
+      i->set_profiler(&profiler_);
+    }
+  }
+#endif
 };
 }  // namespace lite
...
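Taken together with the kernel.h and program.cc hunks, the profiling wiring under LITE_WITH_PROFILE reads as the following call chain (a reconstruction from this diff, written as a comment trace):

// RuntimeProgram owns the Profiler; Instructions and kernels borrow it.
//
// RuntimeProgram ctor
//   -> set_profiler()                              // once, at construction
//      -> Instruction::set_profiler(&profiler_)    // for every instruction
//         -> profiler_->NewTimer(ch)               // one StatisUnit per op
//         -> kernel_->SetProfiler(profiler_, profile_id_)
//            (skipped for "feed"/"fetch" ops)
// RuntimeProgram::Run()
//   -> Instruction::Run() -> KernelBase::Launch()
//      -> profiler_->StartTiming(profile_id_, ctx_.get());
//         Run();
//         profiler_->StopTiming(profile_id_, ctx_.get());
//   -> LOG(INFO) << "\n" << profiler_.Summary();   // after the last op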
@@ -17,8 +17,8 @@
 #include <math.h>
 #include <random>
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/tests/cv/cv_basic.h"
-#include "lite/tests/utils/timer.h"
 #include "lite/utils/cv/paddle_image_preprocess.h"
 DEFINE_int32(cluster, 3, "cluster id");
@@ -46,7 +46,7 @@ typedef paddle::lite::utils::cv::ImagePreprocess ImagePreprocess;
 typedef paddle::lite_api::Tensor Tensor_api;
 typedef paddle::lite::Tensor Tensor;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;
 void fill_tensor_host_rand(uint8_t* dio, int64_t size) {
   uint seed = 256;
@@ -285,8 +285,8 @@ void test_img(const std::vector<int>& cluster_id,
   ImagePreprocess image_preprocess(srcFormat, dstFormat, tparam);
   for (int i = 0; i < test_iter; ++i) {
-    t1.clear();
-    t1.start();
+    t1.Reset();
+    t1.Start();
     LOG(INFO) << "image convert saber compute";
     // Method 1: image_preprocess.imageCovert(src, lite_dst);
@@ -329,8 +329,8 @@ void test_img(const std::vector<int>& cluster_id,
                              means,
                              scales);
-    t1.end();
-    double tdiff = t1.get_average_ms();
+    t1.Stop();
+    double tdiff = t1.LapTimes().Avg();
     to += tdiff;
     if (tdiff < min_time) {
       min_time = tdiff;
...
@@ -15,10 +15,10 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/operators/op_params.h"
 #include "lite/tests/utils/naive_math_impl.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"
 #ifdef LITE_WITH_ARM
 #include "lite/kernels/arm/conv_compute.h"
@@ -59,7 +59,7 @@ DEFINE_bool(flag_bias, true, "with bias");
 typedef paddle::lite::DDim DDim;
 typedef paddle::lite::Tensor Tensor;
 typedef paddle::lite::operators::ConvParam ConvParam;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;
 DDim compute_out_dim(const DDim& dim_in,
                      const paddle::lite::operators::ConvParam& param) {
@@ -205,19 +205,19 @@ void test_conv_fp32(const std::vector<DDim>& input_dims,
     /// compute
     Timer t0;
     for (int i = 0; i < FLAGS_repeats; ++i) {
-      t0.start();
+      t0.Start();
       conv.Launch();
-      t0.end();
+      t0.Stop();
     }
     double gops = 2.0 * dim_out.production() * dim_in[1] * weight_dim[2] *
                   weight_dim[3] / param.groups;
     LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape"
-              << dim_out << ",running time, avg: " << t0.get_average_ms()
-              << ", min time: " << t0.get_min_time()
+              << dim_out << ",running time, avg: " << t0.LapTimes().Avg()
+              << ", min time: " << t0.LapTimes().Min()
               << ", total GOPS: " << 1e-9 * gops
-              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-              << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+              << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();
     if (FLAGS_check_result) {
       double max_ratio = 0;
...
@@ -15,10 +15,10 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/operators/op_params.h"
 #include "lite/tests/utils/naive_math_impl.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"
 #ifdef LITE_WITH_ARM
 #include "lite/kernels/arm/conv_compute.h"
@@ -59,7 +59,7 @@ DEFINE_bool(flag_bias, true, "with bias");
 typedef paddle::lite::DDim DDim;
 typedef paddle::lite::Tensor Tensor;
 typedef paddle::lite::operators::ConvParam ConvParam;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;
 DDim compute_out_dim(const DDim& dim_in,
                      const paddle::lite::operators::ConvParam& param) {
@@ -309,30 +309,30 @@ void test_conv_int8(const std::vector<DDim>& input_dims,
     /// compute fp32 output
     Timer t0;
     for (int i = 0; i < FLAGS_repeats; ++i) {
-      t0.start();
+      t0.Start();
       conv_int8_fp32.Launch();
-      t0.end();
+      t0.Stop();
     }
     LOG(INFO) << "int8 conv, fp32 output: output shape" << dim_out
-              << ",running time, avg: " << t0.get_average_ms()
-              << ", min time: " << t0.get_min_time()
+              << ",running time, avg: " << t0.LapTimes().Avg()
+              << ", min time: " << t0.LapTimes().Min()
               << ", total GOPS: " << 1e-9 * gops
-              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-              << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+              << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();
     /// compute int8 output
-    t0.clear();
+    t0.Reset();
     for (int i = 0; i < FLAGS_repeats; ++i) {
-      t0.start();
+      t0.Start();
       conv_int8_int8.Launch();
-      t0.end();
+      t0.Stop();
     }
     LOG(INFO) << "int8 conv, int8 output: output shape" << dim_out
-              << ",running time, avg: " << t0.get_average_ms()
-              << ", min time: " << t0.get_min_time()
+              << ",running time, avg: " << t0.LapTimes().Avg()
+              << ", min time: " << t0.LapTimes().Min()
              << ", total GOPS: " << 1e-9 * gops
-              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-              << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+              << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();
     /// compare result fp32 output
     if (FLAGS_check_result) {
...
@@ -15,10 +15,10 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/operators/op_params.h"
 #include "lite/tests/utils/naive_math_impl.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"
 #ifdef LITE_WITH_ARM
 #include "lite/kernels/arm/conv_transpose_compute.h"
@@ -59,7 +59,7 @@ DEFINE_bool(flag_bias, false, "with bias");
 typedef paddle::lite::DDim DDim;
 typedef paddle::lite::Tensor Tensor;
 typedef paddle::lite::operators::ConvParam ConvParam;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;
 DDim compute_out_dim(const DDim& dim_in,
                      const paddle::lite::operators::ConvParam& param) {
@@ -187,19 +187,19 @@ void test_conv_transpose_fp32(const std::vector<DDim>& input_dims,
     /// compute
     Timer t0;
     for (int i = 0; i < FLAGS_repeats; ++i) {
-      t0.start();
+      t0.Start();
       conv_t.Launch();
-      t0.end();
+      t0.Stop();
     }
     float gops =
         2.f * tmp_weights.numel() * dim_in[0] * dim_in[2] * dim_in[3];
     LOG(INFO) << "conv fp32: input shape: " << dim_in << ", output shape"
-              << dim_out << ",running time, avg: " << t0.get_average_ms()
-              << ", min time: " << t0.get_min_time()
+              << dim_out << ",running time, avg: " << t0.LapTimes().Avg()
+              << ", min time: " << t0.LapTimes().Min()
              << ", total GOPS: " << 1e-9 * gops
-              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-              << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+              << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();
     if (FLAGS_check_result) {
       double max_ratio = 0;
...
@@ -20,12 +20,12 @@
 #include "lite/backends/arm/math/funcs.h"
 #endif  // LITE_WITH_ARM
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/core/tensor.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"
 typedef paddle::lite::Tensor Tensor;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;
 DEFINE_int32(power_mode,
              3,
@@ -193,7 +193,7 @@ bool test_gemm_int8(bool tra,
     dbias_int8[l] = dbias[l] / scale_c[0];
   }
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
    paddle::lite::arm::math::gemm_prepack_int8(tpackedA.data<int8_t>(),
                                               db,
                                               dbias_int8,
@@ -206,21 +206,21 @@ bool test_gemm_int8(bool tra,
                                               trb,
                                               scale_merge_int8.data(),
                                               &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "gemm_int8_int8 output: M: " << m << ", N: " << n << ", K: " << k
             << ", power_mode: " << cls << ", threads: " << ths
             << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
             << " GOPs";
   /// fp32 output compute
-  t0.clear();
+  t0.Reset();
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
    paddle::lite::arm::math::gemm_prepack_int8(tpackedA.data<int8_t>(),
                                               db,
                                               dbias,
@@ -233,15 +233,15 @@ bool test_gemm_int8(bool tra,
                                               trb,
                                               scale_merge_fp32.data(),
                                               &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "gemm_int8_fp32 output: M: " << m << ", N: " << n << ", K: " << k
             << ", power_mode: " << cls << ", threads: " << ths
             << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
            << " GOPs";
   if (FLAGS_check_result) {
...
@@ -20,12 +20,12 @@
 #include "lite/backends/arm/math/funcs.h"
 #endif  // LITE_WITH_ARM
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/core/tensor.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"
 typedef paddle::lite::Tensor Tensor;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;
 DEFINE_int32(power_mode,
              3,
@@ -165,7 +165,7 @@ bool test_gemv_int8(
     dbias_int8[l] = dbias[l] / scale_c[0];
   }
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
    paddle::lite::arm::math::gemv_int8(da,
                                       db,
                                       dc_fp32,
@@ -177,21 +177,21 @@ bool test_gemv_int8(
                                       dbias,
                                       has_relu,
                                       &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "gemv_int8_int8 output: M: " << m << ", N: " << n
             << ", power_mode: " << cls << ", threads: " << ths
             << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
            << " GOPs";
   /// fp32 output compute
-  t0.clear();
+  t0.Reset();
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
    paddle::lite::arm::math::gemv_int8(da,
                                       db,
                                       dc_int8,
@@ -203,15 +203,15 @@ bool test_gemv_int8(
                                       dbias_int8,
                                       has_relu,
                                       &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "gemm_int8_fp32 output: M: " << m << ", N: " << n
             << ", power_mode: " << cls << ", threads: " << ths
             << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
            << " GOPs";
   if (FLAGS_check_result) {
...
@@ -15,10 +15,10 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/operators/op_params.h"
 #include "lite/tests/utils/naive_math_impl.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"
 #ifdef LITE_WITH_ARM
 #include "lite/kernels/arm/layout_compute.h"
@@ -48,7 +48,7 @@ typedef paddle::lite::DDim DDim;
 typedef paddle::lite::Tensor Tensor;
 typedef paddle::lite::operators::LayoutParam LayoutParam;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;
 #define IN(n, c, h, w) \
   input_data[w + h * input_w + c * input_h * input_w + \
@@ -165,17 +165,17 @@ void test_layout_fp32_nchw(DDim dim_in,
   /// compute
   Timer t0;
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     layout.Run();
-    t0.end();
+    t0.Stop();
   }
   double gops = 2.0 * dim_out.production();
   LOG(INFO) << "layout fp32: input shape: " << dim_in << ", output shape"
-            << dim_out << ",running time, avg: " << t0.get_average_ms()
-            << ", min time: " << t0.get_min_time()
+            << dim_out << ",running time, avg: " << t0.LapTimes().Avg()
+            << ", min time: " << t0.LapTimes().Min()
            << ", total GOPS: " << 1e-9 * gops
-            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-            << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();
   if (FLAGS_check_result) {
     double max_ratio = 0;
@@ -268,17 +268,17 @@ void test_layout_fp32_nhwc(DDim dim_in,
   /// compute
   Timer t0;
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     layout.Run();
-    t0.end();
+    t0.Stop();
   }
   double gops = 2.0 * dim_out.production();
   LOG(INFO) << "layout fp32: input shape: " << dim_in << ", output shape"
-            << dim_out << ",running time, avg: " << t0.get_average_ms()
-            << ", min time: " << t0.get_min_time()
+            << dim_out << ",running time, avg: " << t0.LapTimes().Avg()
+            << ", min time: " << t0.LapTimes().Min()
            << ", total GOPS: " << 1e-9 * gops
-            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-            << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();
   if (FLAGS_check_result) {
     double max_ratio = 0;
@@ -370,18 +370,18 @@ void test_layout_int8_nchw(DDim dim_in,
   /// compute
   Timer t0;
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     layout.Run();
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "saber compute end";
   double gops = 2.0 * dim_out.production();
   LOG(INFO) << "layout int8: input shape: " << dim_in << ", output shape"
-            << dim_out << ",running time, avg: " << t0.get_average_ms()
-            << ", min time: " << t0.get_min_time()
+            << dim_out << ",running time, avg: " << t0.LapTimes().Avg()
+            << ", min time: " << t0.LapTimes().Min()
            << ", total GOPS: " << 1e-9 * gops
-            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-            << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();
   if (FLAGS_check_result) {
     double max_ratio = 0;
@@ -474,18 +474,18 @@ void test_layout_int8_nhwc(DDim dim_in,
   /// compute
   Timer t0;
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     layout.Run();
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "run";
   double gops = 2.0 * dim_out.production();
   LOG(INFO) << "layout int8: input shape: " << dim_in << ", output shape"
-            << dim_out << ",running time, avg: " << t0.get_average_ms()
-            << ", min time: " << t0.get_min_time()
+            << dim_out << ",running time, avg: " << t0.LapTimes().Avg()
+            << ", min time: " << t0.LapTimes().Min()
            << ", total GOPS: " << 1e-9 * gops
-            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-            << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+            << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();
   if (FLAGS_check_result) {
     double max_ratio = 0;
...
@@ -15,10 +15,10 @@
 #include <gflags/gflags.h>
 #include <gtest/gtest.h>
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/operators/op_params.h"
 #include "lite/tests/utils/naive_math_impl.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"
 #ifdef LITE_WITH_ARM
 #include "lite/kernels/arm/pool_compute.h"
@@ -60,7 +60,7 @@ DEFINE_string(pooling_type, "max", "do max pooling");
 typedef paddle::lite::DDim DDim;
 typedef paddle::lite::Tensor Tensor;
 typedef paddle::lite::operators::PoolParam PoolParam;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;
 DDim compute_out_dim(const DDim& dim_in,
                      const paddle::lite::operators::PoolParam& param) {
@@ -320,18 +320,18 @@ void test_pool_fp32(const std::vector<DDim>& input_dims,
     /// compute
     Timer t0;
     for (int i = 0; i < FLAGS_repeats; ++i) {
-      t0.start();
+      t0.Start();
       pool.Launch();
-      t0.end();
+      t0.Stop();
     }
     double gops = 2.0 * dim_out.production() * ksize[0] * ksize[1];
     LOG(INFO) << "pool fp32: input shape: " << dim_in << ", output shape"
-              << dim_out << ", running time, avg: " << t0.get_average_ms()
-              << ", min time: " << t0.get_min_time()
+              << dim_out << ", running time, avg: " << t0.LapTimes().Avg()
+              << ", min time: " << t0.LapTimes().Min()
             << ", total GOPS: " << 1e-9 * gops
-              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.get_average_ms()
-              << " GOPs, max GOPs: " << 1e-6 * gops / t0.get_min_time();
+              << " GOPS, avg GOPs: " << 1e-6 * gops / t0.LapTimes().Avg()
+              << " GOPs, max GOPs: " << 1e-6 * gops / t0.LapTimes().Min();
     if (FLAGS_check_result) {
       double max_ratio = 0;
...
@@ -20,12 +20,12 @@
 #include "lite/backends/arm/math/funcs.h"
 #endif  // LITE_WITH_ARM
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/core/tensor.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"
 typedef paddle::lite::Tensor Tensor;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;
 DEFINE_int32(power_mode,
              3,
@@ -134,18 +134,18 @@ bool test_sgemm_c4(
   }
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     paddle::lite::arm::math::sgemm_prepack_c4(
         m, n, k, da_c4, db_c4, dc, dbias, has_bias, has_relu, &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k
             << ", power_mode: " << cls << ", threads: " << ths
             << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
            << " GOPs";
   if (FLAGS_check_result) {
...
@@ -20,12 +20,12 @@
 #include "lite/backends/arm/math/funcs.h"
 #endif  // LITE_WITH_ARM
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/core/tensor.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"
 typedef paddle::lite::Tensor Tensor;
-using paddle::lite::Timer;
+using paddle::lite::profile::Timer;
 DEFINE_int32(power_mode,
              3,
@@ -171,7 +171,7 @@ bool test_sgemm(bool tra,
     if (i == FLAGS_repeats - 1) {
       memcpy(dc, dc_backup, sizeof(float) * m * ldc);
    }
-    t0.start();
+    t0.Start();
    paddle::lite::arm::math::sgemm_prepack(trb,
                                           m,
                                           n,
@@ -186,15 +186,15 @@ bool test_sgemm(bool tra,
                                           has_bias,
                                           has_relu,
                                           &ctx);
-    t0.end();
+    t0.Stop();
  }
  LOG(INFO) << "M: " << m << ", N: " << n << ", K: " << k
            << ", power_mode: " << cls << ", threads: " << ths
            << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
            << " GOPs";
  if (FLAGS_check_result) {
...
@@ -20,9 +20,9 @@
 #include "lite/backends/arm/math/funcs.h"
 #endif  // LITE_WITH_ARM
 #include "lite/core/context.h"
+#include "lite/core/profile/timer.h"
 #include "lite/core/tensor.h"
 #include "lite/tests/utils/tensor_utils.h"
-#include "lite/tests/utils/timer.h"
 typedef paddle::lite::Tensor Tensor;
@@ -83,7 +83,7 @@ bool test_sgemv(
     basic_gemv(
         m, k, da, db, dbias, dc_basic, 1.f, 0.f, tra, has_bias, has_relu);
   }
-  paddle::lite::Timer t0;
+  paddle::lite::profile::Timer t0;
   //! compute
   double ops = 2.0 * m * k;
   std::unique_ptr<paddle::lite::KernelContext> ctx1(
@@ -96,19 +96,19 @@ bool test_sgemv(
         da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx);
   }
-  t0.clear();
+  t0.Reset();
   for (int i = 0; i < FLAGS_repeats; ++i) {
-    t0.start();
+    t0.Start();
     paddle::lite::arm::math::sgemv(
         da, db, dc, tra, m, k, has_bias, dbias, has_relu, &ctx);
-    t0.end();
+    t0.Stop();
   }
   LOG(INFO) << "gemv output: M: " << m << ", K: " << k << ", cluster: " << cls
             << ", threads: " << ths << ", GOPS: " << ops * 1e-9f
-            << " GOPS, avg time: " << t0.get_average_ms()
-            << " ms, min time: " << t0.get_min_time()
-            << " ms, mean GOPs: " << ops * 1e-6f / t0.get_average_ms()
-            << " GOPs, max GOPs: " << ops * 1e-6f / t0.get_min_time()
+            << " GOPS, avg time: " << t0.LapTimes().Avg()
+            << " ms, min time: " << t0.LapTimes().Min()
+            << " ms, mean GOPs: " << ops * 1e-6f / t0.LapTimes().Avg()
+            << " GOPs, max GOPs: " << ops * 1e-6f / t0.LapTimes().Min()
            << " GOPs";
   if (FLAGS_check_result) {
...
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <chrono> // NOLINT
#include <list>
namespace paddle {
namespace lite {
class Timer final {
public:
Timer() {}
~Timer() {}
void clear() { ms_time_.clear(); }
void start() { tstart_ = std::chrono::system_clock::now(); }
void end() {
tend_ = std::chrono::system_clock::now();
auto ts =
std::chrono::duration_cast<std::chrono::microseconds>(tend_ - tstart_);
latest_time_ = 1000.f * static_cast<float>(ts.count()) *
std::chrono::microseconds::period::num /
std::chrono::microseconds::period::den;
ms_time_.push_back(latest_time_);
}
float latest_time() const { return latest_time_; }
float get_average_ms() {
if (ms_time_.size() == 0) {
return 0.f;
}
float sum = 0.f;
for (auto i : ms_time_) {
sum += i;
}
return sum / ms_time_.size();
}
float get_sum_ms() {
if (ms_time_.size() == 0) {
return 0.f;
}
float sum = 0.f;
for (auto i : ms_time_) {
sum += i;
}
return sum;
}
// return tile (0-99) time.
float get_tile_time(float tile) {
if (tile < 0 || tile > 100) {
return -1.f;
}
int total_items = static_cast<int>(ms_time_.size());
if (total_items <= 0) {
return -2.f;
}
ms_time_.sort();
int pos = static_cast<int>(tile * total_items / 100);
auto it = ms_time_.begin();
for (int i = 0; i < pos; ++i) {
++it;
}
return *it;
}
std::list<float> get_time_stat() { return ms_time_; }
float get_min_time() {
ms_time_.sort();
return *ms_time_.begin();
}
float get_max_time() {
ms_time_.sort([](int a, int b) { return a > b; });
return *ms_time_.begin();
}
private:
std::chrono::time_point<std::chrono::system_clock> tstart_;
std::chrono::time_point<std::chrono::system_clock> tend_;
std::list<float> ms_time_;
float latest_time_;
};
} // namespace lite
} // namespace paddle
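For reference, the deleted lite::Timer above maps onto the new lite::profile::Timer introduced by this commit as follows (derived from the call-site renames in the hunks; get_tile_time has no direct replacement):

// old lite::Timer (deleted)  ->  new lite::profile::Timer
t.start();                    //  t.Start();
t.end();                      //  float ms = t.Stop();  // Stop() returns the lap
t.clear();                    //  t.Reset();
t.latest_time();              //  use the value returned by Stop()
t.get_average_ms();           //  t.LapTimes().Avg();
t.get_min_time();             //  t.LapTimes().Min();
t.get_max_time();             //  t.LapTimes().Max();
t.get_sum_ms();               //  t.LapTimes().Sum();
t.get_time_stat();            //  t.LapTimes().Raw();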