Commit e18ba102 authored by Yuan Shuai, committed by GitHub

[LITE][OPENCL] Enhance Profiler for OpenCL with in/out/filter shape, macs/macs_ps, real backend kernel etc. (#3641)

* [LITE][OPENCL] Enhance Precision Profiler for OpenCL. test=develop
Parent ed788a4a
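In non-concise mode, the dispatch summary produced by Profiler::Summary() (see below) now reports per kernel: OperatorType, KernelAttr, KernelName, Remark, InDim, FilterDim, OutDim, Avg/Min/Max/Last (ms), Avg (%), GOPs, and GOPS; builds with LITE_WITH_OPENCL additionally get clAvg/clMin/clMax (ms) and clAvg (%), taken from cl::Event profiling rather than host timers.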
......@@ -370,5 +370,26 @@ void CLRuntime::GetAdrenoContextProperties(
properties->push_back(0);
}
// Device execution time of the command (END - START), in ms.
double CLRuntime::GetCommandTime(const cl::Event& event) {
  command_queue().finish();
  auto start_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
  auto stop_nanos = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
  return (stop_nanos - start_nanos) / 1000000.0;
}

// Latency from host enqueue to the start of execution (START - QUEUED), in ms.
double CLRuntime::GetQueuedTime(const cl::Event& event) {
  command_queue().finish();
  return (event.getProfilingInfo<CL_PROFILING_COMMAND_START>() -
          event.getProfilingInfo<CL_PROFILING_COMMAND_QUEUED>()) /
         1000000.0;
}

// Latency from device submission to the start of execution (START - SUBMIT), in ms.
double CLRuntime::GetSubmitTime(const cl::Event& event) {
  command_queue().finish();
  return (event.getProfilingInfo<CL_PROFILING_COMMAND_START>() -
          event.getProfilingInfo<CL_PROFILING_COMMAND_SUBMIT>()) /
         1000000.0;
}
} // namespace lite
} // namespace paddle
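A minimal standalone sketch (not Paddle-Lite code; assuming the Khronos C++ bindings) of the event profiling the three getters above rely on. The command queue must be created with CL_QUEUE_PROFILING_ENABLE; each event then carries four nanosecond timestamps, QUEUED -> SUBMIT -> START -> END, and dividing the deltas by 1e6 yields the millisecond values returned above.

#include <CL/cl.hpp>

// Device-side execution time of the command behind `event`, in ms.
double CommandMs(cl::CommandQueue& queue, const cl::Event& event) {
  queue.finish();  // END is only valid after the command completes
  auto start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
  auto end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
  return static_cast<double>(end - start) / 1e6;
}

// Usage, given a built `kernel` (names here are hypothetical):
//   cl::Context ctx(CL_DEVICE_TYPE_GPU);
//   cl::CommandQueue queue(ctx, CL_QUEUE_PROFILING_ENABLE);
//   cl::Event event;
//   queue.enqueueNDRangeKernel(kernel, cl::NullRange, gws, lws, nullptr, &event);
//   double ms = CommandMs(queue, event);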
......@@ -95,6 +95,12 @@ class CLRuntime {
GpuType& GetGpuType();
double GetCommandTime(const cl::Event& event);
double GetQueuedTime(const cl::Event& event);
double GetSubmitTime(const cl::Event& event);
private:
CLRuntime() { Init(); }
......
......@@ -45,5 +45,18 @@ const char* opencl_error_to_str(cl_int error);
#else
#define CL_CHECK_FATAL(err_code__)
#endif
#ifdef LITE_WITH_PROFILE
#define EnqueueNDRangeKernel( \
context, kernel, gws_offset, gws, lws, event_wait_list, event) \
context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( \
kernel, gws_offset, gws, lws, event_wait_list, &event)
#else
#define EnqueueNDRangeKernel( \
context, kernel, gws_offset, gws, lws, event_wait_list, event) \
context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( \
kernel, gws_offset, gws, lws, event_wait_list, nullptr)
#endif
} // namespace lite
} // namespace paddle
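With this macro the call sites stay identical in both builds: under LITE_WITH_PROFILE the kernel's event is passed through (as &event) so the runtime records profiling timestamps into it, while a plain build passes nullptr and pays no event overhead. For example, a call such as

  status = EnqueueNDRangeKernel(context,
                                kernel,
                                cl::NullRange,
                                global_work_size_,
                                local_work_size_,
                                nullptr,
                                event_);

expands to context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(kernel, cl::NullRange, global_work_size_, local_work_size_, nullptr, &event_) in a profiled build, and to the same call with a trailing nullptr otherwise.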
......@@ -62,6 +62,14 @@ class KernelBase {
profiler_ = profiler;
profile_id_ = id;
}
virtual void SetProfileRuntimeKernelInfo(
paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = std::string("NotImpl");
#ifdef LITE_WITH_OPENCL
    ch->cl_event = event_;  // `event_` exists only in OpenCL builds (declared below)
#endif
}
#endif
void Launch() {
......@@ -90,10 +98,13 @@ class KernelBase {
profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get());
profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
Run();
#ifdef LITE_WITH_OPENCL
      // Drain the queue so the host-side dispatch timer below covers the
      // actual device execution, not just the enqueue cost.
      CLRuntime::Global()->command_queue().finish();
#endif
if (is_first_epoch_for_profiler_) {
SetProfileRuntimeKernelInfo(profiler_->GetOpCharacter(profile_id_));
is_first_epoch_for_profiler_ = false;
}
profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get());
#else
Run();
#endif
......@@ -185,6 +196,11 @@ class KernelBase {
#ifdef LITE_WITH_PROFILE
profile::Profiler* profiler_{nullptr};
int profile_id_{-1};
bool is_first_epoch_for_profiler_{true};
#endif
#ifdef LITE_WITH_OPENCL
cl::Event event_;
#endif
};
......
......@@ -73,6 +73,9 @@ class OpLite : public Registry {
// Indicate whether the Op runs only once or not
virtual bool run_once() const { return false; }
std::string Type() { return op_type_; }
#ifdef LITE_WITH_PROFILE
virtual void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {}
#endif
// Link the external execution environ to internal context.
bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope);
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "lite/core/profile/profiler.h"
#include <iomanip>
#include <map>
#include <string>
#include <utility>
......@@ -64,22 +65,34 @@ int Profiler::NewTimer(const OpCharacter& ch) {
return units_.size() - 1;
}
OpCharacter* Profiler::GetOpCharacter(const size_t index) {
CHECK_LT(index, units_.size())
<< "The timer index in the profiler is out of range.";
return &units_[index].Character();
}
void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) {
CHECK_LT(index, units_.size())
<< "The timer index in the profiler is out of range.";
units_[index].Timer(type)->Start(ctx);
}
float Profiler::StopTiming(Type type, const int index, KernelContext* ctx) {
void Profiler::StopTiming(Type type, const int index, KernelContext* ctx) {
CHECK_LT(index, units_.size())
<< "The timer index in the profiler is out of range.";
return units_[index].Timer(type)->Stop(ctx);
units_[index].Timer(type)->Stop(ctx);
#ifdef LITE_WITH_OPENCL
units_[index].Timer(type)->CLStop(units_[index].character.op_type,
units_[index].character.io_duration,
units_[index].character.cl_event);
#endif
}
std::string Profiler::Summary(Type type, bool concise, size_t w) {
using std::setw;
using std::left;
using std::fixed;
using std::setprecision;
STL::stringstream ss;
std::string title;
// Title.
......@@ -94,14 +107,36 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
<< " Profiler Summary: " << name_ << ", Exclude " << w
<< " warm-ups =====" << std::endl;
}
ss << setw(25) << left << "Operator Type"
<< " " << setw(40) << left << "Kernel Name"
<< " " << setw(12) << left << "Remark"
<< " " << setw(12) << left << "Avg (ms)"
<< " " << setw(12) << left << "Min (ms)"
<< " " << setw(12) << left << "Max (ms)"
<< " " << setw(12) << left << "Last (ms)"
<< " " << setw(12) << left << "Percent (%)" << std::endl;
ss << setw(20) << left << "OperatorType"
<< " " << setw(30) << left << "KernelAttr";
if (!concise) {
ss << " " << setw(24) << left << "KernelName";
}
ss << " " << setw(16) << left << "Remark";
if (!concise) {
ss << " " << setw(15) << left << "InDim"
<< " " << setw(15) << left << "FilterDim"
<< " " << setw(15) << left << "OutDim";
}
ss << " " << setw(7) << left << "Avg(ms)"
<< " " << setw(7) << left << "Min(ms)"
<< " " << setw(7) << left << "Max(ms)";
if (!concise) {
ss << " " << setw(7) << left << "Last(ms)";
}
ss << " " << setw(7) << left << "Avg(%)";
if (!concise) {
ss << " " << setw(7) << left << "GOPs"
<< " " << setw(7) << left << "GOPS";
}
#ifdef LITE_WITH_OPENCL
ss << " " << setw(9) << left << "clAvg(ms)"
<< " " << setw(9) << left << "clMin(ms)"
<< " " << setw(9) << left << "clMax(ms)"
<< " " << setw(9) << left << "clAvg(%)";
#endif
ss << std::endl;
// Profile information.
if (concise) {
std::map<OpCharacter, TimeInfo, decltype(op_comp)> summary(op_comp);
......@@ -111,33 +146,67 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
ch->second.avg += unit.Timer(type)->LapTimes().Avg(w);
ch->second.min += unit.Timer(type)->LapTimes().Min(w);
ch->second.max += unit.Timer(type)->LapTimes().Max(w);
#ifdef LITE_WITH_OPENCL
ch->second.cl_avg += unit.Timer(type)->CLLapTimes().Avg(w);
ch->second.cl_min += unit.Timer(type)->CLLapTimes().Min(w);
ch->second.cl_max += unit.Timer(type)->CLLapTimes().Max(w);
#endif
} else {
TimeInfo info;
info.avg = unit.Timer(type)->LapTimes().Avg(w);
info.min = unit.Timer(type)->LapTimes().Min(w);
info.max = unit.Timer(type)->LapTimes().Max(w);
#ifdef LITE_WITH_OPENCL
info.cl_avg = unit.Timer(type)->CLLapTimes().Avg(w);
info.cl_min = unit.Timer(type)->CLLapTimes().Min(w);
info.cl_max = unit.Timer(type)->CLLapTimes().Max(w);
#endif
summary.insert({unit.Character(), info});
}
}
// compute total time
float total = 0.0;
for (const auto& item : summary) {
total += item.second.avg;
}
#ifdef LITE_WITH_OPENCL
float cl_total = 0.0;
for (const auto& item : summary) {
cl_total += item.second.cl_avg;
}
#endif
for (const auto& item : summary) {
float percent = 0;
if (total > 0) {
percent = 100 * (item.second.avg / total);
}
// clang-format off
ss << setw(25) << left << fixed << item.first.op_type \
<< " " << setw(40) << left << fixed << item.first.kernel_name \
<< " " << setw(12) << left << fixed << item.first.remark \
<< " " << setw(12) << left << fixed << item.second.avg \
<< " " << setw(12) << left << fixed << item.second.min \
<< " " << setw(12) << left << fixed << item.second.max \
<< " " << setw(12) << left << fixed << percent << "%" \
<< " " << std::endl;
ss << setw(20) << left << fixed << item.first.op_type
<< " " << setw(30) << left << fixed << item.first.kernel_attr
<< " " << setw(16) << left << fixed << item.first.remark
<< " " << setw(7) << left << fixed << setprecision(3)
<< item.second.avg
<< " " << setw(7) << left << fixed << setprecision(3)
<< item.second.min
<< " " << setw(7) << left << fixed << setprecision(3)
<< item.second.max
<< " " << setprecision(2) << percent << "% ";
#ifdef LITE_WITH_OPENCL
float cl_percent = 0;
if (cl_total > 0) {
cl_percent = 100 * (item.second.cl_avg / cl_total);
}
ss << " " << setw(9) << left << fixed << setprecision(3)
<< item.second.cl_avg
<< " " << setw(9) << left << fixed << setprecision(3)
<< item.second.cl_min
<< " " << setw(9) << left << fixed << setprecision(3)
<< item.second.cl_max
<< " " << left << fixed <<setprecision(2) << cl_percent << "% ";
#endif
ss << std::endl;
// clang-format on
}
} else {
......@@ -146,6 +215,13 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
const auto& times = unit.Timer(type)->LapTimes();
total += times.Avg(w);
}
#ifdef LITE_WITH_OPENCL
float cl_total = 0.0;
for (auto& unit : units_) {
const auto& cl_times = unit.Timer(type)->CLLapTimes();
cl_total += cl_times.Avg(w);
}
#endif
for (auto& unit : units_) {
const auto& times = unit.Timer(type)->LapTimes();
float run = times.Avg(w);
......@@ -153,17 +229,43 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) {
if (total > 0) {
percent = 100 * (run / total);
}
#ifdef LITE_WITH_OPENCL
const auto& cl_times = unit.Timer(type)->CLLapTimes();
float cl_run = cl_times.Avg(w);
float cl_percent = 0;
if (cl_total > 0) {
cl_percent = 100 * (cl_run / cl_total);
}
#endif
// clang-format off
ss << setw(25) << left << fixed << unit.Character().op_type \
<< " " << setw(40) << left << fixed << unit.Character().kernel_name \
<< " " << setw(12) << left << fixed << unit.Character().remark \
<< " " << setw(12) << left << fixed << times.Avg(w) \
<< " " << setw(12) << left << fixed << times.Min(w) \
<< " " << setw(12) << left << fixed << times.Max(w) \
<< " " << setw(12) << left << fixed << times.Last(w) \
<< " " << setw(12) << left << fixed << percent << "%" \
<< std::endl;
// clang-format on
ss << setw(20) << left << fixed << unit.Character().op_type
<< " " << setw(30) << left << fixed << unit.Character().kernel_attr
<< " " << setw(24) << left << fixed
<< unit.Character().kernel_func_name
<< " " << setw(16) << left << fixed << unit.Character().remark
<< " " << setw(15) << left << fixed << unit.Character().input_shape
<< " " << setw(15) << left << fixed << unit.Character().filter_shape
<< " " << setw(15) << left << fixed << unit.Character().output_shape
<< " " << setw(7) << left << fixed << setprecision(3) << times.Avg(w)
<< " " << setw(7) << left << fixed << setprecision(3) << times.Min(w)
<< " " << setw(7) << left << fixed << setprecision(3) << times.Max(w)
<< " " << setw(7) << left << fixed << setprecision(3) << times.Last(w)
<< " " << left << setprecision(2) << percent << "% "
<< " " << setw(7) << left << fixed << setprecision(2)
<< 1e-9f * unit.Character().macs
<< " " << setw(7) << left << fixed << setprecision(2)
<< 1e-6f * unit.Character().macs / times.Avg(w);
// clang-format on
#ifdef LITE_WITH_OPENCL
ss << " " << setw(9) << left << fixed << setprecision(3)
<< cl_times.Avg(w) << " " << setw(9) << left << fixed
<< setprecision(3) << cl_times.Min(w) << " " << setw(9) << left
<< fixed << setprecision(3) << cl_times.Max(w) << " " << left
<< setprecision(2) << cl_percent << "% ";
#endif
ss << std::endl;
}
}
return ss.str();
......
......@@ -18,6 +18,7 @@
#include <string>
#include <vector>
#include "lite/core/profile/timer.h"
#include "lite/core/tensor.h"
namespace paddle {
namespace lite {
......@@ -35,25 +36,61 @@ struct TimeInfo {
float avg;
float min;
float max;
#ifdef LITE_WITH_OPENCL
float cl_avg;
float cl_min;
float cl_max;
#endif
};
struct OpCharacter {
TargetType target;
void* op_lite{nullptr};
std::string op_type{std::string("N/A")};
std::string kernel_name{std::string("N/A")};
std::string kernel_attr{std::string("N/A")};
std::string kernel_func_name{std::string("N/A")};
std::string remark{std::string("N/A")};
std::string input_shape{"N/A"};
std::string output_shape{"N/A"};
std::string filter_shape{"N/A"};
float macs{0};
float macs_ps{0};
float io_duration{0};
#ifdef LITE_WITH_OPENCL
cl::Event cl_event{};
#else
void* cl_event{nullptr};
#endif
std::string DimToStr(const paddle::lite::DDimLite& dim) {
if (!dim.size()) return "NotImpl";
std::string dim_str{""};
for (size_t i = 0; i < dim.size(); ++i) {
dim_str += std::to_string(dim[i]);
if (i != dim.size() - 1) {
dim_str += "x";
}
}
return dim_str;
}
};
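A quick illustration of DimToStr (hypothetical shape; assumes DDimLite is constructible from an int64_t vector):

  paddle::lite::profile::OpCharacter ch;
  // A 1x3x224x224 NCHW input is rendered as "1x3x224x224"; an empty
  // dim (e.g. an op with no filter) falls back to "NotImpl".
  std::string s = ch.DimToStr(paddle::lite::DDimLite({1, 3, 224, 224}));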
class StatisUnit final {
public:
explicit StatisUnit(const OpCharacter& ch);
lite::profile::Timer* Timer(Type type);
const OpCharacter& Character() const { return character; }
OpCharacter& Character() { return character; }
OpCharacter character;
protected:
std::unique_ptr<lite::profile::Timer> create_t;
std::unique_ptr<lite::profile::Timer> dispatch_t;
OpCharacter character;
};
class Profiler final {
......@@ -62,8 +99,9 @@ class Profiler final {
explicit Profiler(const std::string& name) : name_(name) {}
int NewTimer(const OpCharacter& ch);
void StartTiming(Type type, const int index, KernelContext* ctx);
float StopTiming(Type type, const int index, KernelContext* ctx);
void StopTiming(Type type, const int index, KernelContext* ctx);
std::string Summary(Type type, bool concise = true, size_t warm_up = 10);
OpCharacter* GetOpCharacter(const size_t index);
private:
std::string name_{std::string("N/A")};
......
......@@ -15,6 +15,7 @@
#pragma once
#include <algorithm>
#include <chrono> // NOLINT
#include <string>
#include <vector>
#ifdef LITE_WITH_CUDA
#include "lite/backends/cuda/cuda_utils.h"
......@@ -87,6 +88,22 @@ class Timer {
this->laps_t_.Add(elapse_ms);
return elapse_ms;
}
#ifdef LITE_WITH_OPENCL
  // Records the device-side lap time: ordinary kernels read the pure GPU
  // execution time from the cl::Event; io_copy has no device kernel, so the
  // host-measured copy duration is used instead.
  float CLStop(const std::string& op_type, float io_duration, cl::Event event) {
    float cl_kernel_elapse_ms = 0.0;
    if (op_type != "io_copy") {
      cl_kernel_elapse_ms = CLRuntime::Global()->GetCommandTime(event);
    } else {
      cl_kernel_elapse_ms = io_duration;
    }
    this->cl_laps_t_.Add(cl_kernel_elapse_ms);
    return cl_kernel_elapse_ms;
  }
const TimeList<float>& CLLapTimes() const { return cl_laps_t_; }
#endif
virtual void Start(KernelContext* ctx) { return Start(); }
virtual float Stop(KernelContext* ctx) { return Stop(); }
float AvgLapTimeMs() const { return laps_t_.Avg(); }
......@@ -94,6 +111,9 @@ class Timer {
protected:
TimeList<float> laps_t_;
#ifdef LITE_WITH_OPENCL
TimeList<float> cl_laps_t_;
#endif
private:
std::chrono::time_point<std::chrono::system_clock> t_start_, t_stop_;
......
......@@ -167,7 +167,7 @@ void RuntimeProgram::Run() {
#endif // LITE_WITH_PRECISION_PROFILE
}
#ifdef LITE_WITH_PROFILE
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0);
LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 1);
#endif
#ifdef LITE_WITH_PRECISION_PROFILE
LOG(INFO) << "\n" << precision_profiler_summary;
......@@ -297,6 +297,13 @@ void Instruction::Run() {
op_->InferShape();
kernel_->Launch();
has_run_ = true;
#ifdef LITE_WITH_PROFILE
if (first_epoch_for_profiler_) {
SetProfileRuntimeOpInfo(profiler_->GetOpCharacter(profile_id_));
first_epoch_for_profiler_ = false;
}
#endif
}
STL::ostream& operator<<(STL::ostream& os, const Instruction& other) {
......
......@@ -23,6 +23,9 @@
#include "lite/core/op_lite.h"
#include "lite/core/op_registry.h"
#include "lite/model_parser/cpp/program_desc.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
namespace paddle {
namespace lite {
......@@ -125,13 +128,22 @@ struct Instruction {
profiler_ = profiler;
if (op_->Type() != "feed" && op_->Type() != "fetch") {
profile::OpCharacter ch;
ch.op_lite = static_cast<void*>(const_cast<paddle::lite::OpLite*>(op()));
ch.target = kernel()->target();
ch.op_type = op_->Type();
ch.kernel_name = kernel()->name();
ch.kernel_attr = kernel()->name().substr(ch.op_type.size() + 1,
kernel()->name().size());
// append `ch.kernel_func_name` in StopTiming
profile_id_ = profiler->NewTimer(ch);
kernel_->SetProfiler(profiler_, profile_id_);
}
}
void SetProfileRuntimeOpInfo(paddle::lite::profile::OpCharacter* ch) {
auto* op_lite = static_cast<paddle::lite::OpLite*>(ch->op_lite);
op_lite->GetOpRuntimeInfo(ch);
}
#endif
private:
......@@ -144,6 +156,7 @@ struct Instruction {
#ifdef LITE_WITH_PROFILE
profile::Profiler* profiler_;
int profile_id_{-1};
bool first_epoch_for_profiler_{true};
#endif // LITE_WITH_PROFILE
};
......
......@@ -22,6 +22,8 @@
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#undef LITE_WITH_LOG
namespace paddle {
namespace lite {
namespace kernels {
......@@ -651,13 +653,13 @@ void ConvImageCompute::Conv2d1x1opt(bool is_turn) {
status = kernel.setArg(++arg_idx, default_w_blk_);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
CLRuntime::Global()->command_queue().finish();
......@@ -833,13 +835,13 @@ void ConvImageCompute::Conv2d3x3(bool is_turn) {
// VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << ","
// << global_work_size[1] << "," << global_work_size[2] << "}";
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
void ConvImageCompute::Conv2d3x3opt(bool is_turn) {
......@@ -954,13 +956,13 @@ void ConvImageCompute::Conv2d3x3opt(bool is_turn) {
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
CLRuntime::Global()->command_queue().finish();
......@@ -1084,13 +1086,13 @@ void ConvImageCompute::Conv2d5x5(bool is_turn) {
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
CLRuntime::Global()->command_queue().finish();
......@@ -1202,13 +1204,13 @@ void ConvImageCompute::Conv2d5x5opt(bool is_turn) {
// VLOG(4) << "out_image: " << out_image;
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
CLRuntime::Global()->command_queue().finish();
......@@ -1332,13 +1334,13 @@ void ConvImageCompute::Conv2d7x7(bool is_turn) {
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
......@@ -1447,13 +1449,13 @@ void ConvImageCompute::Conv2d7x7opt(bool is_turn) {
status = kernel.setArg(++arg_idx, output_height);
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
......@@ -1530,13 +1532,13 @@ void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) {
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
local_work_size_,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
......@@ -1627,13 +1629,13 @@ void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) {
status = kernel.setArg(++arg_idx, static_cast<const int>(output_dims[2]));
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
if (is_turn) {
......@@ -1762,13 +1764,13 @@ void ConvImageCompute::DepthwiseConv2d(bool is_turn) {
<< global_work_size_[1] << "," << global_work_size_[2] << "}";
#endif
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
......@@ -1828,3 +1830,4 @@ REGISTER_LITE_KERNEL(depthwise_conv2d,
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
#define LITE_WITH_LOG
......@@ -24,6 +24,10 @@
#include "lite/core/tensor.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
#ifdef LITE_WITH_PROFILE
#include "lite/core/profile/profiler.h"
#endif
#include "lite/backends/opencl/cl_utility.h"
namespace paddle {
namespace lite {
......@@ -41,6 +45,14 @@ class ConvImageCompute : public KernelLite<TARGET(kOpenCL),
void Run() override;
double Turn(int times = 5);
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_names_[0];
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
private:
void Conv2d1x1opt(bool is_turn = false);
void Conv2d3x3(bool is_turn = false);
......
......@@ -18,6 +18,8 @@
#include "lite/core/op_registry.h"
#include "lite/utils/replace_stl/stream.h"
#undef LITE_WITH_LOG
namespace paddle {
namespace lite {
namespace kernels {
......@@ -154,13 +156,13 @@ void ElementwiseAddImageCompute::Run() {
auto& context = ctx_->As<OpenCLContext>();
CHECK(context.cl_context() != nullptr);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size_,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
......@@ -196,3 +198,5 @@ REGISTER_LITE_KERNEL(elementwise_add,
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
#define LITE_WITH_LOG
......@@ -17,6 +17,7 @@
#include <string>
#include <vector>
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_utility.h"
#include "lite/core/kernel.h"
#include "lite/kernels/opencl/image_helper.h"
#include "lite/operators/op_params.h"
......@@ -42,6 +43,14 @@ class ElementwiseAddImageCompute
void Run() override;
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
std::string doc() const override {
return "ElementwiseAdd using cl::Image2D, kFP16";
}
......
......@@ -16,19 +16,46 @@
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#undef LITE_WITH_LOG
namespace paddle {
namespace lite {
namespace kernels {
namespace opencl {
// Current wall-clock time in microseconds (gettimeofday from <sys/time.h>).
inline double GetCurrentUS() {
  struct timeval time;
  gettimeofday(&time, NULL);
  return 1e+6 * time.tv_sec + time.tv_usec;
}
// Host to OpenCL memory.
void CopyFromHostSync(void* target, const void* source, size_t size) {
float CopyFromHostSync(void* target, const void* source, size_t size) {
#ifdef LITE_WITH_PROFILE
auto h2d_copy_start = GetCurrentUS();
#endif
TargetWrapperCL::MemcpySync(target, source, size, IoDirection::HtoD);
#ifdef LITE_WITH_PROFILE
auto h2d_duration = (GetCurrentUS() - h2d_copy_start) / 1000.0;
return h2d_duration;
#else
return 0.0;
#endif
}
// Device to Host memory.
void CopyToHostSync(void* target, const void* source, size_t size) {
float CopyToHostSync(void* target, const void* source, size_t size) {
#ifdef LITE_WITH_PROFILE
auto d2h_copy_start = GetCurrentUS();
#endif
CLRuntime::Global()->command_queue().finish();
TargetWrapperCL::MemcpySync(target, source, size, IoDirection::DtoH);
#ifdef LITE_WITH_PROFILE
auto d2h_duration = (GetCurrentUS() - d2h_copy_start) / 1000.0;
return d2h_duration;
#else
return 0.0;
#endif
}
/*
......@@ -37,6 +64,13 @@ void CopyToHostSync(void* target, const void* source, size_t size) {
class IoCopyHostToOpenCLCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = "HostToOpenCL";
ch->io_duration = h2d_duration_;
}
#endif
void Run() override {
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kHost) ||
......@@ -50,7 +84,7 @@ class IoCopyHostToOpenCLCompute
VLOG(2) << "param.y->dims():" << param.y->dims();
#endif
auto* data = param.y->mutable_data(TARGET(kOpenCL), mem_size);
CopyFromHostSync(data, param.x->raw_data(), mem_size);
h2d_duration_ = CopyFromHostSync(data, param.x->raw_data(), mem_size);
}
std::unique_ptr<type_infer_handler_t> GetTypeInferHandler() override {
......@@ -74,6 +108,8 @@ class IoCopyHostToOpenCLCompute
}
std::string doc() const override { return "Copy IO from HOST to OpenCL"; }
float h2d_duration_{0};
};
/*
......@@ -82,6 +118,13 @@ class IoCopyHostToOpenCLCompute
class IoCopykOpenCLToHostCompute
: public KernelLite<TARGET(kOpenCL), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = "OpenCLToHost";
ch->io_duration = d2h_duration_;
}
#endif
void Run() override {
auto& param = Param<operators::IoCopyParam>();
CHECK(param.x->target() == TARGET(kOpenCL));
......@@ -109,12 +152,13 @@ class IoCopykOpenCLToHostCompute
#ifdef LITE_WITH_LOG
VLOG(2) << "--- Find the sync event for the target cl tensor. ---";
#endif
CLRuntime::Global()->command_queue().finish();
CopyToHostSync(data, param.x->raw_data(), mem_size);
d2h_duration_ = CopyToHostSync(data, param.x->raw_data(), mem_size);
}
std::string doc() const override { return "Copy IO from OpenCL to HOST"; }
float d2h_duration_{0};
};
} // namespace opencl
......@@ -161,3 +205,5 @@ REGISTER_LITE_KERNEL(io_copy_once,
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
#define LITE_WITH_LOG
......@@ -16,6 +16,7 @@
#include <string>
#include "lite/api/paddle_place.h"
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_utility.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/target_wrapper.h"
......@@ -24,6 +25,8 @@
#include "lite/operators/op_params.h"
#include "lite/utils/cp_logging.h"
#undef LITE_WITH_LOG
namespace paddle {
namespace lite {
namespace kernels {
......@@ -50,6 +53,14 @@ class LayoutComputeBufferChwToImageDefault
time_stamp_);
}
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
void Run() override {
auto& param = Param<param_t>();
const cl::Buffer* x_data;
......@@ -128,13 +139,13 @@ class LayoutComputeBufferChwToImageDefault
static_cast<cl::size_type>(new_dims[3]),
static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
......@@ -168,6 +179,14 @@ class LayoutComputeImageDefaultToBufferChw
time_stamp_);
}
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
void Run() override {
auto& param = Param<param_t>();
const cl::Buffer* y_data;
......@@ -237,13 +256,13 @@ class LayoutComputeImageDefaultToBufferChw
static_cast<cl::size_type>(new_dims[3]),
static_cast<cl::size_type>(new_dims[0] * new_dims[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
......@@ -274,6 +293,14 @@ class LayoutComputeBufferChwToImage2DNw
time_stamp_);
}
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
void Run() override {
auto& param = Param<param_t>();
auto* x_data = param.x->data<float, cl::Buffer>();
......@@ -333,13 +360,13 @@ class LayoutComputeBufferChwToImage2DNw
static_cast<cl::size_type>(out_W), // w
static_cast<cl::size_type>(out_C * out_H)}; // ch
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
......@@ -394,3 +421,4 @@ REGISTER_LITE_KERNEL(
PRECISION(kAny),
DATALAYOUT(kNCHW))})
.Finalize();
#define LITE_WITH_LOG
......@@ -16,6 +16,7 @@
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_utility.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
......@@ -23,6 +24,8 @@
#include "lite/utils/replace_stl/stream.h"
#include "lite/utils/string.h"
#undef LITE_WITH_LOG
namespace paddle {
namespace lite {
namespace kernels {
......@@ -50,6 +53,14 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
kernel_func_name_, "image/pool_kernel.cl", build_options_, time_stamp_);
}
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
void Run() override {
const auto& param = *param_.get_mutable<param_t>();
const auto& in_dims = param.x->dims();
......@@ -150,13 +161,13 @@ class PoolComputeImage2D : public KernelLite<TARGET(kOpenCL),
status = kernel.setArg(++arg_idx, static_cast<const int>(paddings[0]));
CL_CHECK_FATAL(status);
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
......@@ -186,3 +197,4 @@ REGISTER_LITE_KERNEL(pool2d,
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
#define LITE_WITH_LOG
......@@ -14,6 +14,7 @@
#include "lite/backends/opencl/cl_half.h"
#include "lite/backends/opencl/cl_include.h"
#include "lite/backends/opencl/cl_utility.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/opencl/image_helper.h"
......@@ -21,6 +22,8 @@
#include "lite/utils/logging.h"
#include "lite/utils/replace_stl/stream.h"
#undef LITE_WITH_LOG
namespace paddle {
namespace lite {
namespace kernels {
......@@ -42,6 +45,14 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
time_stamp_);
}
#ifdef LITE_WITH_PROFILE
void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) {
ch->kernel_func_name = kernel_func_name_;
ch->cl_event =
event_; // `event_` defined in `kernel.h`, valid after kernel::Run
}
#endif
void Run() override {
auto& param = *param_.get_mutable<param_t>();
const Tensor* const x = param.x;
......@@ -154,13 +165,13 @@ class ReshapeComputeFloatImage : public KernelLite<TARGET(kOpenCL),
static_cast<size_t>(default_work_size.data()[1]),
static_cast<size_t>(default_work_size.data()[2])};
status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel(
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
nullptr);
status = EnqueueNDRangeKernel(context,
kernel,
cl::NullRange,
global_work_size,
cl::NullRange,
nullptr,
event_);
CL_CHECK_FATAL(status);
}
......@@ -246,3 +257,4 @@ REGISTER_LITE_KERNEL(flatten2,
PRECISION(kFP16),
DATALAYOUT(kImageDefault))})
.Finalize();
#define LITE_WITH_LOG
......@@ -36,6 +36,28 @@ class ConvOpLite : public OpLite {
bool CheckShape() const override;
bool InferShapeImpl() const override;
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter* ch) {
auto filter_dims = param_.filter->dims();
auto input_dims = param_.x->dims();
auto output_dims = param_.output->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->filter_shape = ch->DimToStr(filter_dims);
ch->remark = std::to_string(filter_dims[2]) + "x" +
std::to_string(filter_dims[3]) + "p" +
std::to_string((*param_.paddings)[0]) + "s" +
std::to_string(param_.strides[0]) + "g" +
std::to_string(param_.groups) + "d" +
std::to_string((*param_.dilations)[0]);
// MACs = 2.f * kw * kh * batchsize * out_c * out_h * out_w * in_c / group
// GMACs = 1e-9f * MACs
// GMACPS = 1e-6f * MACs / predict_ms
ch->macs = 2.f * filter_dims[2] * filter_dims[3] *
output_dims.production() * input_dims[1] / param_.groups;
}
#endif
// TODO(Superjomn) replace framework::OpDesc with a lite one.
bool AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) override {
AttachParam(&param_);
......
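As a worked example of the formula above (hypothetical shapes): a 3x3 convolution with input dims 1x64x56x56, filter dims 128x64x3x3, output dims 1x128x56x56, and groups = 1 gives macs = 2 * 3 * 3 * (1*128*56*56) * 64 / 1 ≈ 4.62e8, which Summary() prints as about 0.46 under GOPs (1e-9f * macs) and, at an average dispatch time of 1 ms, about 462 under GOPS (1e-6f * macs / avg_ms).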
......@@ -30,6 +30,16 @@ class IoCopyOp : public OpLite {
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.x->dims();
auto output_dims = param_.y->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = "type" + std::to_string(param_.process_type);
}
#endif
protected:
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
......@@ -30,6 +30,16 @@ class LayoutOp : public OpLite {
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.x->dims();
auto output_dims = param_.y->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = "type" + std::to_string(param_.process_type);
}
#endif
protected:
bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override;
......
......@@ -92,6 +92,25 @@ class PoolOpLite : public OpLite {
std::string DebugString() const override { return "pool2d"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.x->dims();
auto output_dims = param_.output->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
if (param_.global_pooling) {
ch->remark = "global" + param_.pooling_type;
} else {
ch->remark = param_.pooling_type + std::to_string(param_.ksize[0]) + "x" +
std::to_string(param_.ksize[1]) + "s" +
std::to_string(param_.strides[0]) + "p" +
std::to_string((*param_.paddings)[0]);
}
ch->remark += padding_algorithm_;
ch->macs = output_dims.production() * param_.ksize[0] * param_.ksize[1];
}
#endif
private:
mutable PoolParam param_;
std::string padding_algorithm_{""};
......
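Worked example for the pooling MACs above (hypothetical shapes): a 2x2 pooling producing output dims 1x64x28x28 records macs = (1*64*28*28) * 2 * 2 = 200704, i.e. about 2.0e-4 GOPs, counting one op per window element per output value.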
......@@ -37,6 +37,15 @@ class ReshapeOp : public OpLite {
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "reshape"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.x->dims();
auto output_dims = param_.output->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
}
#endif
protected:
mutable ReshapeParam param_;
};
......
......@@ -37,6 +37,17 @@ class SoftmaxOp : public OpLite {
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "softmax"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.x->dims();
auto output_dims = param_.output->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
ch->remark = "axis" + std::to_string(param_.axis);
ch->macs = 2.f * input_dims.production() * 3;
}
#endif
private:
mutable SoftmaxParam param_;
};
......
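For the softmax MACs above: an input of dims 1x1000 gives macs = 2 * 1000 * 3 = 6000. The factor 3 presumably budgets three passes over the data (exp, sum, normalize); the code itself does not spell this out.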
......@@ -37,6 +37,15 @@ class SqueezeOp : public OpLite {
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "squeeze"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.X->dims();
auto output_dims = param_.Out->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
}
#endif
protected:
mutable SqueezeParam param_;
};
......@@ -54,6 +63,15 @@ class Squeeze2Op : public SqueezeOp {
void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }
std::string DebugString() const override { return "squeeze2"; }
#ifdef LITE_WITH_PROFILE
void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {
auto input_dims = param_.X->dims();
auto output_dims = param_.Out->dims();
ch->input_shape = ch->DimToStr(input_dims);
ch->output_shape = ch->DimToStr(output_dims);
}
#endif
};
} // namespace operators
......