From e18ba102c1ccdea09f56a201386396cad2754c53 Mon Sep 17 00:00:00 2001 From: Yuan Shuai Date: Mon, 18 May 2020 01:35:31 -0500 Subject: [PATCH] [LITE][OPENCL] Enhance Profiler for OpenCL with in/out/filter shape, macs/macs_ps, real backend kernel etc. (#3641) * [LITE][OPENCL] Enhance Precision Profiler for OpenCL. test=develop --- lite/backends/opencl/cl_runtime.cc | 21 +++ lite/backends/opencl/cl_runtime.h | 6 + lite/backends/opencl/cl_utility.h | 13 ++ lite/core/kernel.h | 22 ++- lite/core/op_lite.h | 3 + lite/core/profile/profiler.cc | 158 ++++++++++++++---- lite/core/profile/profiler.h | 44 ++++- lite/core/profile/timer.h | 20 +++ lite/core/program.cc | 9 +- lite/core/program.h | 13 ++ lite/kernels/opencl/conv_image_compute.cc | 143 ++++++++-------- lite/kernels/opencl/conv_image_compute.h | 12 ++ .../opencl/elementwise_add_image_compute.cc | 18 +- .../opencl/elementwise_add_image_compute.h | 9 + lite/kernels/opencl/io_copy_buffer_compute.cc | 56 ++++++- lite/kernels/opencl/layout_image_compute.cc | 70 +++++--- lite/kernels/opencl/pool_image_compute.cc | 26 ++- lite/kernels/opencl/reshape_image_compute.cc | 26 ++- lite/operators/conv_op.h | 22 +++ lite/operators/io_copy_op.h | 10 ++ lite/operators/layout_op.h | 10 ++ lite/operators/pool_op.h | 19 +++ lite/operators/reshape_op.h | 9 + lite/operators/softmax_op.h | 11 ++ lite/operators/squeeze_op.h | 18 ++ 25 files changed, 616 insertions(+), 152 deletions(-) diff --git a/lite/backends/opencl/cl_runtime.cc b/lite/backends/opencl/cl_runtime.cc index 929ec7838e..d8232cda4c 100644 --- a/lite/backends/opencl/cl_runtime.cc +++ b/lite/backends/opencl/cl_runtime.cc @@ -370,5 +370,26 @@ void CLRuntime::GetAdrenoContextProperties( properties->push_back(0); } +double CLRuntime::GetCommandTime(const cl::Event& event) { + command_queue().finish(); + auto start_nanos = event.getProfilingInfo(); + auto stop_nanos = event.getProfilingInfo(); + return (stop_nanos - start_nanos) / 1000000.0; +} + +double CLRuntime::GetQueuedTime(const cl::Event& event) { + command_queue().finish(); + return (event.getProfilingInfo() - + event.getProfilingInfo()) / + 1000000.0; +} + +double CLRuntime::GetSubmitTime(const cl::Event& event) { + command_queue().finish(); + return (event.getProfilingInfo() - + event.getProfilingInfo()) / + 1000000.0; +} + } // namespace lite } // namespace paddle diff --git a/lite/backends/opencl/cl_runtime.h b/lite/backends/opencl/cl_runtime.h index 51e545cc34..3eeea7d63a 100644 --- a/lite/backends/opencl/cl_runtime.h +++ b/lite/backends/opencl/cl_runtime.h @@ -95,6 +95,12 @@ class CLRuntime { GpuType& GetGpuType(); + double GetCommandTime(const cl::Event& event); + + double GetQueuedTime(const cl::Event& event); + + double GetSubmitTime(const cl::Event& event); + private: CLRuntime() { Init(); } diff --git a/lite/backends/opencl/cl_utility.h b/lite/backends/opencl/cl_utility.h index 7ca12c1f80..dcea7aef2e 100644 --- a/lite/backends/opencl/cl_utility.h +++ b/lite/backends/opencl/cl_utility.h @@ -45,5 +45,18 @@ const char* opencl_error_to_str(cl_int error); #else #define CL_CHECK_FATAL(err_code__) #endif + +#ifdef LITE_WITH_PROFILE +#define EnqueueNDRangeKernel( \ + context, kernel, gws_offset, gws, lws, event_wait_list, event) \ + context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( \ + kernel, gws_offset, gws, lws, event_wait_list, &event) +#else +#define EnqueueNDRangeKernel( \ + context, kernel, gws_offset, gws, lws, event_wait_list, event) \ + context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( \ + kernel, gws_offset, gws, lws, event_wait_list, nullptr) +#endif + } // namespace lite } // namespace paddle diff --git a/lite/core/kernel.h b/lite/core/kernel.h index 777d6665e1..cbd9e8afff 100644 --- a/lite/core/kernel.h +++ b/lite/core/kernel.h @@ -62,6 +62,14 @@ class KernelBase { profiler_ = profiler; profile_id_ = id; } + + virtual void SetProfileRuntimeKernelInfo( + paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = std::string("NotImpl"); +#ifdef LITE_WITH_ARM + ch->cl_event = event_; +#endif + } #endif void Launch() { @@ -90,10 +98,13 @@ class KernelBase { profiler_->StopTiming(profile::Type::kCreate, profile_id_, ctx_.get()); profiler_->StartTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); Run(); -#ifdef LITE_WITH_OPENCL - CLRuntime::Global()->command_queue().finish(); -#endif + + if (is_first_epoch_for_profiler_) { + SetProfileRuntimeKernelInfo(profiler_->GetOpCharacter(profile_id_)); + is_first_epoch_for_profiler_ = false; + } profiler_->StopTiming(profile::Type::kDispatch, profile_id_, ctx_.get()); + #else Run(); #endif @@ -185,6 +196,11 @@ class KernelBase { #ifdef LITE_WITH_PROFILE profile::Profiler* profiler_{nullptr}; int profile_id_{-1}; + bool is_first_epoch_for_profiler_{true}; +#endif + +#ifdef LITE_WITH_OPENCL + cl::Event event_; #endif }; diff --git a/lite/core/op_lite.h b/lite/core/op_lite.h index 7fb74a3ca3..301065d5b6 100644 --- a/lite/core/op_lite.h +++ b/lite/core/op_lite.h @@ -73,6 +73,9 @@ class OpLite : public Registry { // Indicate whether the Op runs only once or not virtual bool run_once() const { return false; } std::string Type() { return op_type_; } +#ifdef LITE_WITH_PROFILE + virtual void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) {} +#endif // Link the external execution environ to internal context. bool Attach(const cpp::OpDesc &opdesc, lite::Scope *scope); diff --git a/lite/core/profile/profiler.cc b/lite/core/profile/profiler.cc index f067ed90b1..3c50585ef2 100644 --- a/lite/core/profile/profiler.cc +++ b/lite/core/profile/profiler.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "lite/core/profile/profiler.h" +#include #include #include #include @@ -64,22 +65,34 @@ int Profiler::NewTimer(const OpCharacter& ch) { return units_.size() - 1; } +OpCharacter* Profiler::GetOpCharacter(const size_t index) { + CHECK_LT(index, units_.size()) + << "The timer index in the profiler is out of range."; + return &units_[index].Character(); +} + void Profiler::StartTiming(Type type, const int index, KernelContext* ctx) { CHECK_LT(index, units_.size()) << "The timer index in the profiler is out of range."; units_[index].Timer(type)->Start(ctx); } -float Profiler::StopTiming(Type type, const int index, KernelContext* ctx) { +void Profiler::StopTiming(Type type, const int index, KernelContext* ctx) { CHECK_LT(index, units_.size()) << "The timer index in the profiler is out of range."; - return units_[index].Timer(type)->Stop(ctx); + units_[index].Timer(type)->Stop(ctx); +#ifdef LITE_WITH_OPENCL + units_[index].Timer(type)->CLStop(units_[index].character.op_type, + units_[index].character.io_duration, + units_[index].character.cl_event); +#endif } std::string Profiler::Summary(Type type, bool concise, size_t w) { using std::setw; using std::left; using std::fixed; + using std::setprecision; STL::stringstream ss; std::string title; // Title. @@ -94,14 +107,36 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { << " Profiler Summary: " << name_ << ", Exclude " << w << " warm-ups =====" << std::endl; } - ss << setw(25) << left << "Operator Type" - << " " << setw(40) << left << "Kernel Name" - << " " << setw(12) << left << "Remark" - << " " << setw(12) << left << "Avg (ms)" - << " " << setw(12) << left << "Min (ms)" - << " " << setw(12) << left << "Max (ms)" - << " " << setw(12) << left << "Last (ms)" - << " " << setw(12) << left << "Percent (%)" << std::endl; + ss << setw(20) << left << "OperatorType" + << " " << setw(30) << left << "KerneAttr"; + if (!concise) { + ss << " " << setw(24) << left << "KernelName"; + } + ss << " " << setw(16) << left << "Remark"; + if (!concise) { + ss << " " << setw(15) << left << "InDim" + << " " << setw(15) << left << "FilterDim" + << " " << setw(15) << left << "OutDim"; + } + ss << " " << setw(7) << left << "Avg(ms)" + << " " << setw(7) << left << "Min(ms)" + << " " << setw(7) << left << "Max(ms)"; + if (!concise) { + ss << " " << setw(7) << left << "Last(ms)"; + } + ss << " " << setw(7) << left << "Avg(%)"; + if (!concise) { + ss << " " << setw(7) << left << "GOPs" + << " " << setw(7) << left << "GOPS"; + } +#ifdef LITE_WITH_OPENCL + ss << " " << setw(9) << left << "clAvg(ms)" + << " " << setw(9) << left << "clMin(ms)" + << " " << setw(9) << left << "clMax(ms)" + << " " << setw(9) << left << "clAvg(%)"; +#endif + ss << std::endl; + // Profile information. if (concise) { std::map summary(op_comp); @@ -111,33 +146,67 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { ch->second.avg += unit.Timer(type)->LapTimes().Avg(w); ch->second.min += unit.Timer(type)->LapTimes().Min(w); ch->second.max += unit.Timer(type)->LapTimes().Max(w); +#ifdef LITE_WITH_OPENCL + ch->second.cl_avg += unit.Timer(type)->CLLapTimes().Avg(w); + ch->second.cl_min += unit.Timer(type)->CLLapTimes().Min(w); + ch->second.cl_max += unit.Timer(type)->CLLapTimes().Max(w); +#endif } else { TimeInfo info; info.avg = unit.Timer(type)->LapTimes().Avg(w); info.min = unit.Timer(type)->LapTimes().Min(w); info.max = unit.Timer(type)->LapTimes().Max(w); +#ifdef LITE_WITH_OPENCL + info.cl_avg = unit.Timer(type)->CLLapTimes().Avg(w); + info.cl_min = unit.Timer(type)->CLLapTimes().Min(w); + info.cl_max = unit.Timer(type)->CLLapTimes().Max(w); +#endif summary.insert({unit.Character(), info}); } } + // compute total time float total = 0.0; for (const auto& item : summary) { total += item.second.avg; } +#ifdef LITE_WITH_OPENCL + float cl_total = 0.0; + for (const auto& item : summary) { + cl_total += item.second.cl_avg; + } +#endif + for (const auto& item : summary) { float percent = 0; if (total > 0) { percent = 100 * (item.second.avg / total); } // clang-format off - ss << setw(25) << left << fixed << item.first.op_type \ - << " " << setw(40) << left << fixed << item.first.kernel_name \ - << " " << setw(12) << left << fixed << item.first.remark \ - << " " << setw(12) << left << fixed << item.second.avg \ - << " " << setw(12) << left << fixed << item.second.min \ - << " " << setw(12) << left << fixed << item.second.max \ - << " " << setw(12) << left << fixed << percent << "%" \ - << " " << std::endl; + ss << setw(20) << left << fixed << item.first.op_type + << " " << setw(30) << left << fixed << item.first.kernel_attr + << " " << setw(16) << left << fixed << item.first.remark + << " " << setw(7) << left << fixed << setprecision(3) + << item.second.avg + << " " << setw(7) << left << fixed << setprecision(3) + << item.second.min + << " " << setw(7) << left << fixed << setprecision(3) + << item.second.max + << " " << setprecision(2) << percent << "% "; +#ifdef LITE_WITH_OPENCL + float cl_percent = 0; + if (cl_total > 0) { + cl_percent = 100 * (item.second.cl_avg / cl_total); + } + ss << " " << setw(9) << left << fixed << setprecision(3) + << item.second.cl_avg + << " " << setw(9) << left << fixed << setprecision(3) + << item.second.cl_min + << " " << setw(9) << left << fixed << setprecision(3) + << item.second.cl_max + << " " << left << fixed <LapTimes(); total += times.Avg(w); } +#ifdef LITE_WITH_OPENCL + float cl_total = 0.0; + for (auto& unit : units_) { + const auto& cl_times = unit.Timer(type)->CLLapTimes(); + cl_total += cl_times.Avg(w); + } +#endif for (auto& unit : units_) { const auto& times = unit.Timer(type)->LapTimes(); float run = times.Avg(w); @@ -153,17 +229,43 @@ std::string Profiler::Summary(Type type, bool concise, size_t w) { if (total > 0) { percent = 100 * (run / total); } + +#ifdef LITE_WITH_OPENCL + const auto& cl_times = unit.Timer(type)->CLLapTimes(); + float cl_run = cl_times.Avg(w); + float cl_percent = 0; + if (cl_total > 0) { + cl_percent = 100 * (cl_run / cl_total); + } +#endif + // clang-format off - ss << setw(25) << left << fixed << unit.Character().op_type \ - << " " << setw(40) << left << fixed << unit.Character().kernel_name \ - << " " << setw(12) << left << fixed << unit.Character().remark \ - << " " << setw(12) << left << fixed << times.Avg(w) \ - << " " << setw(12) << left << fixed << times.Min(w) \ - << " " << setw(12) << left << fixed << times.Max(w) \ - << " " << setw(12) << left << fixed << times.Last(w) \ - << " " << setw(12) << left << fixed << percent << "%" \ - << std::endl; - // clang-format on + ss << setw(20) << left << fixed << unit.Character().op_type + << " " << setw(30) << left << fixed << unit.Character().kernel_attr + << " " << setw(24) << left << fixed + << unit.Character().kernel_func_name + << " " << setw(16) << left << fixed << unit.Character().remark + << " " << setw(15) << left << fixed << unit.Character().input_shape + << " " << setw(15) << left << fixed << unit.Character().filter_shape + << " " << setw(15) << left << fixed << unit.Character().output_shape + << " " << setw(7) << left << fixed << setprecision(3) << times.Avg(w) + << " " << setw(7) << left << fixed << setprecision(3) << times.Min(w) + << " " << setw(7) << left << fixed << setprecision(3) << times.Max(w) + << " " << setw(7) << left << fixed << setprecision(3) << times.Last(w) + << " " << left << setprecision(2) << percent << "% " + << " " << setw(7) << left << fixed << setprecision(2) + << 1e-9f * unit.Character().macs + << " " << setw(7) << left << fixed << setprecision(2) + << 1e-6f * unit.Character().macs / times.Avg(w); +// clang-format on +#ifdef LITE_WITH_OPENCL + ss << " " << setw(9) << left << fixed << setprecision(3) + << cl_times.Avg(w) << " " << setw(9) << left << fixed + << setprecision(3) << cl_times.Min(w) << " " << setw(9) << left + << fixed << setprecision(3) << cl_times.Max(w) << " " << left + << setprecision(2) << cl_percent << "% "; +#endif + ss << std::endl; } } return ss.str(); diff --git a/lite/core/profile/profiler.h b/lite/core/profile/profiler.h index 3933e5ba01..ff77ef39c3 100644 --- a/lite/core/profile/profiler.h +++ b/lite/core/profile/profiler.h @@ -18,6 +18,7 @@ #include #include #include "lite/core/profile/timer.h" +#include "lite/core/tensor.h" namespace paddle { namespace lite { @@ -35,25 +36,61 @@ struct TimeInfo { float avg; float min; float max; +#ifdef LITE_WITH_OPENCL + float cl_avg; + float cl_min; + float cl_max; +#endif }; struct OpCharacter { TargetType target; + void* op_lite{nullptr}; std::string op_type{std::string("N/A")}; std::string kernel_name{std::string("N/A")}; + std::string kernel_attr{std::string("N/A")}; + std::string kernel_func_name{std::string("N/A")}; std::string remark{std::string("N/A")}; + + std::string input_shape{"N/A"}; + std::string output_shape{"N/A"}; + std::string filter_shape{"N/A"}; + + float macs{0}; + float macs_ps{0}; + + float io_duration{0}; + +#ifdef LITE_WITH_OPENCL + cl::Event cl_event{}; +#else + void* cl_event{nullptr}; +#endif + + std::string DimToStr(const paddle::lite::DDimLite& dim) { + if (!dim.size()) return "NotImpl"; + std::string dim_str{""}; + for (size_t i = 0; i < dim.size(); ++i) { + dim_str += std::to_string(dim[i]); + if (i != dim.size() - 1) { + dim_str += "x"; + } + } + return dim_str; + } }; class StatisUnit final { public: explicit StatisUnit(const OpCharacter& ch); lite::profile::Timer* Timer(Type type); - const OpCharacter& Character() const { return character; } + OpCharacter& Character() { return character; } + + OpCharacter character; protected: std::unique_ptr create_t; std::unique_ptr dispatch_t; - OpCharacter character; }; class Profiler final { @@ -62,8 +99,9 @@ class Profiler final { explicit Profiler(const std::string& name) : name_(name) {} int NewTimer(const OpCharacter& ch); void StartTiming(Type type, const int index, KernelContext* ctx); - float StopTiming(Type type, const int index, KernelContext* ctx); + void StopTiming(Type type, const int index, KernelContext* ctx); std::string Summary(Type type, bool concise = true, size_t warm_up = 10); + OpCharacter* GetOpCharacter(const size_t index); private: std::string name_{std::string("N/A")}; diff --git a/lite/core/profile/timer.h b/lite/core/profile/timer.h index e9bb16bd27..ddb8a25899 100644 --- a/lite/core/profile/timer.h +++ b/lite/core/profile/timer.h @@ -15,6 +15,7 @@ #pragma once #include #include // NOLINT +#include #include #ifdef LITE_WITH_CUDA #include "lite/backends/cuda/cuda_utils.h" @@ -87,6 +88,22 @@ class Timer { this->laps_t_.Add(elapse_ms); return elapse_ms; } + +#ifdef LITE_WITH_OPENCL + float CLStop(const std::string& op_type, float io_duration, cl::Event event) { + float cl_kernel_elapse_ms = 0.0; + if (op_type != "io_copy") { + cl_kernel_elapse_ms = + CLRuntime::Global()->CLRuntime::GetCommandTime(event); + } else { + cl_kernel_elapse_ms = io_duration; + } + this->cl_laps_t_.Add(cl_kernel_elapse_ms); + return cl_kernel_elapse_ms; + } + const TimeList& CLLapTimes() const { return cl_laps_t_; } +#endif + virtual void Start(KernelContext* ctx) { return Start(); } virtual float Stop(KernelContext* ctx) { return Stop(); } float AvgLapTimeMs() const { return laps_t_.Avg(); } @@ -94,6 +111,9 @@ class Timer { protected: TimeList laps_t_; +#ifdef LITE_WITH_OPENCL + TimeList cl_laps_t_; +#endif private: std::chrono::time_point t_start_, t_stop_; diff --git a/lite/core/program.cc b/lite/core/program.cc index e712ed9e0b..9c864ebea5 100644 --- a/lite/core/program.cc +++ b/lite/core/program.cc @@ -167,7 +167,7 @@ void RuntimeProgram::Run() { #endif // LITE_WITH_PRECISION_PROFILE } #ifdef LITE_WITH_PROFILE - LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 0); + LOG(INFO) << "\n" << profiler_.Summary(profile::Type::kDispatch, false, 1); #endif #ifdef LITE_WITH_PRECISION_PROFILE LOG(INFO) << "\n" << precision_profiler_summary; @@ -297,6 +297,13 @@ void Instruction::Run() { op_->InferShape(); kernel_->Launch(); has_run_ = true; + +#ifdef LITE_WITH_PROFILE + if (first_epoch_for_profiler_) { + SetProfileRuntimeOpInfo(profiler_->GetOpCharacter(profile_id_)); + first_epoch_for_profiler_ = false; + } +#endif } STL::ostream& operator<<(STL::ostream& os, const Instruction& other) { diff --git a/lite/core/program.h b/lite/core/program.h index 9d5fef7c03..5e25a5fcda 100644 --- a/lite/core/program.h +++ b/lite/core/program.h @@ -23,6 +23,9 @@ #include "lite/core/op_lite.h" #include "lite/core/op_registry.h" #include "lite/model_parser/cpp/program_desc.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/profiler.h" +#endif namespace paddle { namespace lite { @@ -125,13 +128,22 @@ struct Instruction { profiler_ = profiler; if (op_->Type() != "feed" && op_->Type() != "fetch") { profile::OpCharacter ch; + ch.op_lite = static_cast(const_cast(op())); ch.target = kernel()->target(); ch.op_type = op_->Type(); ch.kernel_name = kernel()->name(); + ch.kernel_attr = kernel()->name().substr(ch.op_type.size() + 1, + kernel()->name().size()); + // append `ch.kernel_func_name` in StopTiming profile_id_ = profiler->NewTimer(ch); kernel_->SetProfiler(profiler_, profile_id_); } } + + void SetProfileRuntimeOpInfo(paddle::lite::profile::OpCharacter* ch) { + auto* op_lite = static_cast(ch->op_lite); + op_lite->GetOpRuntimeInfo(ch); + } #endif private: @@ -144,6 +156,7 @@ struct Instruction { #ifdef LITE_WITH_PROFILE profile::Profiler* profiler_; int profile_id_{-1}; + bool first_epoch_for_profiler_{true}; #endif // LITE_WITH_PROFILE }; diff --git a/lite/kernels/opencl/conv_image_compute.cc b/lite/kernels/opencl/conv_image_compute.cc index 362be682ef..fed8171cc2 100644 --- a/lite/kernels/opencl/conv_image_compute.cc +++ b/lite/kernels/opencl/conv_image_compute.cc @@ -22,6 +22,8 @@ #include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" +#undef LITE_WITH_LOG + namespace paddle { namespace lite { namespace kernels { @@ -651,13 +653,13 @@ void ConvImageCompute::Conv2d1x1opt(bool is_turn) { status = kernel.setArg(++arg_idx, default_w_blk_); CL_CHECK_FATAL(status); - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); CL_CHECK_FATAL(status); if (is_turn) { CLRuntime::Global()->command_queue().finish(); @@ -833,13 +835,13 @@ void ConvImageCompute::Conv2d3x3(bool is_turn) { // VLOG(4) << "global_work_size[3D]: {" << global_work_size[0] << "," // << global_work_size[1] << "," << global_work_size[2] << "}"; - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); CL_CHECK_FATAL(status); } void ConvImageCompute::Conv2d3x3opt(bool is_turn) { @@ -954,13 +956,13 @@ void ConvImageCompute::Conv2d3x3opt(bool is_turn) { << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); CL_CHECK_FATAL(status); if (is_turn) { CLRuntime::Global()->command_queue().finish(); @@ -1084,13 +1086,13 @@ void ConvImageCompute::Conv2d5x5(bool is_turn) { << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); CL_CHECK_FATAL(status); if (is_turn) { CLRuntime::Global()->command_queue().finish(); @@ -1202,13 +1204,13 @@ void ConvImageCompute::Conv2d5x5opt(bool is_turn) { // VLOG(4) << "out_image: " << out_image; - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); CL_CHECK_FATAL(status); if (is_turn) { CLRuntime::Global()->command_queue().finish(); @@ -1332,13 +1334,13 @@ void ConvImageCompute::Conv2d7x7(bool is_turn) { << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); CL_CHECK_FATAL(status); if (is_turn) { @@ -1447,13 +1449,13 @@ void ConvImageCompute::Conv2d7x7opt(bool is_turn) { status = kernel.setArg(++arg_idx, output_height); CL_CHECK_FATAL(status); - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); CL_CHECK_FATAL(status); if (is_turn) { @@ -1530,13 +1532,13 @@ void ConvImageCompute::DepthwiseConv2d3x3s1(bool is_turn) { status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); CL_CHECK_FATAL(status); - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size_, - local_work_size_, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size_, + local_work_size_, + nullptr, + event_); CL_CHECK_FATAL(status); if (is_turn) { @@ -1627,13 +1629,13 @@ void ConvImageCompute::DepthwiseConv2d3x3(bool is_turn) { status = kernel.setArg(++arg_idx, static_cast(output_dims[2])); CL_CHECK_FATAL(status); - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); CL_CHECK_FATAL(status); if (is_turn) { @@ -1762,13 +1764,13 @@ void ConvImageCompute::DepthwiseConv2d(bool is_turn) { << global_work_size_[1] << "," << global_work_size_[2] << "}"; #endif - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); CL_CHECK_FATAL(status); } @@ -1828,3 +1830,4 @@ REGISTER_LITE_KERNEL(depthwise_conv2d, PRECISION(kFP16), DATALAYOUT(kImageDefault))}) .Finalize(); +#define LITE_WITH_LOG diff --git a/lite/kernels/opencl/conv_image_compute.h b/lite/kernels/opencl/conv_image_compute.h index 3b5faa0c42..be045bb0be 100644 --- a/lite/kernels/opencl/conv_image_compute.h +++ b/lite/kernels/opencl/conv_image_compute.h @@ -24,6 +24,10 @@ #include "lite/core/tensor.h" #include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" +#ifdef LITE_WITH_PROFILE +#include "lite/core/profile/profiler.h" +#endif +#include "lite/backends/opencl/cl_utility.h" namespace paddle { namespace lite { @@ -41,6 +45,14 @@ class ConvImageCompute : public KernelLitekernel_func_name = kernel_func_names_[0]; + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + private: void Conv2d1x1opt(bool is_turn = false); void Conv2d3x3(bool is_turn = false); diff --git a/lite/kernels/opencl/elementwise_add_image_compute.cc b/lite/kernels/opencl/elementwise_add_image_compute.cc index 4af02e8b73..dc4f013abb 100644 --- a/lite/kernels/opencl/elementwise_add_image_compute.cc +++ b/lite/kernels/opencl/elementwise_add_image_compute.cc @@ -18,6 +18,8 @@ #include "lite/core/op_registry.h" #include "lite/utils/replace_stl/stream.h" +#undef LITE_WITH_LOG + namespace paddle { namespace lite { namespace kernels { @@ -154,13 +156,13 @@ void ElementwiseAddImageCompute::Run() { auto& context = ctx_->As(); CHECK(context.cl_context() != nullptr); - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size_, - cl::NullRange, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size_, + cl::NullRange, + nullptr, + event_); CL_CHECK_FATAL(status); } @@ -196,3 +198,5 @@ REGISTER_LITE_KERNEL(elementwise_add, PRECISION(kFP16), DATALAYOUT(kImageDefault))}) .Finalize(); + +#define LITE_WITH_LOG diff --git a/lite/kernels/opencl/elementwise_add_image_compute.h b/lite/kernels/opencl/elementwise_add_image_compute.h index fae21f3d71..83972d3286 100644 --- a/lite/kernels/opencl/elementwise_add_image_compute.h +++ b/lite/kernels/opencl/elementwise_add_image_compute.h @@ -17,6 +17,7 @@ #include #include #include "lite/backends/opencl/cl_half.h" +#include "lite/backends/opencl/cl_utility.h" #include "lite/core/kernel.h" #include "lite/kernels/opencl/image_helper.h" #include "lite/operators/op_params.h" @@ -42,6 +43,14 @@ class ElementwiseAddImageCompute void Run() override; +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + std::string doc() const override { return "ElementwiseAdd using cl::Image2D, kFP16"; } diff --git a/lite/kernels/opencl/io_copy_buffer_compute.cc b/lite/kernels/opencl/io_copy_buffer_compute.cc index 31fc563c95..39d9e75803 100644 --- a/lite/kernels/opencl/io_copy_buffer_compute.cc +++ b/lite/kernels/opencl/io_copy_buffer_compute.cc @@ -16,19 +16,46 @@ #include "lite/core/kernel.h" #include "lite/core/op_registry.h" +#undef LITE_WITH_LOG + namespace paddle { namespace lite { namespace kernels { namespace opencl { +inline double GetCurrentUS() { + struct timeval time; + gettimeofday(&time, NULL); + return 1e+6 * time.tv_sec + time.tv_usec; +} + // Host to OpenCL memory. -void CopyFromHostSync(void* target, const void* source, size_t size) { +float CopyFromHostSync(void* target, const void* source, size_t size) { +#ifdef LITE_WITH_PROFILE + auto h2d_copy_start = GetCurrentUS(); +#endif TargetWrapperCL::MemcpySync(target, source, size, IoDirection::HtoD); +#ifdef LITE_WITH_PROFILE + auto h2d_duration = (GetCurrentUS() - h2d_copy_start) / 1000.0; + return h2d_duration; +#else + return 0.0; +#endif } // Device to Host memory. -void CopyToHostSync(void* target, const void* source, size_t size) { +float CopyToHostSync(void* target, const void* source, size_t size) { +#ifdef LITE_WITH_PROFILE + auto d2h_copy_start = GetCurrentUS(); +#endif + CLRuntime::Global()->command_queue().finish(); TargetWrapperCL::MemcpySync(target, source, size, IoDirection::DtoH); +#ifdef LITE_WITH_PROFILE + auto d2h_duration = (GetCurrentUS() - d2h_copy_start) / 1000.0; + return d2h_duration; +#else + return 0.0; +#endif } /* @@ -37,6 +64,13 @@ void CopyToHostSync(void* target, const void* source, size_t size) { class IoCopyHostToOpenCLCompute : public KernelLite { public: +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = "HostToOpenCL"; + ch->io_duration = h2d_duration_; + } +#endif + void Run() override { auto& param = Param(); CHECK(param.x->target() == TARGET(kHost) || @@ -50,7 +84,7 @@ class IoCopyHostToOpenCLCompute VLOG(2) << "param.y->dims():" << param.y->dims(); #endif auto* data = param.y->mutable_data(TARGET(kOpenCL), mem_size); - CopyFromHostSync(data, param.x->raw_data(), mem_size); + h2d_duration_ = CopyFromHostSync(data, param.x->raw_data(), mem_size); } std::unique_ptr GetTypeInferHandler() override { @@ -74,6 +108,8 @@ class IoCopyHostToOpenCLCompute } std::string doc() const override { return "Copy IO from HOST to OpenCL"; } + + float h2d_duration_{0}; }; /* @@ -82,6 +118,13 @@ class IoCopyHostToOpenCLCompute class IoCopykOpenCLToHostCompute : public KernelLite { public: +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = "OpenCLToHost"; + ch->io_duration = d2h_duration_; + } +#endif + void Run() override { auto& param = Param(); CHECK(param.x->target() == TARGET(kOpenCL)); @@ -109,12 +152,13 @@ class IoCopykOpenCLToHostCompute #ifdef LITE_WITH_LOG VLOG(2) << "--- Find the sync event for the target cl tensor. ---"; #endif - CLRuntime::Global()->command_queue().finish(); - CopyToHostSync(data, param.x->raw_data(), mem_size); + d2h_duration_ = CopyToHostSync(data, param.x->raw_data(), mem_size); } std::string doc() const override { return "Copy IO from OpenCL to HOST"; } + + float d2h_duration_{0}; }; } // namespace opencl @@ -161,3 +205,5 @@ REGISTER_LITE_KERNEL(io_copy_once, .BindInput("Input", {LiteType::GetTensorTy(TARGET(kOpenCL))}) .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))}) .Finalize(); + +#define LITE_WITH_LOG diff --git a/lite/kernels/opencl/layout_image_compute.cc b/lite/kernels/opencl/layout_image_compute.cc index 3c7a6ae42f..d0163442a9 100644 --- a/lite/kernels/opencl/layout_image_compute.cc +++ b/lite/kernels/opencl/layout_image_compute.cc @@ -16,6 +16,7 @@ #include #include "lite/api/paddle_place.h" #include "lite/backends/opencl/cl_half.h" +#include "lite/backends/opencl/cl_utility.h" #include "lite/core/kernel.h" #include "lite/core/op_registry.h" #include "lite/core/target_wrapper.h" @@ -24,6 +25,8 @@ #include "lite/operators/op_params.h" #include "lite/utils/cp_logging.h" +#undef LITE_WITH_LOG + namespace paddle { namespace lite { namespace kernels { @@ -50,6 +53,14 @@ class LayoutComputeBufferChwToImageDefault time_stamp_); } +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + void Run() override { auto& param = Param(); const cl::Buffer* x_data; @@ -128,13 +139,13 @@ class LayoutComputeBufferChwToImageDefault static_cast(new_dims[3]), static_cast(new_dims[0] * new_dims[2])}; - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); CL_CHECK_FATAL(status); } @@ -168,6 +179,14 @@ class LayoutComputeImageDefaultToBufferChw time_stamp_); } +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + void Run() override { auto& param = Param(); const cl::Buffer* y_data; @@ -237,13 +256,13 @@ class LayoutComputeImageDefaultToBufferChw static_cast(new_dims[3]), static_cast(new_dims[0] * new_dims[2])}; - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); CL_CHECK_FATAL(status); } @@ -274,6 +293,14 @@ class LayoutComputeBufferChwToImage2DNw time_stamp_); } +#ifdef LITE_WITH_PROFILE + void SetProfileRuntimeKernelInfo(paddle::lite::profile::OpCharacter* ch) { + ch->kernel_func_name = kernel_func_name_; + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + void Run() override { auto& param = Param(); auto* x_data = param.x->data(); @@ -333,13 +360,13 @@ class LayoutComputeBufferChwToImage2DNw static_cast(out_W), // w static_cast(out_C * out_H)}; // ch - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); CL_CHECK_FATAL(status); } @@ -394,3 +421,4 @@ REGISTER_LITE_KERNEL( PRECISION(kAny), DATALAYOUT(kNCHW))}) .Finalize(); +#define LITE_WITH_LOG diff --git a/lite/kernels/opencl/pool_image_compute.cc b/lite/kernels/opencl/pool_image_compute.cc index ff15a349cc..db27bf2aed 100644 --- a/lite/kernels/opencl/pool_image_compute.cc +++ b/lite/kernels/opencl/pool_image_compute.cc @@ -16,6 +16,7 @@ #include "lite/backends/opencl/cl_half.h" #include "lite/backends/opencl/cl_include.h" +#include "lite/backends/opencl/cl_utility.h" #include "lite/core/kernel.h" #include "lite/core/op_registry.h" #include "lite/kernels/opencl/image_helper.h" @@ -23,6 +24,8 @@ #include "lite/utils/replace_stl/stream.h" #include "lite/utils/string.h" +#undef LITE_WITH_LOG + namespace paddle { namespace lite { namespace kernels { @@ -50,6 +53,14 @@ class PoolComputeImage2D : public KernelLitekernel_func_name = kernel_func_name_; + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + void Run() override { const auto& param = *param_.get_mutable(); const auto& in_dims = param.x->dims(); @@ -150,13 +161,13 @@ class PoolComputeImage2D : public KernelLite(paddings[0])); CL_CHECK_FATAL(status); - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); CL_CHECK_FATAL(status); } @@ -186,3 +197,4 @@ REGISTER_LITE_KERNEL(pool2d, PRECISION(kFP16), DATALAYOUT(kImageDefault))}) .Finalize(); +#define LITE_WITH_LOG diff --git a/lite/kernels/opencl/reshape_image_compute.cc b/lite/kernels/opencl/reshape_image_compute.cc index b68ba07653..bcaa46ba3d 100644 --- a/lite/kernels/opencl/reshape_image_compute.cc +++ b/lite/kernels/opencl/reshape_image_compute.cc @@ -14,6 +14,7 @@ #include "lite/backends/opencl/cl_half.h" #include "lite/backends/opencl/cl_include.h" +#include "lite/backends/opencl/cl_utility.h" #include "lite/core/kernel.h" #include "lite/core/op_registry.h" #include "lite/kernels/opencl/image_helper.h" @@ -21,6 +22,8 @@ #include "lite/utils/logging.h" #include "lite/utils/replace_stl/stream.h" +#undef LITE_WITH_LOG + namespace paddle { namespace lite { namespace kernels { @@ -42,6 +45,14 @@ class ReshapeComputeFloatImage : public KernelLitekernel_func_name = kernel_func_name_; + ch->cl_event = + event_; // `event_` defined in `kernel.h`, valid after kernel::Run + } +#endif + void Run() override { auto& param = *param_.get_mutable(); const Tensor* const x = param.x; @@ -154,13 +165,13 @@ class ReshapeComputeFloatImage : public KernelLite(default_work_size.data()[1]), static_cast(default_work_size.data()[2])}; - status = context.cl_context()->GetCommandQueue().enqueueNDRangeKernel( - kernel, - cl::NullRange, - global_work_size, - cl::NullRange, - nullptr, - nullptr); + status = EnqueueNDRangeKernel(context, + kernel, + cl::NullRange, + global_work_size, + cl::NullRange, + nullptr, + event_); CL_CHECK_FATAL(status); } @@ -246,3 +257,4 @@ REGISTER_LITE_KERNEL(flatten2, PRECISION(kFP16), DATALAYOUT(kImageDefault))}) .Finalize(); +#define LITE_WITH_LOG diff --git a/lite/operators/conv_op.h b/lite/operators/conv_op.h index 49452fc44f..993b0d6e71 100644 --- a/lite/operators/conv_op.h +++ b/lite/operators/conv_op.h @@ -36,6 +36,28 @@ class ConvOpLite : public OpLite { bool CheckShape() const override; bool InferShapeImpl() const override; +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter* ch) { + auto filter_dims = param_.filter->dims(); + auto input_dims = param_.x->dims(); + auto output_dims = param_.output->dims(); + ch->input_shape = ch->DimToStr(input_dims); + ch->output_shape = ch->DimToStr(output_dims); + ch->filter_shape = ch->DimToStr(filter_dims); + ch->remark = std::to_string(filter_dims[2]) + "x" + + std::to_string(filter_dims[3]) + "p" + + std::to_string((*param_.paddings)[0]) + "s" + + std::to_string(param_.strides[0]) + "g" + + std::to_string(param_.groups) + "d" + + std::to_string((*param_.dilations)[0]); + // MACs = 2.f * kw * kh * batchsize * out_c * out_h * out_w * in_c / group + // GMACs = 1e-9f * MACs + // GMACPS = 1e-6f * MACs / predict_ms + ch->macs = 2.f * filter_dims[2] * filter_dims[3] * + output_dims.production() * input_dims[1] / param_.groups; + } +#endif + // TODO(Superjomn) replace framework::OpDesc with a lite one. bool AttachImpl(const cpp::OpDesc& op_desc, lite::Scope* scope) override { AttachParam(¶m_); diff --git a/lite/operators/io_copy_op.h b/lite/operators/io_copy_op.h index d6922b667d..d734fbd4a7 100644 --- a/lite/operators/io_copy_op.h +++ b/lite/operators/io_copy_op.h @@ -30,6 +30,16 @@ class IoCopyOp : public OpLite { void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) { + auto input_dims = param_.x->dims(); + auto output_dims = param_.y->dims(); + ch->input_shape = ch->DimToStr(input_dims); + ch->output_shape = ch->DimToStr(output_dims); + ch->remark = "type" + std::to_string(param_.process_type); + } +#endif + protected: bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; diff --git a/lite/operators/layout_op.h b/lite/operators/layout_op.h index f51768863b..f6bdef82aa 100644 --- a/lite/operators/layout_op.h +++ b/lite/operators/layout_op.h @@ -30,6 +30,16 @@ class LayoutOp : public OpLite { void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) { + auto input_dims = param_.x->dims(); + auto output_dims = param_.y->dims(); + ch->input_shape = ch->DimToStr(input_dims); + ch->output_shape = ch->DimToStr(output_dims); + ch->remark = "type" + std::to_string(param_.process_type); + } +#endif + protected: bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) override; diff --git a/lite/operators/pool_op.h b/lite/operators/pool_op.h index 9c29f9597c..92f00a4272 100644 --- a/lite/operators/pool_op.h +++ b/lite/operators/pool_op.h @@ -92,6 +92,25 @@ class PoolOpLite : public OpLite { std::string DebugString() const override { return "pool2d"; } +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) { + auto input_dims = param_.x->dims(); + auto output_dims = param_.output->dims(); + ch->input_shape = ch->DimToStr(input_dims); + ch->output_shape = ch->DimToStr(output_dims); + if (param_.global_pooling) { + ch->remark = "global" + param_.pooling_type; + } else { + ch->remark = param_.pooling_type + std::to_string(param_.ksize[0]) + "x" + + std::to_string(param_.ksize[1]) + "s" + + std::to_string(param_.strides[0]) + "p" + + std::to_string((*param_.paddings)[0]); + } + ch->remark += padding_algorithm_; + ch->macs = output_dims.production() * param_.ksize[0] * param_.ksize[1]; + } +#endif + private: mutable PoolParam param_; std::string padding_algorithm_{""}; diff --git a/lite/operators/reshape_op.h b/lite/operators/reshape_op.h index 9dc302ec97..244557bbb9 100644 --- a/lite/operators/reshape_op.h +++ b/lite/operators/reshape_op.h @@ -37,6 +37,15 @@ class ReshapeOp : public OpLite { void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "reshape"; } +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) { + auto input_dims = param_.x->dims(); + auto output_dims = param_.output->dims(); + ch->input_shape = ch->DimToStr(input_dims); + ch->output_shape = ch->DimToStr(output_dims); + } +#endif + protected: mutable ReshapeParam param_; }; diff --git a/lite/operators/softmax_op.h b/lite/operators/softmax_op.h index 20dc2f461e..eb6e50fe6a 100644 --- a/lite/operators/softmax_op.h +++ b/lite/operators/softmax_op.h @@ -37,6 +37,17 @@ class SoftmaxOp : public OpLite { void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "softmax"; } +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) { + auto input_dims = param_.x->dims(); + auto output_dims = param_.output->dims(); + ch->input_shape = ch->DimToStr(input_dims); + ch->output_shape = ch->DimToStr(output_dims); + ch->remark = "axis" + std::to_string(param_.axis); + ch->macs = 2.f * input_dims.production() * 3; + } +#endif + private: mutable SoftmaxParam param_; }; diff --git a/lite/operators/squeeze_op.h b/lite/operators/squeeze_op.h index 983e17acf6..bd26331ddd 100644 --- a/lite/operators/squeeze_op.h +++ b/lite/operators/squeeze_op.h @@ -37,6 +37,15 @@ class SqueezeOp : public OpLite { void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "squeeze"; } +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) { + auto input_dims = param_.X->dims(); + auto output_dims = param_.Out->dims(); + ch->input_shape = ch->DimToStr(input_dims); + ch->output_shape = ch->DimToStr(output_dims); + } +#endif + protected: mutable SqueezeParam param_; }; @@ -54,6 +63,15 @@ class Squeeze2Op : public SqueezeOp { void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); } std::string DebugString() const override { return "squeeze2"; } + +#ifdef LITE_WITH_PROFILE + void GetOpRuntimeInfo(paddle::lite::profile::OpCharacter *ch) { + auto input_dims = param_.X->dims(); + auto output_dims = param_.Out->dims(); + ch->input_shape = ch->DimToStr(input_dims); + ch->output_shape = ch->DimToStr(output_dims); + } +#endif }; } // namespace operators -- GitLab