From 5bc942adab1cf70bf949fdd080c37590b344b326 Mon Sep 17 00:00:00 2001
From: yejianwu <yejianwu@xiaomi.com>
Date: Wed, 22 Nov 2017 16:18:22 +0800
Subject: [PATCH] update opencl profiling time from opencl api

---
 mace/core/net.cc                              | 33 +++++++++++++++----
 mace/core/runtime/opencl/opencl_runtime.cc    | 28 ++++++++++++++--
 mace/core/runtime/opencl/opencl_runtime.h     | 10 ++++++
 mace/core/runtime/opencl/opencl_wrapper.cc    | 21 ++++++++++++
 mace/kernels/opencl/addn.cc                   |  3 +-
 mace/kernels/opencl/batch_norm_opencl.cc      |  3 +-
 mace/kernels/opencl/conv_2d_opencl_1x1.cc     |  6 ++--
 mace/kernels/opencl/conv_2d_opencl_3x3.cc     |  3 +-
 .../opencl/depthwise_conv_opencl_3x3.cc       |  3 +-
 mace/kernels/opencl/pooling_opencl.cc         |  6 ++--
 mace/kernels/opencl/relu_opencl.cc            |  6 ++--
 mace/kernels/opencl/resize_bilinear_opencl.cc |  3 +-
 mace/kernels/opencl/space_to_batch_opecl.cc   |  3 +-
 mace/tools/benchmark/benchmark_model.cc       |  6 +++-
 14 files changed, 113 insertions(+), 21 deletions(-)

diff --git a/mace/core/net.cc b/mace/core/net.cc
index f93089a1..6c1533a2 100644
--- a/mace/core/net.cc
+++ b/mace/core/net.cc
@@ -36,7 +36,7 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
     VLOG(1) << "Running operator " << op->debug_def().name() << "("
             << op->debug_def().type() << ").";
     OperatorStats *op_stats = nullptr;
-    if (run_metadata) {
+    if (device_type_ != DeviceType::OPENCL && run_metadata) {
       op_stats = run_metadata->add_op_stats();
       op_stats->set_operator_name(op->debug_def().name());
       op_stats->set_type(op->debug_def().type());
@@ -48,11 +48,32 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
       LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def());
       return false;
     }
-    if (op_stats) {
-      op_stats->set_op_end_rel_micros(NowInMicroSec() -
-                                      op_stats->all_start_micros());
-      op_stats->set_all_end_rel_micros(NowInMicroSec() -
-                                       op_stats->all_start_micros());
+
+    if (run_metadata) {
+      if (device_type_ == DeviceType::OPENCL) {
+        OpenCLRuntime::Get()->command_queue().finish();
+        op_stats = run_metadata->add_op_stats();
+        op_stats->set_operator_name(op->debug_def().name());
+        op_stats->set_type(op->debug_def().type());
+
+        op_stats->set_all_start_micros(
+            OpenCLRuntime::GetEventProfilingStartInfo() / 1000);
+        op_stats->set_op_start_rel_micros(
+            OpenCLRuntime::GetEventProfilingStartInfo() / 1000 -
+            op_stats->all_start_micros());
+
+        op_stats->set_op_end_rel_micros(
+            OpenCLRuntime::GetEventProfilingEndInfo() / 1000 -
+            op_stats->all_start_micros());
+        op_stats->set_all_end_rel_micros(
+            OpenCLRuntime::GetEventProfilingEndInfo() / 1000 -
+            op_stats->all_start_micros());
+      } else {
+        op_stats->set_op_end_rel_micros(NowInMicroSec() -
+                                        op_stats->all_start_micros());
+        op_stats->set_all_end_rel_micros(NowInMicroSec() -
+                                         op_stats->all_start_micros());
+      }
     }
     VLOG(1) << "Op " << op->debug_def().name()
             << " has shape: " << internal::MakeString(op->Output(0)->shape());
diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 8bc2ecc1..e925e9fb 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -32,6 +32,8 @@ bool ReadSourceFile(const std::string &filename, std::string *content) {
 
 }  // namespace
 
+bool OpenCLRuntime::enable_profiling_ = false;  // off until EnableProfiling()
+cl::Event* OpenCLRuntime::profiling_ev_ = nullptr;  // set by EnableProfiling()
 
 OpenCLRuntime *OpenCLRuntime::Get() {
   static std::once_flag init_once;
@@ -80,13 +82,35 @@ OpenCLRuntime *OpenCLRuntime::Get() {
     // a context is like a "runtime link" to the device and platform;
     // i.e. communication is possible
     cl::Context context({gpu_device});
-    cl::CommandQueue command_queue(context, gpu_device);
+    cl::CommandQueue command_queue(context, gpu_device,
+        enable_profiling_ ? CL_QUEUE_PROFILING_ENABLE : 0);
     instance = new OpenCLRuntime(context, gpu_device, command_queue);
   });
 
   return instance;
 }
 
+void OpenCLRuntime::EnableProfiling() {
+  // Repeat calls are no-ops: only the first one allocates the shared event.
+  if (enable_profiling_) return;
+  enable_profiling_ = true;
+  profiling_ev_ = new cl::Event();
+}
+
+cl::Event* OpenCLRuntime::GetDefaultEvent() {
+  return profiling_ev_;  // null unless EnableProfiling() has been called
+}
+
+cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
+  MACE_CHECK(enable_profiling_, "should enable profiling first.");
+  return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
+}  // callers divide by 1000 to obtain microseconds (see SimpleNet::Run)
+
+cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
+  MACE_CHECK(enable_profiling_, "should enable profiling first.");
+  return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
+}  // callers divide by 1000 to obtain microseconds (see SimpleNet::Run)
+
 OpenCLRuntime::OpenCLRuntime(cl::Context context,
                              cl::Device device,
                              cl::CommandQueue command_queue)
@@ -95,7 +119,7 @@ OpenCLRuntime::OpenCLRuntime(cl::Context context,
   kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
 }
 
-OpenCLRuntime::~OpenCLRuntime() {}
+OpenCLRuntime::~OpenCLRuntime() { delete profiling_ev_; profiling_ev_ = nullptr; }
 
 cl::Context &OpenCLRuntime::context() { return context_; }
 
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index b4bda7d6..647fe172 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -18,6 +18,13 @@ class OpenCLRuntime {
  public:
   static OpenCLRuntime *Get();
 
+  // Call before the first Get(): the queue reads this flag when created.
+  static void EnableProfiling();
+  static cl::Event *GetDefaultEvent();
+
+  static cl_ulong GetEventProfilingStartInfo();
+  static cl_ulong GetEventProfilingEndInfo();
+
   cl::Context &context();
   cl::Device &device();
   cl::CommandQueue &command_queue();
@@ -41,6 +48,9 @@ class OpenCLRuntime {
                     cl::Program *program);
 
  private:
+  static bool enable_profiling_;    // flipped to true by EnableProfiling()
+  static cl::Event* profiling_ev_;  // event handed to kernel enqueues
+
   cl::Context context_;
   cl::Device device_;
   cl::CommandQueue command_queue_;
diff --git a/mace/core/runtime/opencl/opencl_wrapper.cc b/mace/core/runtime/opencl/opencl_wrapper.cc
index 46e82781..6921051f 100644
--- a/mace/core/runtime/opencl/opencl_wrapper.cc
+++ b/mace/core/runtime/opencl/opencl_wrapper.cc
@@ -148,6 +148,11 @@ class OpenCLLibraryImpl final {
                                                   size_t,
                                                   void *,
                                                   size_t *);
+  using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event,
+                                                  cl_profiling_info param_name,
+                                                  size_t param_value_size,
+                                                  void *param_value,
+                                                  size_t *param_value_size_ret);
 
 #define DEFINE_FUNC_PTR(func) func##Func func = nullptr
 
@@ -191,6 +196,7 @@ class OpenCLLibraryImpl final {
   DEFINE_FUNC_PTR(clReleaseDevice);
   DEFINE_FUNC_PTR(clRetainEvent);
   DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
+  DEFINE_FUNC_PTR(clGetEventProfilingInfo);
 
 #undef DEFINE_FUNC_PTR
 
@@ -313,6 +319,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
   ASSIGN_FROM_DLSYM(clReleaseDevice);
   ASSIGN_FROM_DLSYM(clRetainEvent);
   ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
+  ASSIGN_FROM_DLSYM(clGetEventProfilingInfo);
 
 #undef ASSIGN_FROM_DLSYM
 
@@ -832,3 +839,17 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
     return CL_OUT_OF_RESOURCES;
   }
 }
+
+cl_int clGetEventProfilingInfo(cl_event event,
+                               cl_profiling_info param_name,
+                               size_t param_value_size,
+                               void *param_value,
+                               size_t *param_value_size_ret) {
+  // Forward to the dlsym-resolved entry point; fail if it was not loaded.
+  const auto fn = mace::OpenCLLibraryImpl::Get().clGetEventProfilingInfo;
+  if (fn == nullptr) {
+    return CL_OUT_OF_RESOURCES;
+  }
+  return fn(event, param_name, param_value_size, param_value,
+            param_value_size_ret);
+}
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index b906c92d..c6e21010 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -30,7 +30,8 @@ static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       addn_kernel, cl::NullRange,
       cl::NDRange(gws),
-      cl::NDRange(lws));
+      cl::NDRange(lws),
+      NULL, OpenCLRuntime::GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS);
 }
 
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index badb3e7e..06d2e196 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -61,7 +61,8 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
     cl_int error = runtime->command_queue().enqueueNDRangeKernel(
         bm_kernel, cl::NullRange,
         cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]));
+        cl::NDRange(params[0], params[1], params[2]),
+        NULL, OpenCLRuntime::GetDefaultEvent());
 
     MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
     return error;
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index 1d89519e..8f234be1 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -90,7 +90,8 @@ void Conv1x1V2(const Tensor *input,
       conv_2d_kernel, cl::NullRange,
       cl::NDRange(static_cast<int>(batch), static_cast<int>(channel_blocks),
                   static_cast<int>(pixel_blocks)),
-      cl::NDRange(1, 2, kwg_size / 2));
+      cl::NDRange(1, 2, kwg_size / 2),
+      NULL, OpenCLRuntime::GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS, error);
 }
 
@@ -176,7 +177,8 @@ void Conv1x1V3(const Tensor *input,
       conv_2d_kernel, cl::NullRange,
       cl::NDRange(static_cast<int>(channel_blocks), static_cast<int>(height),
                   static_cast<int>(width)),
-      cl::NDRange(1, 2, kwg_size / 2));
+      cl::NDRange(1, 2, kwg_size / 2),
+      NULL, OpenCLRuntime::GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS, error);
 }
 
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 452f46fd..3078e4b8 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -51,7 +51,8 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       conv_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]));
+      cl::NDRange(lws[0], lws[1], lws[2]),
+      NULL, OpenCLRuntime::GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS);
 }
 
diff --git a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
index 84b73071..01a2fa1b 100644
--- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
+++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc
@@ -59,7 +59,8 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       conv_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]));
+      cl::NDRange(lws[0], lws[1], lws[2]),
+      NULL, OpenCLRuntime::GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS);
 }
 
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index f3fb6812..8c85b78a 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -51,7 +51,8 @@ static void Pooling3(const Tensor *input,
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       pooling_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]));
+      cl::NDRange(lws[0], lws[1], lws[2]),
+      NULL, OpenCLRuntime::GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS);
 }
 
@@ -99,7 +100,8 @@ static void PoolingN(const Tensor *input,
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       pooling_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]));
+      cl::NDRange(lws[0], lws[1], lws[2]),
+      NULL, OpenCLRuntime::GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS);
 }
 
diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc
index ed562d23..086a653f 100644
--- a/mace/kernels/opencl/relu_opencl.cc
+++ b/mace/kernels/opencl/relu_opencl.cc
@@ -35,7 +35,8 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
     cl_int error = runtime->command_queue().enqueueNDRangeKernel(
         relu_kernel, cl::NullRange,
         cl::NDRange(gws),
-        cl::NDRange(lws));
+        cl::NDRange(lws),
+        NULL, OpenCLRuntime::GetDefaultEvent());
     MACE_CHECK(error == CL_SUCCESS);
   } else {
     auto relu_kernel  = runtime->BuildKernel("relu", "relux", built_options);
@@ -51,7 +52,8 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
     cl_int error = runtime->command_queue().enqueueNDRangeKernel(
         relu_kernel, cl::NullRange,
         cl::NDRange(gws),
-        cl::NDRange(lws));
+        cl::NDRange(lws),
+        NULL, OpenCLRuntime::GetDefaultEvent());
     MACE_CHECK(error == CL_SUCCESS);
   }
 }
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index bf603d94..11b6ee01 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -49,7 +49,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
       cl::NDRange(static_cast<int>(batch * channels),
                   static_cast<int>(out_height), static_cast<int>(out_width)),
       // TODO (heliangliang) tuning and fix when kwg_size < devisor
-      cl::NDRange(1, 16, kwg_size / 16));
+      cl::NDRange(1, 16, kwg_size / 16),
+      NULL, OpenCLRuntime::GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS, error);
 }
 
diff --git a/mace/kernels/opencl/space_to_batch_opecl.cc b/mace/kernels/opencl/space_to_batch_opecl.cc
index a4ec2694..52e6de37 100644
--- a/mace/kernels/opencl/space_to_batch_opecl.cc
+++ b/mace/kernels/opencl/space_to_batch_opecl.cc
@@ -44,7 +44,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       s2b_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]));
+      cl::NDRange(lws[0], lws[1], lws[2]),
+      NULL, OpenCLRuntime::GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS);
 }
 
diff --git a/mace/tools/benchmark/benchmark_model.cc b/mace/tools/benchmark/benchmark_model.cc
index d4ae7b5d..09ac6fd6 100644
--- a/mace/tools/benchmark/benchmark_model.cc
+++ b/mace/tools/benchmark/benchmark_model.cc
@@ -3,6 +3,7 @@
 //
 
 #include "mace/core/net.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/tools/benchmark/stat_summarizer.h"
 #include "mace/utils/command_line_flags.h"
 #include "mace/utils/utils.h"
@@ -149,7 +150,7 @@ int Main(int argc, char **argv) {
 
   std::vector<Flag> flag_list = {
       Flag("model_file", &model_file, "graph file name"),
-      Flag("device", &device, "CPU/NEON"),
+      Flag("device", &device, "CPU/NEON/OPENCL"),
       Flag("input_layer", &input_layer_string, "input layer names"),
       Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
       Flag("input_layer_type", &input_layer_type_string, "input layer type"),
@@ -259,6 +260,9 @@ int Main(int argc, char **argv) {
   DeviceType_Parse(device, &device_type);
   VLOG(0) << device_type;
 
+  if (device_type == DeviceType::OPENCL)
+    OpenCLRuntime::EnableProfiling();
+
   // load model
   std::ifstream model_file_stream(model_file, std::ios::in | std::ios::binary);
   if (!model_file_stream.is_open()) {
-- 
GitLab