Commit 5bc942ad authored by: Y yejianwu

Update OpenCL op profiling time to use the OpenCL event profiling API

Parent 3738481f
......@@ -36,7 +36,7 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
VLOG(1) << "Running operator " << op->debug_def().name() << "("
<< op->debug_def().type() << ").";
OperatorStats *op_stats = nullptr;
if (run_metadata) {
if (device_type_ != DeviceType::OPENCL && run_metadata) {
op_stats = run_metadata->add_op_stats();
op_stats->set_operator_name(op->debug_def().name());
op_stats->set_type(op->debug_def().type());
......@@ -48,11 +48,32 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def());
return false;
}
if (op_stats) {
op_stats->set_op_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
if (run_metadata) {
if (device_type_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
op_stats = run_metadata->add_op_stats();
op_stats->set_operator_name(op->debug_def().name());
op_stats->set_type(op->debug_def().type());
op_stats->set_all_start_micros(
OpenCLRuntime::GetEventProfilingStartInfo() / 1000);
op_stats->set_op_start_rel_micros(
OpenCLRuntime::GetEventProfilingStartInfo() / 1000 -
op_stats->all_start_micros());
op_stats->set_op_end_rel_micros(
OpenCLRuntime::GetEventProfilingEndInfo() / 1000 -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(
OpenCLRuntime::GetEventProfilingEndInfo() / 1000 -
op_stats->all_start_micros());
} else {
op_stats->set_op_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
}
}
VLOG(1) << "Op " << op->debug_def().name()
<< " has shape: " << internal::MakeString(op->Output(0)->shape());
......
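Note: in the OpenCL branch above, the op stats are only filled in after command_queue().finish() returns, because the event timestamps are not valid until the command has completed. For reference, a caller can read the collected timings back roughly as follows — a sketch only; the op_stats()/op_end_rel_micros() getters are the usual protobuf-generated accessors for RunMetadata/OperatorStats and are assumed here, not part of this diff:

```cpp
// Sketch: run a net with metadata and print per-op durations.
// Assumes protobuf-generated getters on RunMetadata/OperatorStats
// (op_stats_size, op_stats, op_end_rel_micros, ...), which are not
// shown in this diff.
#include "mace/core/net.h"

void RunAndPrintOpTimes(mace::SimpleNet *net) {
  mace::RunMetadata run_metadata;
  if (!net->Run(&run_metadata)) return;
  for (int i = 0; i < run_metadata.op_stats_size(); ++i) {
    const auto &stats = run_metadata.op_stats(i);
    // op_end_rel_micros is relative to all_start_micros, i.e. the
    // operator's duration in microseconds.
    LOG(INFO) << stats.operator_name() << " (" << stats.type() << "): "
              << stats.op_end_rel_micros() << " us";
  }
}
```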
......@@ -32,6 +32,8 @@ bool ReadSourceFile(const std::string &filename, std::string *content) {
} // namespace
bool OpenCLRuntime::enable_profiling_ = false;
cl::Event* OpenCLRuntime::profiling_ev_ = NULL;
OpenCLRuntime *OpenCLRuntime::Get() {
static std::once_flag init_once;
......@@ -80,13 +82,35 @@ OpenCLRuntime *OpenCLRuntime::Get() {
// a context is like a "runtime link" to the device and platform;
// i.e. communication is possible
cl::Context context({gpu_device});
cl::CommandQueue command_queue(context, gpu_device);
cl::CommandQueue command_queue(context, gpu_device,
enable_profiling_ ? CL_QUEUE_PROFILING_ENABLE : 0);
instance = new OpenCLRuntime(context, gpu_device, command_queue);
});
return instance;
}
void OpenCLRuntime::EnableProfiling() {
if (!enable_profiling_) {
enable_profiling_ = true;
profiling_ev_ = new cl::Event();
}
}
cl::Event* OpenCLRuntime::GetDefaultEvent() {
return profiling_ev_;
}
cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
MACE_CHECK(enable_profiling_, "should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
}
cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
MACE_CHECK(enable_profiling_, "should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
}
OpenCLRuntime::OpenCLRuntime(cl::Context context,
cl::Device device,
cl::CommandQueue command_queue)
......@@ -95,7 +119,7 @@ OpenCLRuntime::OpenCLRuntime(cl::Context context,
kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
}
OpenCLRuntime::~OpenCLRuntime() {}
OpenCLRuntime::~OpenCLRuntime() { delete profiling_ev_; }
cl::Context &OpenCLRuntime::context() { return context_; }
......
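Taken together, the new static members give a minimal profiling path: enable profiling before the runtime (and its command queue) is first created, attach the shared event to an enqueue, wait for completion, then read the device timestamps. A sketch under those assumptions (mace namespace assumed; kernel and work-size setup elided):

```cpp
// Sketch of the new OpenCLRuntime profiling path.
#include "mace/core/runtime/opencl/opencl_runtime.h"

void ProfileOneKernel(cl::Kernel &kernel,
                      const cl::NDRange &gws, const cl::NDRange &lws) {
  using mace::OpenCLRuntime;
  // Must be called before the first Get(): the command queue is created
  // with CL_QUEUE_PROFILING_ENABLE only when profiling is already on.
  OpenCLRuntime::EnableProfiling();
  auto *runtime = OpenCLRuntime::Get();

  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
      kernel, cl::NullRange, gws, lws,
      NULL, OpenCLRuntime::GetDefaultEvent());
  MACE_CHECK(error == CL_SUCCESS);

  // Event timestamps are only valid after the command has completed.
  runtime->command_queue().finish();
  cl_ulong start_ns = OpenCLRuntime::GetEventProfilingStartInfo();
  cl_ulong end_ns = OpenCLRuntime::GetEventProfilingEndInfo();
  VLOG(1) << "Kernel took " << (end_ns - start_ns) / 1000 << " us";
}
```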
......@@ -18,6 +18,13 @@ class OpenCLRuntime {
public:
static OpenCLRuntime *Get();
static void EnableProfiling();
static cl::Event *GetDefaultEvent();
static cl_ulong GetEventProfilingStartInfo();
static cl_ulong GetEventProfilingEndInfo();
cl::Context &context();
cl::Device &device();
cl::CommandQueue &command_queue();
......@@ -41,6 +48,9 @@ class OpenCLRuntime {
cl::Program *program);
private:
static bool enable_profiling_;
static cl::Event* profiling_ev_;
cl::Context context_;
cl::Device device_;
cl::CommandQueue command_queue_;
......
......@@ -148,6 +148,11 @@ class OpenCLLibraryImpl final {
size_t,
void *,
size_t *);
using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event,
cl_profiling_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
#define DEFINE_FUNC_PTR(func) func##Func func = nullptr
......@@ -191,6 +196,7 @@ class OpenCLLibraryImpl final {
DEFINE_FUNC_PTR(clReleaseDevice);
DEFINE_FUNC_PTR(clRetainEvent);
DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
DEFINE_FUNC_PTR(clGetEventProfilingInfo);
#undef DEFINE_FUNC_PTR
......@@ -313,6 +319,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
ASSIGN_FROM_DLSYM(clReleaseDevice);
ASSIGN_FROM_DLSYM(clRetainEvent);
ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
ASSIGN_FROM_DLSYM(clGetEventProfilingInfo);
#undef ASSIGN_FROM_DLSYM
......@@ -832,3 +839,17 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
return CL_OUT_OF_RESOURCES;
}
}
cl_int clGetEventProfilingInfo(cl_event event,
cl_profiling_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetEventProfilingInfo;
if (func != nullptr) {
return func(event, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
}
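This stub forwards to the dynamically loaded symbol, mirroring the other wrappers in this file; the cl::Event::getProfilingInfo<> calls used by OpenCLRuntime ultimately resolve to it. For illustration, the same timestamps can be read through the C API directly (standard OpenCL calls, not part of this diff):

```cpp
// Illustration only: reading the timestamps via the C entry point this
// wrapper forwards to. CL_PROFILING_COMMAND_START/END are reported in
// nanoseconds, which is why the callers above divide by 1000.
cl_ulong GetCommandDurationNs(cl_event event) {
  cl_ulong start = 0, end = 0;
  clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                          sizeof(start), &start, NULL);
  clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                          sizeof(end), &end, NULL);
  return end - start;
}
```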
......@@ -30,7 +30,8 @@ static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange,
cl::NDRange(gws),
cl::NDRange(lws));
cl::NDRange(lws),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
......
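The same two-argument addition is applied to every kernel launch below (batch_norm, conv_2d, depthwise_conv, pooling, relu, resize_bilinear, space_to_batch): the wait-list stays NULL and the shared profiling event is passed as the last parameter of cl::CommandQueue::enqueueNDRangeKernel. Annotated sketch, with some_kernel/gws/lws as placeholders for the per-op values:

```cpp
// Annotated sketch of the repeated change in the kernel files.
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
    some_kernel, cl::NullRange,
    cl::NDRange(gws[0], gws[1], gws[2]),   // global work size
    cl::NDRange(lws[0], lws[1], lws[2]),   // local work size
    NULL,                                  // no wait-list
    OpenCLRuntime::GetDefaultEvent());     // profiling event (NULL when
                                           // profiling is disabled)
MACE_CHECK(error == CL_SUCCESS);
```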
......@@ -61,7 +61,8 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]));
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......
......@@ -90,7 +90,8 @@ void Conv1x1V2(const Tensor *input,
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(batch), static_cast<int>(channel_blocks),
static_cast<int>(pixel_blocks)),
cl::NDRange(1, 2, kwg_size / 2));
cl::NDRange(1, 2, kwg_size / 2),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
}
......@@ -176,7 +177,8 @@ void Conv1x1V3(const Tensor *input,
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(channel_blocks), static_cast<int>(height),
static_cast<int>(width)),
cl::NDRange(1, 2, kwg_size / 2));
cl::NDRange(1, 2, kwg_size / 2),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
}
......
......@@ -51,7 +51,8 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
......
......@@ -59,7 +59,8 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
......
......@@ -51,7 +51,8 @@ static void Pooling3(const Tensor *input,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
......@@ -99,7 +100,8 @@ static void PoolingN(const Tensor *input,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
......
......@@ -35,7 +35,8 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
relu_kernel, cl::NullRange,
cl::NDRange(gws),
cl::NDRange(lws));
cl::NDRange(lws),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
} else {
auto relu_kernel = runtime->BuildKernel("relu", "relux", built_options);
......@@ -51,7 +52,8 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
relu_kernel, cl::NullRange,
cl::NDRange(gws),
cl::NDRange(lws));
cl::NDRange(lws),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
}
......
......@@ -49,7 +49,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
cl::NDRange(static_cast<int>(batch * channels),
static_cast<int>(out_height), static_cast<int>(out_width)),
// TODO (heliangliang) tuning and fix when kwg_size < divisor
cl::NDRange(1, 16, kwg_size / 16));
cl::NDRange(1, 16, kwg_size / 16),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
}
......
......@@ -44,7 +44,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
s2b_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
......
......@@ -3,6 +3,7 @@
//
#include "mace/core/net.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/tools/benchmark/stat_summarizer.h"
#include "mace/utils/command_line_flags.h"
#include "mace/utils/utils.h"
......@@ -149,7 +150,7 @@ int Main(int argc, char **argv) {
std::vector<Flag> flag_list = {
Flag("model_file", &model_file, "graph file name"),
Flag("device", &device, "CPU/NEON"),
Flag("device", &device, "CPU/NEON/OPENCL"),
Flag("input_layer", &input_layer_string, "input layer names"),
Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
Flag("input_layer_type", &input_layer_type_string, "input layer type"),
......@@ -259,6 +260,9 @@ int Main(int argc, char **argv) {
DeviceType_Parse(device, &device_type);
VLOG(0) << device_type;
if (device_type == DeviceType::OPENCL)
OpenCLRuntime::EnableProfiling();
// load model
std::ifstream model_file_stream(model_file, std::ios::in | std::ios::binary);
if (!model_file_stream.is_open()) {
......
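The ordering here matters: EnableProfiling() must run before the first OpenCLRuntime::Get(), since the command queue is constructed exactly once and only gets CL_QUEUE_PROFILING_ENABLE if profiling is already enabled. A minimal sketch of that constraint (flag parsing and model loading elided):

```cpp
// Sketch of the ordering the benchmark relies on.
mace::DeviceType device_type;
mace::DeviceType_Parse(device, &device_type);
if (device_type == mace::DeviceType::OPENCL) {
  // Before any OpenCLRuntime::Get() call, so the queue is created with
  // profiling enabled.
  mace::OpenCLRuntime::EnableProfiling();
}
// ... build the net and call Run(&run_metadata) as above ...
```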