Commit f12366af authored by 刘琦

Merge branch 'master' into 'master'

Update OpenCL profiling so that per-kernel time is read from the OpenCL profiling API

See merge request !111
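
For context: this change switches per-op GPU timing from host-side wall-clock measurements around the enqueue call to the OpenCL event-profiling API, which reports device-side start/end timestamps for each kernel. Below is a minimal sketch of that underlying mechanism, independent of this codebase; the context, device, kernel, and work sizes are assumed to be set up elsewhere.

// Minimal sketch of OpenCL event-based kernel timing with the OpenCL C++
// wrapper (cl.hpp-era API, as used elsewhere in this repository). `context`,
// `device`, and `kernel` are assumed to be created by the caller; gws/lws are
// the global and local work sizes.
cl_ulong ProfileKernelNanos(cl::Context &context, cl::Device &device,
                            cl::Kernel &kernel, const cl::NDRange &gws,
                            const cl::NDRange &lws) {
  // The queue must be created with profiling enabled, otherwise
  // getProfilingInfo() fails with CL_PROFILING_INFO_NOT_AVAILABLE.
  cl::CommandQueue queue(context, device, CL_QUEUE_PROFILING_ENABLE);

  cl::Event event;
  queue.enqueueNDRangeKernel(kernel, cl::NullRange, gws, lws, NULL, &event);
  queue.finish();  // the event is only valid after the kernel has completed

  // Device-side timestamps, in nanoseconds.
  cl_ulong start = event.getProfilingInfo<CL_PROFILING_COMMAND_START>();
  cl_ulong end = event.getProfilingInfo<CL_PROFILING_COMMAND_END>();
  return end - start;
}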
@@ -38,7 +38,7 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
     VLOG(1) << "Running operator " << op->debug_def().name() << "("
             << op->debug_def().type() << ").";
     OperatorStats *op_stats = nullptr;
-    if (run_metadata) {
+    if (run_metadata && device_type_ != DeviceType::OPENCL) {
       op_stats = run_metadata->add_op_stats();
       op_stats->set_operator_name(op->debug_def().name());
       op_stats->set_type(op->debug_def().type());
@@ -50,14 +50,32 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
       LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def());
       return false;
     }
-    if (op_stats) {
+    if (run_metadata) {
       if (device_type_ == DeviceType::OPENCL) {
         OpenCLRuntime::Get()->command_queue().finish();
+        op_stats = run_metadata->add_op_stats();
+        op_stats->set_operator_name(op->debug_def().name());
+        op_stats->set_type(op->debug_def().type());
+        op_stats->set_all_start_micros(
+            OpenCLRuntime::Get()->GetEventProfilingStartInfo() / 1000);
+        op_stats->set_op_start_rel_micros(
+            OpenCLRuntime::Get()->GetEventProfilingStartInfo() / 1000 -
+            op_stats->all_start_micros());
+        op_stats->set_op_end_rel_micros(
+            OpenCLRuntime::Get()->GetEventProfilingEndInfo() / 1000 -
+            op_stats->all_start_micros());
+        op_stats->set_all_end_rel_micros(
+            OpenCLRuntime::Get()->GetEventProfilingEndInfo() / 1000 -
+            op_stats->all_start_micros());
+      } else {
+        op_stats->set_op_end_rel_micros(NowInMicroSec() -
+                                        op_stats->all_start_micros());
+        op_stats->set_all_end_rel_micros(NowInMicroSec() -
+                                         op_stats->all_start_micros());
       }
-      op_stats->set_op_end_rel_micros(NowInMicroSec() -
-                                      op_stats->all_start_micros());
-      op_stats->set_all_end_rel_micros(NowInMicroSec() -
-                                       op_stats->all_start_micros());
     }
     VLOG(1) << "Op " << op->debug_def().name()
             << " has shape: " << internal::MakeString(op->Output(0)->shape());
...
@@ -32,6 +32,8 @@ bool ReadSourceFile(const std::string &filename, std::string *content) {
 }  // namespace
 
+bool OpenCLRuntime::enable_profiling_ = false;
+cl::Event* OpenCLRuntime::profiling_ev_ = NULL;
 
 OpenCLRuntime *OpenCLRuntime::Get() {
   static std::once_flag init_once;
@@ -80,13 +82,35 @@ OpenCLRuntime *OpenCLRuntime::Get() {
     // a context is like a "runtime link" to the device and platform;
     // i.e. communication is possible
     cl::Context context({gpu_device});
-    cl::CommandQueue command_queue(context, gpu_device);
+    cl::CommandQueue command_queue(context, gpu_device,
+        enable_profiling_ ? CL_QUEUE_PROFILING_ENABLE : 0);
     instance = new OpenCLRuntime(context, gpu_device, command_queue);
   });
 
   return instance;
 }
 
+void OpenCLRuntime::EnableProfiling() {
+  if (!enable_profiling_) {
+    enable_profiling_ = true;
+    profiling_ev_ = new cl::Event();
+  }
+}
+
+cl::Event* OpenCLRuntime::GetDefaultEvent() {
+  return profiling_ev_;
+}
+
+cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
+  MACE_CHECK(enable_profiling_, "should enable profiling first.");
+  return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
+}
+
+cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
+  MACE_CHECK(enable_profiling_, "should enable profiling first.");
+  return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
+}
+
 OpenCLRuntime::OpenCLRuntime(cl::Context context,
                              cl::Device device,
                              cl::CommandQueue command_queue)
@@ -95,7 +119,10 @@ OpenCLRuntime::OpenCLRuntime(cl::Context context,
   kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
 }
 
-OpenCLRuntime::~OpenCLRuntime() {}
+OpenCLRuntime::~OpenCLRuntime() {
+  if (profiling_ev_)
+    delete profiling_ev_;
+}
 
 cl::Context &OpenCLRuntime::context() { return context_; }
...
@@ -18,6 +18,13 @@ class OpenCLRuntime {
  public:
   static OpenCLRuntime *Get();
+  static void EnableProfiling();
+
+  cl::Event *GetDefaultEvent();
+
+  cl_ulong GetEventProfilingStartInfo();
+  cl_ulong GetEventProfilingEndInfo();
+
   cl::Context &context();
   cl::Device &device();
   cl::CommandQueue &command_queue();
@@ -41,6 +48,9 @@ class OpenCLRuntime {
                     cl::Program *program);
 
  private:
+  static bool enable_profiling_;
+  static cl::Event* profiling_ev_;
+
   cl::Context context_;
   cl::Device device_;
   cl::CommandQueue command_queue_;
...
@@ -160,6 +160,11 @@ class OpenCLLibraryImpl final {
                                          size_t,
                                          void *,
                                          size_t *);
+  using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event,
+                                                 cl_profiling_info param_name,
+                                                 size_t param_value_size,
+                                                 void *param_value,
+                                                 size_t *param_value_size_ret);
   using clGetImageInfoFunc = cl_int (*)(cl_mem,
                                         cl_image_info,
                                         size_t,
@@ -209,6 +214,7 @@ class OpenCLLibraryImpl final {
   DEFINE_FUNC_PTR(clReleaseDevice);
   DEFINE_FUNC_PTR(clRetainEvent);
   DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
+  DEFINE_FUNC_PTR(clGetEventProfilingInfo);
   DEFINE_FUNC_PTR(clGetImageInfo);
 
 #undef DEFINE_FUNC_PTR
@@ -333,6 +339,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
   ASSIGN_FROM_DLSYM(clReleaseDevice);
   ASSIGN_FROM_DLSYM(clRetainEvent);
   ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
+  ASSIGN_FROM_DLSYM(clGetEventProfilingInfo);
   ASSIGN_FROM_DLSYM(clGetImageInfo);
 
 #undef ASSIGN_FROM_DLSYM
@@ -879,6 +886,20 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
   }
 }
 
+cl_int clGetEventProfilingInfo(cl_event event,
+                               cl_profiling_info param_name,
+                               size_t param_value_size,
+                               void *param_value,
+                               size_t *param_value_size_ret) {
+  auto func = mace::OpenCLLibraryImpl::Get().clGetEventProfilingInfo;
+  if (func != nullptr) {
+    return func(event, param_name, param_value_size, param_value,
+                param_value_size_ret);
+  } else {
+    return CL_OUT_OF_RESOURCES;
+  }
+}
+
 cl_int clGetImageInfo(cl_mem image,
                       cl_image_info param_name,
                       size_t param_value_size,
@@ -892,3 +913,4 @@ cl_int clGetImageInfo(cl_mem image,
     return CL_OUT_OF_RESOURCES;
   }
 }
...
@@ -31,7 +31,8 @@ static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       addn_kernel, cl::NullRange,
       cl::NDRange(gws),
-      cl::NDRange(lws));
+      cl::NDRange(lws),
+      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS);
 }
...
@@ -62,7 +62,8 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       bm_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(params[0], params[1], params[2]));
+      cl::NDRange(params[0], params[1], params[2]),
+      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
 
   return error;
...
@@ -59,7 +59,8 @@ void Conv1x1V2(const Tensor *input,
       conv_2d_kernel, cl::NullRange,
       cl::NDRange(static_cast<int>(batch), static_cast<int>(channel_blocks),
                   static_cast<int>(pixel_blocks)),
-      cl::NDRange(1, 2, kwg_size / 2));
+      cl::NDRange(1, 2, kwg_size / 2),
+      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS, error);
 }
@@ -104,7 +105,8 @@ void Conv1x1V3(const Tensor *input,
       conv_2d_kernel, cl::NullRange,
       cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(height),
                   static_cast<uint32_t>(height * batch)),
-      cl::NDRange(4, 15, 8));
+      cl::NDRange(4, 15, 8),
+      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS, error);
 }
...
@@ -52,7 +52,8 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       conv_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]));
+      cl::NDRange(lws[0], lws[1], lws[2]),
+      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS);
 }
...
@@ -60,7 +60,8 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       conv_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]));
+      cl::NDRange(lws[0], lws[1], lws[2]),
+      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS);
 }
...
@@ -52,7 +52,8 @@ static void Pooling3(const Tensor *input,
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       pooling_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]));
+      cl::NDRange(lws[0], lws[1], lws[2]),
+      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS);
 }
@@ -100,7 +101,8 @@ static void PoolingN(const Tensor *input,
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       pooling_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]));
+      cl::NDRange(lws[0], lws[1], lws[2]),
+      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS);
 }
...
@@ -36,7 +36,8 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
     cl_int error = runtime->command_queue().enqueueNDRangeKernel(
         relu_kernel, cl::NullRange,
         cl::NDRange(gws),
-        cl::NDRange(lws));
+        cl::NDRange(lws),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
     MACE_CHECK(error == CL_SUCCESS);
   } else {
     auto relu_kernel = runtime->BuildKernel("relu", "relux", built_options);
@@ -52,7 +53,8 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
     cl_int error = runtime->command_queue().enqueueNDRangeKernel(
         relu_kernel, cl::NullRange,
         cl::NDRange(gws),
-        cl::NDRange(lws));
+        cl::NDRange(lws),
+        NULL, OpenCLRuntime::Get()->GetDefaultEvent());
     MACE_CHECK(error == CL_SUCCESS);
   }
 }
...
@@ -50,7 +50,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
       cl::NDRange(static_cast<int>(batch * channels),
                   static_cast<int>(out_height), static_cast<int>(out_width)),
       // TODO (heliangliang) tuning and fix when kwg_size < devisor
-      cl::NDRange(1, 16, kwg_size / 16));
+      cl::NDRange(1, 16, kwg_size / 16),
+      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS, error);
 }
...
@@ -45,7 +45,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
   cl_int error = runtime->command_queue().enqueueNDRangeKernel(
       s2b_kernel, cl::NullRange,
       cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]));
+      cl::NDRange(lws[0], lws[1], lws[2]),
+      NULL, OpenCLRuntime::Get()->GetDefaultEvent());
   MACE_CHECK(error == CL_SUCCESS);
 }
...
@@ -3,6 +3,7 @@
 //
 
 #include "mace/core/operator.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/core/testing/test_benchmark.h"
 #include "mace/ops/ops_test_util.h"
 
@@ -12,6 +13,9 @@ static void BatchNorm(
     int iters, int batch, int channels, int height, int width) {
   mace::testing::StopTiming();
 
+  if ( D == OPENCL )
+    OpenCLRuntime::EnableProfiling();
+
   OpsTestNet net;
   OpDefBuilder("BatchNorm", "BatchNormBM")
       .Input("Input")
@@ -77,4 +81,4 @@ BM_BATCH_NORM(1, 512, 14, 14, float);
 BM_BATCH_NORM(1, 1024, 7, 7, float);
 BM_BATCH_NORM(32, 1, 256, 256, float);
 BM_BATCH_NORM(32, 3, 256, 256, float);
-}  // namespace mace
\ No newline at end of file
+}  // namespace mace
...
@@ -3,6 +3,7 @@
 //
 
 #include "mace/core/net.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
 #include "mace/tools/benchmark/stat_summarizer.h"
 #include "mace/utils/command_line_flags.h"
 #include "mace/utils/utils.h"
@@ -149,7 +150,7 @@ int Main(int argc, char **argv) {
   std::vector<Flag> flag_list = {
       Flag("model_file", &model_file, "graph file name"),
-      Flag("device", &device, "CPU/NEON"),
+      Flag("device", &device, "CPU/NEON/OPENCL"),
      Flag("input_layer", &input_layer_string, "input layer names"),
       Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
       Flag("input_layer_type", &input_layer_type_string, "input layer type"),
@@ -259,6 +260,9 @@ int Main(int argc, char **argv) {
   DeviceType_Parse(device, &device_type);
   VLOG(0) << device_type;
 
+  if (device_type == DeviceType::OPENCL)
+    OpenCLRuntime::EnableProfiling();
+
   // load model
   std::ifstream model_file_stream(model_file, std::ios::in | std::ios::binary);
   if (!model_file_stream.is_open()) {
...
@@ -131,13 +131,14 @@ class Tuner {
                  double &time_us) {
     RetType res;
     int64_t total_time_us = 0;
-    const int64_t start_time = NowInMicroSec();
     for (int i = 0; i < num_runs; ++i) {
       res = func(params);
+      OpenCLRuntime::Get()->command_queue().finish();
+      double start_time = OpenCLRuntime::Get()->GetEventProfilingStartInfo() / 1000.0;
+      double end_time = OpenCLRuntime::Get()->GetEventProfilingEndInfo() / 1000.0;
+      total_time_us += end_time - start_time;
     }
-    OpenCLRuntime::Get()->command_queue().finish();
-    const int64_t end_time = NowInMicroSec();
-    total_time_us += end_time - start_time;
 
     time_us = total_time_us * 1.0 / num_runs;
     return res;
...
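
For reference, a minimal sketch of how the pieces introduced in this merge request fit together. The wrapper function below and its call site are hypothetical; the individual calls are the ones added in the diff above, and the code is assumed to live inside namespace mace.

// Hypothetical helper showing the intended usage of the new profiling hooks.
void TimeOneKernel(cl::Kernel &kernel, const cl::NDRange &gws,
                   const cl::NDRange &lws) {
  // Must run before the first OpenCLRuntime::Get(), so that the singleton's
  // command queue is created with CL_QUEUE_PROFILING_ENABLE.
  OpenCLRuntime::EnableProfiling();

  auto runtime = OpenCLRuntime::Get();
  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
      kernel, cl::NullRange, gws, lws,
      NULL, runtime->GetDefaultEvent());
  MACE_CHECK(error == CL_SUCCESS);

  // The profiling counters are only meaningful once the queue has drained.
  runtime->command_queue().finish();
  cl_ulong elapsed_ns = runtime->GetEventProfilingEndInfo() -
                        runtime->GetEventProfilingStartInfo();
  VLOG(1) << "Kernel time: " << elapsed_ns / 1000 << " us";
}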