From 5bc942adab1cf70bf949fdd080c37590b344b326 Mon Sep 17 00:00:00 2001 From: yejianwu Date: Wed, 22 Nov 2017 16:18:22 +0800 Subject: [PATCH] update opencl profiling time from opencl api --- mace/core/net.cc | 33 +++++++++++++++---- mace/core/runtime/opencl/opencl_runtime.cc | 28 ++++++++++++++-- mace/core/runtime/opencl/opencl_runtime.h | 10 ++++++ mace/core/runtime/opencl/opencl_wrapper.cc | 21 ++++++++++++ mace/kernels/opencl/addn.cc | 3 +- mace/kernels/opencl/batch_norm_opencl.cc | 3 +- mace/kernels/opencl/conv_2d_opencl_1x1.cc | 6 ++-- mace/kernels/opencl/conv_2d_opencl_3x3.cc | 3 +- .../opencl/depthwise_conv_opencl_3x3.cc | 3 +- mace/kernels/opencl/pooling_opencl.cc | 6 ++-- mace/kernels/opencl/relu_opencl.cc | 6 ++-- mace/kernels/opencl/resize_bilinear_opencl.cc | 3 +- mace/kernels/opencl/space_to_batch_opecl.cc | 3 +- mace/tools/benchmark/benchmark_model.cc | 6 +++- 14 files changed, 113 insertions(+), 21 deletions(-) diff --git a/mace/core/net.cc b/mace/core/net.cc index f93089a1..6c1533a2 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -36,7 +36,7 @@ bool SimpleNet::Run(RunMetadata *run_metadata) { VLOG(1) << "Running operator " << op->debug_def().name() << "(" << op->debug_def().type() << ")."; OperatorStats *op_stats = nullptr; - if (run_metadata) { + if (device_type_ != DeviceType::OPENCL && run_metadata) { op_stats = run_metadata->add_op_stats(); op_stats->set_operator_name(op->debug_def().name()); op_stats->set_type(op->debug_def().type()); @@ -48,11 +48,32 @@ bool SimpleNet::Run(RunMetadata *run_metadata) { LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def()); return false; } - if (op_stats) { - op_stats->set_op_end_rel_micros(NowInMicroSec() - - op_stats->all_start_micros()); - op_stats->set_all_end_rel_micros(NowInMicroSec() - - op_stats->all_start_micros()); + + if (run_metadata) { + if (device_type_ == DeviceType::OPENCL) { + OpenCLRuntime::Get()->command_queue().finish(); + op_stats = run_metadata->add_op_stats(); + 
op_stats->set_operator_name(op->debug_def().name()); + op_stats->set_type(op->debug_def().type()); + + op_stats->set_all_start_micros( + OpenCLRuntime::GetEventProfilingStartInfo() / 1000); + op_stats->set_op_start_rel_micros( + OpenCLRuntime::GetEventProfilingStartInfo() / 1000 - + op_stats->all_start_micros()); + + op_stats->set_op_end_rel_micros( + OpenCLRuntime::GetEventProfilingEndInfo() / 1000 - + op_stats->all_start_micros()); + op_stats->set_all_end_rel_micros( + OpenCLRuntime::GetEventProfilingEndInfo() / 1000 - + op_stats->all_start_micros()); + } else { + op_stats->set_op_end_rel_micros(NowInMicroSec() - + op_stats->all_start_micros()); + op_stats->set_all_end_rel_micros(NowInMicroSec() - + op_stats->all_start_micros()); + } } VLOG(1) << "Op " << op->debug_def().name() << " has shape: " << internal::MakeString(op->Output(0)->shape()); diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 8bc2ecc1..e925e9fb 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -32,6 +32,8 @@ bool ReadSourceFile(const std::string &filename, std::string *content) { } // namespace +bool OpenCLRuntime::enable_profiling_ = false; +cl::Event* OpenCLRuntime::profiling_ev_ = NULL; OpenCLRuntime *OpenCLRuntime::Get() { static std::once_flag init_once; @@ -80,13 +82,35 @@ OpenCLRuntime *OpenCLRuntime::Get() { // a context is like a "runtime link" to the device and platform; // i.e. communication is possible cl::Context context({gpu_device}); - cl::CommandQueue command_queue(context, gpu_device); + cl::CommandQueue command_queue(context, gpu_device, + enable_profiling_ ? 
CL_QUEUE_PROFILING_ENABLE : 0); instance = new OpenCLRuntime(context, gpu_device, command_queue); }); return instance; } +void OpenCLRuntime::EnableProfiling() { + if (!enable_profiling_) { + enable_profiling_ = true; + profiling_ev_ = new cl::Event(); + } +} + +cl::Event* OpenCLRuntime::GetDefaultEvent() { + return profiling_ev_; +} + +cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() { + MACE_CHECK(enable_profiling_, "should enable profiling first."); + return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>(); +} + +cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() { + MACE_CHECK(enable_profiling_, "should enable profiling first."); + return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>(); +} + OpenCLRuntime::OpenCLRuntime(cl::Context context, cl::Device device, cl::CommandQueue command_queue) @@ -95,7 +119,7 @@ OpenCLRuntime::OpenCLRuntime(cl::Context context, kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/"; } -OpenCLRuntime::~OpenCLRuntime() {} +OpenCLRuntime::~OpenCLRuntime() { delete profiling_ev_; } cl::Context &OpenCLRuntime::context() { return context_; } diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index b4bda7d6..647fe172 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -18,6 +18,13 @@ class OpenCLRuntime { public: static OpenCLRuntime *Get(); + static void EnableProfiling(); + static cl::Event *GetDefaultEvent(); + + static cl_ulong GetEventProfilingStartInfo(); + static cl_ulong GetEventProfilingEndInfo(); + + cl::Context &context(); cl::Device &device(); cl::CommandQueue &command_queue(); @@ -41,6 +48,9 @@ class OpenCLRuntime { cl::Program *program); private: + static bool enable_profiling_; + static cl::Event* profiling_ev_; + cl::Context context_; cl::Device device_; cl::CommandQueue command_queue_; diff --git a/mace/core/runtime/opencl/opencl_wrapper.cc b/mace/core/runtime/opencl/opencl_wrapper.cc index 46e82781..6921051f 100644 --- 
a/mace/core/runtime/opencl/opencl_wrapper.cc +++ b/mace/core/runtime/opencl/opencl_wrapper.cc @@ -148,6 +148,11 @@ class OpenCLLibraryImpl final { size_t, void *, size_t *); + using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); #define DEFINE_FUNC_PTR(func) func##Func func = nullptr @@ -191,6 +196,7 @@ class OpenCLLibraryImpl final { DEFINE_FUNC_PTR(clReleaseDevice); DEFINE_FUNC_PTR(clRetainEvent); DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo); + DEFINE_FUNC_PTR(clGetEventProfilingInfo); #undef DEFINE_FUNC_PTR @@ -313,6 +319,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) { ASSIGN_FROM_DLSYM(clReleaseDevice); ASSIGN_FROM_DLSYM(clRetainEvent); ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo); + ASSIGN_FROM_DLSYM(clGetEventProfilingInfo); #undef ASSIGN_FROM_DLSYM @@ -832,3 +839,17 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel, return CL_OUT_OF_RESOURCES; } } + +cl_int clGetEventProfilingInfo(cl_event event, + cl_profiling_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) { + auto func = mace::OpenCLLibraryImpl::Get().clGetEventProfilingInfo; + if (func != nullptr) { + return func(event, param_name, param_value_size, param_value, + param_value_size_ret); + } else { + return CL_OUT_OF_RESOURCES; + } +} diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index b906c92d..c6e21010 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -30,7 +30,8 @@ static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) { cl_int error = runtime->command_queue().enqueueNDRangeKernel( addn_kernel, cl::NullRange, cl::NDRange(gws), - cl::NDRange(lws)); + cl::NDRange(lws), + NULL, OpenCLRuntime::GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS); } diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc 
index badb3e7e..06d2e196 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -61,7 +61,8 @@ void BatchNormFunctor::operator()( cl_int error = runtime->command_queue().enqueueNDRangeKernel( bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2])); + cl::NDRange(params[0], params[1], params[2]), + NULL, OpenCLRuntime::GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; return error; diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index 1d89519e..8f234be1 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -90,7 +90,8 @@ void Conv1x1V2(const Tensor *input, conv_2d_kernel, cl::NullRange, cl::NDRange(static_cast(batch), static_cast(channel_blocks), static_cast(pixel_blocks)), - cl::NDRange(1, 2, kwg_size / 2)); + cl::NDRange(1, 2, kwg_size / 2), + NULL, OpenCLRuntime::GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS, error); } @@ -176,7 +177,8 @@ void Conv1x1V3(const Tensor *input, conv_2d_kernel, cl::NullRange, cl::NDRange(static_cast(channel_blocks), static_cast(height), static_cast(width)), - cl::NDRange(1, 2, kwg_size / 2)); + cl::NDRange(1, 2, kwg_size / 2), + NULL, OpenCLRuntime::GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS, error); } diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 452f46fd..3078e4b8 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -51,7 +51,8 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter, cl_int error = runtime->command_queue().enqueueNDRangeKernel( conv_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2])); + cl::NDRange(lws[0], lws[1], lws[2]), + NULL, OpenCLRuntime::GetDefaultEvent()); MACE_CHECK(error == 
CL_SUCCESS); } diff --git a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc index 84b73071..01a2fa1b 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl_3x3.cc @@ -59,7 +59,8 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input, cl_int error = runtime->command_queue().enqueueNDRangeKernel( conv_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2])); + cl::NDRange(lws[0], lws[1], lws[2]), + NULL, OpenCLRuntime::GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS); } diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index f3fb6812..8c85b78a 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -51,7 +51,8 @@ static void Pooling3(const Tensor *input, cl_int error = runtime->command_queue().enqueueNDRangeKernel( pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2])); + cl::NDRange(lws[0], lws[1], lws[2]), + NULL, OpenCLRuntime::GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS); } @@ -99,7 +100,8 @@ static void PoolingN(const Tensor *input, cl_int error = runtime->command_queue().enqueueNDRangeKernel( pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2])); + cl::NDRange(lws[0], lws[1], lws[2]), + NULL, OpenCLRuntime::GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS); } diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc index ed562d23..086a653f 100644 --- a/mace/kernels/opencl/relu_opencl.cc +++ b/mace/kernels/opencl/relu_opencl.cc @@ -35,7 +35,8 @@ void ReluFunctor::operator()(const Tensor *input, cl_int error = runtime->command_queue().enqueueNDRangeKernel( relu_kernel, cl::NullRange, cl::NDRange(gws), - cl::NDRange(lws)); + cl::NDRange(lws), + NULL, 
OpenCLRuntime::GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS); } else { auto relu_kernel = runtime->BuildKernel("relu", "relux", built_options); @@ -51,7 +52,8 @@ void ReluFunctor::operator()(const Tensor *input, cl_int error = runtime->command_queue().enqueueNDRangeKernel( relu_kernel, cl::NullRange, cl::NDRange(gws), - cl::NDRange(lws)); + cl::NDRange(lws), + NULL, OpenCLRuntime::GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS); } } diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index bf603d94..11b6ee01 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -49,7 +49,8 @@ void ResizeBilinearFunctor::operator()( cl::NDRange(static_cast(batch * channels), static_cast(out_height), static_cast(out_width)), // TODO (heliangliang) tuning and fix when kwg_size < devisor - cl::NDRange(1, 16, kwg_size / 16)); + cl::NDRange(1, 16, kwg_size / 16), + NULL, OpenCLRuntime::GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS, error); } diff --git a/mace/kernels/opencl/space_to_batch_opecl.cc b/mace/kernels/opencl/space_to_batch_opecl.cc index a4ec2694..52e6de37 100644 --- a/mace/kernels/opencl/space_to_batch_opecl.cc +++ b/mace/kernels/opencl/space_to_batch_opecl.cc @@ -44,7 +44,8 @@ void SpaceToBatchFunctor::operator()(Tensor *space_te cl_int error = runtime->command_queue().enqueueNDRangeKernel( s2b_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2])); + cl::NDRange(lws[0], lws[1], lws[2]), + NULL, OpenCLRuntime::GetDefaultEvent()); MACE_CHECK(error == CL_SUCCESS); } diff --git a/mace/tools/benchmark/benchmark_model.cc b/mace/tools/benchmark/benchmark_model.cc index d4ae7b5d..09ac6fd6 100644 --- a/mace/tools/benchmark/benchmark_model.cc +++ b/mace/tools/benchmark/benchmark_model.cc @@ -3,6 +3,7 @@ // #include "mace/core/net.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" #include 
"mace/tools/benchmark/stat_summarizer.h" #include "mace/utils/command_line_flags.h" #include "mace/utils/utils.h" @@ -149,7 +150,7 @@ int Main(int argc, char **argv) { std::vector flag_list = { Flag("model_file", &model_file, "graph file name"), - Flag("device", &device, "CPU/NEON"), + Flag("device", &device, "CPU/NEON/OPENCL"), Flag("input_layer", &input_layer_string, "input layer names"), Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"), Flag("input_layer_type", &input_layer_type_string, "input layer type"), @@ -259,6 +260,9 @@ int Main(int argc, char **argv) { DeviceType_Parse(device, &device_type); VLOG(0) << device_type; + if (device_type == DeviceType::OPENCL) + OpenCLRuntime::EnableProfiling(); + // load model std::ifstream model_file_stream(model_file, std::ios::in | std::ios::binary); if (!model_file_stream.is_open()) { -- GitLab