Commit 5bc942ad authored by: Y yejianwu

Update OpenCL op profiling time to use the OpenCL event profiling API

Parent 3738481f
......@@ -36,7 +36,7 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
VLOG(1) << "Running operator " << op->debug_def().name() << "("
<< op->debug_def().type() << ").";
OperatorStats *op_stats = nullptr;
if (run_metadata) {
if (device_type_ != DeviceType::OPENCL && run_metadata) {
op_stats = run_metadata->add_op_stats();
op_stats->set_operator_name(op->debug_def().name());
op_stats->set_type(op->debug_def().type());
......@@ -48,11 +48,32 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
LOG(ERROR) << "Operator failed: " << ProtoDebugString(op->debug_def());
return false;
}
if (op_stats) {
op_stats->set_op_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
if (run_metadata) {
if (device_type_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
op_stats = run_metadata->add_op_stats();
op_stats->set_operator_name(op->debug_def().name());
op_stats->set_type(op->debug_def().type());
op_stats->set_all_start_micros(
OpenCLRuntime::GetEventProfilingStartInfo() / 1000);
op_stats->set_op_start_rel_micros(
OpenCLRuntime::GetEventProfilingStartInfo() / 1000 -
op_stats->all_start_micros());
op_stats->set_op_end_rel_micros(
OpenCLRuntime::GetEventProfilingEndInfo() / 1000 -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(
OpenCLRuntime::GetEventProfilingEndInfo() / 1000 -
op_stats->all_start_micros());
} else {
op_stats->set_op_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
op_stats->set_all_end_rel_micros(NowInMicroSec() -
op_stats->all_start_micros());
}
}
VLOG(1) << "Op " << op->debug_def().name()
<< " has shape: " << internal::MakeString(op->Output(0)->shape());
......
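Note: in the OpenCL branch above, the op stats are only filled in after command_queue().finish() returns, because the event timestamps are not valid until the command has completed. For reference, a caller can read the collected timings back roughly as follows — a sketch only; the op_stats()/op_end_rel_micros() getters are the usual protobuf-generated accessors for RunMetadata/OperatorStats and are assumed here, not part of this diff:

```cpp
// Sketch: run a net with metadata and print per-op durations.
// Assumes protobuf-generated getters on RunMetadata/OperatorStats
// (op_stats_size, op_stats, op_end_rel_micros, ...), which are not
// shown in this diff.
#include "mace/core/net.h"

void RunAndPrintOpTimes(mace::SimpleNet *net) {
  mace::RunMetadata run_metadata;
  if (!net->Run(&run_metadata)) return;
  for (int i = 0; i < run_metadata.op_stats_size(); ++i) {
    const auto &stats = run_metadata.op_stats(i);
    // op_end_rel_micros is relative to all_start_micros, i.e. the
    // operator's duration in microseconds.
    LOG(INFO) << stats.operator_name() << " (" << stats.type() << "): "
              << stats.op_end_rel_micros() << " us";
  }
}
```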
......@@ -32,6 +32,8 @@ bool ReadSourceFile(const std::string &filename, std::string *content) {
} // namespace
bool OpenCLRuntime::enable_profiling_ = false;
cl::Event* OpenCLRuntime::profiling_ev_ = NULL;
OpenCLRuntime *OpenCLRuntime::Get() {
static std::once_flag init_once;
......@@ -80,13 +82,35 @@ OpenCLRuntime *OpenCLRuntime::Get() {
// a context is like a "runtime link" to the device and platform;
// i.e. communication is possible
cl::Context context({gpu_device});
cl::CommandQueue command_queue(context, gpu_device);
cl::CommandQueue command_queue(context, gpu_device,
enable_profiling_ ? CL_QUEUE_PROFILING_ENABLE : 0);
instance = new OpenCLRuntime(context, gpu_device, command_queue);
});
return instance;
}
void OpenCLRuntime::EnableProfiling() {
if (!enable_profiling_) {
enable_profiling_ = true;
profiling_ev_ = new cl::Event();
}
}
cl::Event* OpenCLRuntime::GetDefaultEvent() {
return profiling_ev_;
}
cl_ulong OpenCLRuntime::GetEventProfilingStartInfo() {
MACE_CHECK(enable_profiling_, "should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_START>();
}
cl_ulong OpenCLRuntime::GetEventProfilingEndInfo() {
MACE_CHECK(enable_profiling_, "should enable profiling first.");
return profiling_ev_->getProfilingInfo<CL_PROFILING_COMMAND_END>();
}
OpenCLRuntime::OpenCLRuntime(cl::Context context,
cl::Device device,
cl::CommandQueue command_queue)
......@@ -95,7 +119,7 @@ OpenCLRuntime::OpenCLRuntime(cl::Context context,
kernel_path_ = std::string(kernel_path == nullptr ? "" : kernel_path) + "/";
}
OpenCLRuntime::~OpenCLRuntime() {}
OpenCLRuntime::~OpenCLRuntime() { delete profiling_ev_; }
cl::Context &OpenCLRuntime::context() { return context_; }
......
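Taken together, the new static members give a minimal profiling path: enable profiling before the runtime (and its command queue) is first created, attach the shared event to an enqueue, wait for completion, then read the device timestamps. A sketch under those assumptions (mace namespace assumed; kernel and work-size setup elided):

```cpp
// Sketch of the new OpenCLRuntime profiling path.
#include "mace/core/runtime/opencl/opencl_runtime.h"

void ProfileOneKernel(cl::Kernel &kernel,
                      const cl::NDRange &gws, const cl::NDRange &lws) {
  using mace::OpenCLRuntime;
  // Must be called before the first Get(): the command queue is created
  // with CL_QUEUE_PROFILING_ENABLE only when profiling is already on.
  OpenCLRuntime::EnableProfiling();
  auto *runtime = OpenCLRuntime::Get();

  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
      kernel, cl::NullRange, gws, lws,
      NULL, OpenCLRuntime::GetDefaultEvent());
  MACE_CHECK(error == CL_SUCCESS);

  // Event timestamps are only valid after the command has completed.
  runtime->command_queue().finish();
  cl_ulong start_ns = OpenCLRuntime::GetEventProfilingStartInfo();
  cl_ulong end_ns = OpenCLRuntime::GetEventProfilingEndInfo();
  VLOG(1) << "Kernel took " << (end_ns - start_ns) / 1000 << " us";
}
```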
......@@ -18,6 +18,13 @@ class OpenCLRuntime {
public:
static OpenCLRuntime *Get();
static void EnableProfiling();
static cl::Event *GetDefaultEvent();
static cl_ulong GetEventProfilingStartInfo();
static cl_ulong GetEventProfilingEndInfo();
cl::Context &context();
cl::Device &device();
cl::CommandQueue &command_queue();
......@@ -41,6 +48,9 @@ class OpenCLRuntime {
cl::Program *program);
private:
static bool enable_profiling_;
static cl::Event* profiling_ev_;
cl::Context context_;
cl::Device device_;
cl::CommandQueue command_queue_;
......
......@@ -148,6 +148,11 @@ class OpenCLLibraryImpl final {
size_t,
void *,
size_t *);
using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event,
cl_profiling_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret);
#define DEFINE_FUNC_PTR(func) func##Func func = nullptr
......@@ -191,6 +196,7 @@ class OpenCLLibraryImpl final {
DEFINE_FUNC_PTR(clReleaseDevice);
DEFINE_FUNC_PTR(clRetainEvent);
DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
DEFINE_FUNC_PTR(clGetEventProfilingInfo);
#undef DEFINE_FUNC_PTR
......@@ -313,6 +319,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
ASSIGN_FROM_DLSYM(clReleaseDevice);
ASSIGN_FROM_DLSYM(clRetainEvent);
ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
ASSIGN_FROM_DLSYM(clGetEventProfilingInfo);
#undef ASSIGN_FROM_DLSYM
......@@ -832,3 +839,17 @@ cl_int clGetKernelWorkGroupInfo(cl_kernel kernel,
return CL_OUT_OF_RESOURCES;
}
}
cl_int clGetEventProfilingInfo(cl_event event,
cl_profiling_info param_name,
size_t param_value_size,
void *param_value,
size_t *param_value_size_ret) {
auto func = mace::OpenCLLibraryImpl::Get().clGetEventProfilingInfo;
if (func != nullptr) {
return func(event, param_name, param_value_size, param_value,
param_value_size_ret);
} else {
return CL_OUT_OF_RESOURCES;
}
}
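This stub forwards to the dynamically loaded symbol, mirroring the other wrappers in this file; the cl::Event::getProfilingInfo<> calls used by OpenCLRuntime ultimately resolve to it. For illustration, the same timestamps can be read through the C API directly (standard OpenCL calls, not part of this diff):

```cpp
// Illustration only: reading the timestamps via the C entry point this
// wrapper forwards to. CL_PROFILING_COMMAND_START/END are reported in
// nanoseconds, which is why the callers above divide by 1000.
cl_ulong GetCommandDurationNs(cl_event event) {
  cl_ulong start = 0, end = 0;
  clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START,
                          sizeof(start), &start, NULL);
  clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END,
                          sizeof(end), &end, NULL);
  return end - start;
}
```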
......@@ -30,7 +30,8 @@ static void Add2(const Tensor *input0, const Tensor *input1, Tensor *output) {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange,
cl::NDRange(gws),
cl::NDRange(lws));
cl::NDRange(lws),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
......
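The same two-argument addition is applied to every kernel launch below (batch_norm, conv_2d, depthwise_conv, pooling, relu, resize_bilinear, space_to_batch): the wait-list stays NULL and the shared profiling event is passed as the last parameter of cl::CommandQueue::enqueueNDRangeKernel. Annotated sketch, with some_kernel/gws/lws as placeholders for the per-op values:

```cpp
// Annotated sketch of the repeated change in the kernel files.
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
    some_kernel, cl::NullRange,
    cl::NDRange(gws[0], gws[1], gws[2]),   // global work size
    cl::NDRange(lws[0], lws[1], lws[2]),   // local work size
    NULL,                                  // no wait-list
    OpenCLRuntime::GetDefaultEvent());     // profiling event (NULL when
                                           // profiling is disabled)
MACE_CHECK(error == CL_SUCCESS);
```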
......@@ -61,7 +61,8 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]));
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
......
......@@ -90,7 +90,8 @@ void Conv1x1V2(const Tensor *input,
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(batch), static_cast<int>(channel_blocks),
static_cast<int>(pixel_blocks)),
cl::NDRange(1, 2, kwg_size / 2));
cl::NDRange(1, 2, kwg_size / 2),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
}
......@@ -176,7 +177,8 @@ void Conv1x1V3(const Tensor *input,
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<int>(channel_blocks), static_cast<int>(height),
static_cast<int>(width)),
cl::NDRange(1, 2, kwg_size / 2));
cl::NDRange(1, 2, kwg_size / 2),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
}
......
......@@ -51,7 +51,8 @@ static void InnerConv2dK3x3S12(const Tensor *input, const Tensor *filter,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
......
......@@ -59,7 +59,8 @@ static void InnerDepthwiseConvOpenclK3x3S12(const Tensor *input,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
......
......@@ -51,7 +51,8 @@ static void Pooling3(const Tensor *input,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
......@@ -99,7 +100,8 @@ static void PoolingN(const Tensor *input,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
......
......@@ -35,7 +35,8 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
relu_kernel, cl::NullRange,
cl::NDRange(gws),
cl::NDRange(lws));
cl::NDRange(lws),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
} else {
auto relu_kernel = runtime->BuildKernel("relu", "relux", built_options);
......@@ -51,7 +52,8 @@ void ReluFunctor<DeviceType::OPENCL, float>::operator()(const Tensor *input,
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
relu_kernel, cl::NullRange,
cl::NDRange(gws),
cl::NDRange(lws));
cl::NDRange(lws),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
}
......
......@@ -49,7 +49,8 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, float>::operator()(
cl::NDRange(static_cast<int>(batch * channels),
static_cast<int>(out_height), static_cast<int>(out_width)),
// TODO (heliangliang) tuning and fix when kwg_size < divisor
cl::NDRange(1, 16, kwg_size / 16));
cl::NDRange(1, 16, kwg_size / 16),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
}
......
......@@ -44,7 +44,8 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, float>::operator()(Tensor *space_te
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
s2b_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
......
......@@ -3,6 +3,7 @@
//
#include "mace/core/net.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/tools/benchmark/stat_summarizer.h"
#include "mace/utils/command_line_flags.h"
#include "mace/utils/utils.h"
......@@ -149,7 +150,7 @@ int Main(int argc, char **argv) {
std::vector<Flag> flag_list = {
Flag("model_file", &model_file, "graph file name"),
Flag("device", &device, "CPU/NEON"),
Flag("device", &device, "CPU/NEON/OPENCL"),
Flag("input_layer", &input_layer_string, "input layer names"),
Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
Flag("input_layer_type", &input_layer_type_string, "input layer type"),
......@@ -259,6 +260,9 @@ int Main(int argc, char **argv) {
DeviceType_Parse(device, &device_type);
VLOG(0) << device_type;
if (device_type == DeviceType::OPENCL)
OpenCLRuntime::EnableProfiling();
// load model
std::ifstream model_file_stream(model_file, std::ios::in | std::ios::binary);
if (!model_file_stream.is_open()) {
......
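The ordering here matters: EnableProfiling() must run before the first OpenCLRuntime::Get(), since the command queue is constructed exactly once and only gets CL_QUEUE_PROFILING_ENABLE if profiling is already enabled. A minimal sketch of that constraint (flag parsing and model loading elided):

```cpp
// Sketch of the ordering the benchmark relies on.
mace::DeviceType device_type;
mace::DeviceType_Parse(device, &device_type);
if (device_type == mace::DeviceType::OPENCL) {
  // Before any OpenCLRuntime::Get() call, so the queue is created with
  // profiling enabled.
  mace::OpenCLRuntime::EnableProfiling();
}
// ... build the net and call Run(&run_metadata) as above ...
```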