From 365e4fe3e881145da7ac43ed26b9aabdcb10b820 Mon Sep 17 00:00:00 2001 From: liuqi Date: Tue, 10 Apr 2018 11:33:56 +0800 Subject: [PATCH] Fix concat test dead on pengpai. --- mace/core/runtime/opencl/opencl_runtime.cc | 7 +++- mace/core/runtime/opencl/opencl_runtime.h | 2 + mace/core/runtime/opencl/opencl_wrapper.cc | 22 +++++++++++ mace/kernels/opencl/concat.cc | 40 +++++++++++++++++--- mace/kernels/opencl/slice.cc | 43 ++++++++++++++++++---- 5 files changed, 100 insertions(+), 14 deletions(-) diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index f24333ac..79093fa1 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -236,7 +236,7 @@ void GetAdrenoContextProperties(std::vector *properties, OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint): - storage_(nullptr) { + storage_(nullptr), is_profiling_enabled_(false) { LoadOpenCLLibrary(); std::vector all_platforms; @@ -286,6 +286,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint, if (Tuner::Get()->IsTuning() || (profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) { properties |= CL_QUEUE_PROFILING_ENABLE; + is_profiling_enabled_ = true; } cl_int err; @@ -590,4 +591,8 @@ const bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const { return out_of_range_check_; } +const bool OpenCLRuntime::is_profiling_enabled() const { + return is_profiling_enabled_; +} + } // namespace mace diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index c0656490..35a20bff 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -77,6 +77,7 @@ class OpenCLRuntime { const GPUType ParseGPUType(const std::string &device_name); const std::string ParseDeviceVersion(const std::string &device_version); void SaveBuiltCLProgram(); + const bool is_profiling_enabled() const; private: OpenCLRuntime(GPUPerfHint, GPUPriorityHint); @@ -116,6 +117,7 @@ class OpenCLRuntime { std::string platform_info_; bool program_map_changed_; std::unique_ptr storage_; + bool is_profiling_enabled_; static GPUPerfHint kGPUPerfHint; static GPUPriorityHint kGPUPriorityHint; diff --git a/mace/core/runtime/opencl/opencl_wrapper.cc b/mace/core/runtime/opencl/opencl_wrapper.cc index b8cd7097..c9b20577 100644 --- a/mace/core/runtime/opencl/opencl_wrapper.cc +++ b/mace/core/runtime/opencl/opencl_wrapper.cc @@ -168,6 +168,11 @@ class OpenCLLibraryImpl final { size_t, void *, size_t *); + using clGetEventInfoFunc = cl_int (*)(cl_event event, + cl_event_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret); using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event, cl_profiling_info param_name, size_t param_value_size, @@ -221,6 +226,7 @@ class OpenCLLibraryImpl final { MACE_CL_DEFINE_FUNC_PTR(clReleaseDevice); MACE_CL_DEFINE_FUNC_PTR(clRetainEvent); MACE_CL_DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo); + MACE_CL_DEFINE_FUNC_PTR(clGetEventInfo); MACE_CL_DEFINE_FUNC_PTR(clGetEventProfilingInfo); MACE_CL_DEFINE_FUNC_PTR(clGetImageInfo); @@ -344,6 +350,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) { MACE_CL_ASSIGN_FROM_DLSYM(clReleaseDevice); MACE_CL_ASSIGN_FROM_DLSYM(clRetainEvent); MACE_CL_ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo); + MACE_CL_ASSIGN_FROM_DLSYM(clGetEventInfo); MACE_CL_ASSIGN_FROM_DLSYM(clGetEventProfilingInfo); MACE_CL_ASSIGN_FROM_DLSYM(clGetImageInfo); @@ -881,6 +888,21 @@ CL_API_ENTRY cl_int clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 { return func(event); } +// Event API +CL_API_ENTRY cl_int clGetEventInfo(cl_event event, + cl_event_info param_name, + size_t param_value_size, + void *param_value, + size_t *param_value_size_ret) +CL_API_SUFFIX__VERSION_1_0 { + MACE_CHECK_NOTNULL(mace::openclLibraryImpl); + auto func = mace::openclLibraryImpl->clGetEventInfo; + MACE_CHECK_NOTNULL(func); + MACE_LATENCY_LOGGER(3, "clGetEventInfo"); + return func(event, param_name, param_value_size, param_value, + param_value_size_ret); +} + // Profiling APIs CL_API_ENTRY cl_int clGetEventProfilingInfo(cl_event event, cl_profiling_info param_name, diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index a2898edb..697ae854 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -137,6 +137,9 @@ static void ConcatN(cl::Kernel *kernel, const int inputs_count = input_list.size(); index_t chan_blk_offset = 0; + cl::Event event; + CallStats call_stats{INT64_MAX, 0}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; for (int i = 0; i < inputs_count; ++i) { const Tensor *input = input_list[i]; index_t input_channel_blk = input->dim(3) / 4; @@ -160,18 +163,45 @@ static void ConcatN(cl::Kernel *kernel, kernel->setArg(idx++, *(output->opencl_image())); chan_blk_offset += input_channel_blk; - const std::vector lws = {8, *kwg_size / 64, 8, 1}; - std::stringstream ss; - ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_" - << batch * height; - TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + *kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t j = 0; j < 3; ++j) { + roundup_gws[j] = RoundUp(gws[j], lws[j]); + } + error = runtime->command_queue().enqueueNDRangeKernel( + *kernel, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CHECK_CL_SUCCESS(error); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); char *kerror_code = (*kernel_error)->mutable_data(); MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; (*kernel_error)->UnMap(); } + if (runtime->is_profiling_enabled()) { + CallStats tmp_stats; + runtime->GetCallStats(event, &tmp_stats); + call_stats.start_micros = std::min(tmp_stats.start_micros, + call_stats.start_micros); + call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; + } + } + if (future != nullptr) { + future->wait_fn = [runtime, event, call_stats](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + stats->start_micros = call_stats.start_micros; + stats->end_micros = stats->start_micros + call_stats.end_micros; + } + }; } } diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 1945f14d..07f9086b 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -63,13 +63,8 @@ void SliceFunctor::operator()( }; const std::vector lws = {8, kwg_size_ / 64, 8, 1}; - std::stringstream ss; - ss << "slice_opencl_kernel_" - << input->dim(0) << "_" - << input->dim(1) << "_" - << input->dim(2) << "_" - << input_channels << "_" - << outputs_count; + cl::Event event; + CallStats call_stats{INT64_MAX, 0}; for (int i = 0; i < outputs_count; ++i) { uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { @@ -85,13 +80,45 @@ void SliceFunctor::operator()( kernel_.setArg(idx++, static_cast(channel_blk * i)); kernel_.setArg(idx++, *(output_list[i]->opencl_image())); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + cl_int error; + if (runtime->IsNonUniformWorkgroupsSupported()) { + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } else { + std::vector roundup_gws(lws.size()); + for (size_t j = 0; j < 3; ++j) { + roundup_gws[j] = RoundUp(gws[j], lws[j]); + } + + error = runtime->command_queue().enqueueNDRangeKernel( + kernel_, cl::NullRange, + cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]), + cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); + } + MACE_CHECK_CL_SUCCESS(error); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code; kernel_error_->UnMap(); } + if (runtime->is_profiling_enabled()) { + CallStats tmp_stats; + runtime->GetCallStats(event, &tmp_stats); + call_stats.start_micros = std::min(tmp_stats.start_micros, + call_stats.start_micros); + call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros; + } + } + if (future != nullptr) { + future->wait_fn = [runtime, event, call_stats](CallStats *stats) { + event.wait(); + if (stats != nullptr) { + stats->start_micros = call_stats.start_micros; + stats->end_micros = stats->start_micros + call_stats.end_micros; + } + }; } } -- GitLab