diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index f24333ac461e9cbff8708f4202ad0f7545e56eeb..79093fa177b2d7a7ee188e47c3e2a368c080c8a4 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -236,7 +236,7 @@ void GetAdrenoContextProperties(std::vector<cl_context_properties> *properties,
 
 OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
                              GPUPriorityHint gpu_priority_hint):
-    storage_(nullptr) {
+    storage_(nullptr), is_profiling_enabled_(false) {
   LoadOpenCLLibrary();
 
   std::vector<cl::Platform> all_platforms;
@@ -286,6 +286,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
   if (Tuner<uint32_t>::Get()->IsTuning() ||
      (profiling != nullptr && strlen(profiling) == 1 && profiling[0] == '1')) {
     properties |= CL_QUEUE_PROFILING_ENABLE;
+    is_profiling_enabled_ = true;
   }
 
   cl_int err;
@@ -590,4 +591,8 @@ const bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const {
   return out_of_range_check_;
 }
 
+const bool OpenCLRuntime::is_profiling_enabled() const {
+  return is_profiling_enabled_;
+}
+
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index c06564908e56ba794963a8a70314760f31b5afc0..35a20bff198fc52f0859f935663118bbde3b45ec 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -77,6 +77,7 @@ class OpenCLRuntime {
   const GPUType ParseGPUType(const std::string &device_name);
   const std::string ParseDeviceVersion(const std::string &device_version);
   void SaveBuiltCLProgram();
+  const bool is_profiling_enabled() const;
 
  private:
   OpenCLRuntime(GPUPerfHint, GPUPriorityHint);
@@ -116,6 +117,7 @@ class OpenCLRuntime {
   std::string platform_info_;
   bool program_map_changed_;
   std::unique_ptr<KVStorage> storage_;
+  bool is_profiling_enabled_;
 
   static GPUPerfHint kGPUPerfHint;
   static GPUPriorityHint kGPUPriorityHint;
diff --git a/mace/core/runtime/opencl/opencl_wrapper.cc b/mace/core/runtime/opencl/opencl_wrapper.cc
index b8cd709730d16aec1715e415bbeff4c552cd8787..c9b20577aecd2bf57b04be150b49ffc7d4a9cc05 100644
--- a/mace/core/runtime/opencl/opencl_wrapper.cc
+++ b/mace/core/runtime/opencl/opencl_wrapper.cc
@@ -168,6 +168,11 @@ class OpenCLLibraryImpl final {
                                                   size_t,
                                                   void *,
                                                   size_t *);
+  using clGetEventInfoFunc = cl_int (*)(cl_event event,
+                                        cl_event_info param_name,
+                                        size_t param_value_size,
+                                        void *param_value,
+                                        size_t *param_value_size_ret);
   using clGetEventProfilingInfoFunc = cl_int (*)(cl_event event,
                                                  cl_profiling_info param_name,
                                                  size_t param_value_size,
@@ -221,6 +226,7 @@ class OpenCLLibraryImpl final {
   MACE_CL_DEFINE_FUNC_PTR(clReleaseDevice);
   MACE_CL_DEFINE_FUNC_PTR(clRetainEvent);
   MACE_CL_DEFINE_FUNC_PTR(clGetKernelWorkGroupInfo);
+  MACE_CL_DEFINE_FUNC_PTR(clGetEventInfo);
   MACE_CL_DEFINE_FUNC_PTR(clGetEventProfilingInfo);
   MACE_CL_DEFINE_FUNC_PTR(clGetImageInfo);
 
@@ -344,6 +350,7 @@ void *OpenCLLibraryImpl::LoadFromPath(const std::string &path) {
   MACE_CL_ASSIGN_FROM_DLSYM(clReleaseDevice);
   MACE_CL_ASSIGN_FROM_DLSYM(clRetainEvent);
   MACE_CL_ASSIGN_FROM_DLSYM(clGetKernelWorkGroupInfo);
+  MACE_CL_ASSIGN_FROM_DLSYM(clGetEventInfo);
   MACE_CL_ASSIGN_FROM_DLSYM(clGetEventProfilingInfo);
   MACE_CL_ASSIGN_FROM_DLSYM(clGetImageInfo);
 
@@ -881,6 +888,21 @@ CL_API_ENTRY cl_int clReleaseEvent(cl_event event) CL_API_SUFFIX__VERSION_1_0 {
   return func(event);
 }
 
+// Event API
+CL_API_ENTRY cl_int clGetEventInfo(cl_event event,
+                                   cl_event_info param_name,
+                                   size_t param_value_size,
+                                   void *param_value,
+                                   size_t *param_value_size_ret)
+    CL_API_SUFFIX__VERSION_1_0 {
+  MACE_CHECK_NOTNULL(mace::openclLibraryImpl);
+  auto func = mace::openclLibraryImpl->clGetEventInfo;
+  MACE_CHECK_NOTNULL(func);
+  MACE_LATENCY_LOGGER(3, "clGetEventInfo");
+  return func(event, param_name, param_value_size, param_value,
+              param_value_size_ret);
+}
+
 // Profiling APIs
 CL_API_ENTRY cl_int clGetEventProfilingInfo(cl_event event,
                                             cl_profiling_info param_name,
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index a2898edb26851fb8e98dea3b6d589c0f544b6c43..697ae85448dce15cbb036ac13b177f2359274022 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -137,6 +137,9 @@ static void ConcatN(cl::Kernel *kernel,
   const int inputs_count = input_list.size();
   index_t chan_blk_offset = 0;
+  cl::Event event;
+  CallStats call_stats{INT64_MAX, 0};
+  const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
   for (int i = 0; i < inputs_count; ++i) {
     const Tensor *input = input_list[i];
     index_t input_channel_blk = input->dim(3) / 4;
@@ -160,18 +163,45 @@ static void ConcatN(cl::Kernel *kernel,
     kernel->setArg(idx++, *(output->opencl_image()));
 
     chan_blk_offset += input_channel_blk;
-    const std::vector<uint32_t> lws = {8, *kwg_size / 64, 8, 1};
-    std::stringstream ss;
-    ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_"
-       << batch * height;
-    TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future);
+    cl_int error;
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          *kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+    } else {
+      std::vector<uint32_t> roundup_gws(lws.size());
+      for (size_t j = 0; j < 3; ++j) {
+        roundup_gws[j] = RoundUp(gws[j], lws[j]);
+      }
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          *kernel, cl::NullRange,
+          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+    }
+    MACE_CHECK_CL_SUCCESS(error);
 
     if (runtime->IsOutOfRangeCheckEnabled()) {
       (*kernel_error)->Map(nullptr);
       char *kerror_code = (*kernel_error)->mutable_data<char>();
       MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
       (*kernel_error)->UnMap();
     }
+    if (runtime->is_profiling_enabled()) {
+      CallStats tmp_stats;
+      runtime->GetCallStats(event, &tmp_stats);
+      call_stats.start_micros = std::min(tmp_stats.start_micros,
+                                         call_stats.start_micros);
+      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
+    }
+  }
+  if (future != nullptr) {
+    future->wait_fn = [runtime, event, call_stats](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        stats->start_micros = call_stats.start_micros;
+        stats->end_micros = stats->start_micros + call_stats.end_micros;
+      }
+    };
   }
 }
diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc
index 1945f14d135f8bebdf34d3e6d3b3ace5c75649eb..07f9086b74d2a819a79d7b906fe57576aff82d09 100644
--- a/mace/kernels/opencl/slice.cc
+++ b/mace/kernels/opencl/slice.cc
@@ -63,13 +63,8 @@ void SliceFunctor::operator()(
   };
 
   const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
-  std::stringstream ss;
-  ss << "slice_opencl_kernel_"
-     << input->dim(0) << "_"
-     << input->dim(1) << "_"
-     << input->dim(2) << "_"
-     << input_channels << "_"
-     << outputs_count;
+  cl::Event event;
+  CallStats call_stats{INT64_MAX, 0};
   for (int i = 0; i < outputs_count; ++i) {
     uint32_t idx = 0;
     if (runtime->IsOutOfRangeCheckEnabled()) {
@@ -85,13 +80,45 @@
     kernel_.setArg(idx++, static_cast<int32_t>(channel_blk * i));
     kernel_.setArg(idx++, *(output_list[i]->opencl_image()));
 
-    TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future);
+    cl_int error;
+    if (runtime->IsNonUniformWorkgroupsSupported()) {
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+    } else {
+      std::vector<uint32_t> roundup_gws(lws.size());
+      for (size_t j = 0; j < 3; ++j) {
+        roundup_gws[j] = RoundUp(gws[j], lws[j]);
+      }
+
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          kernel_, cl::NullRange,
+          cl::NDRange(roundup_gws[0], roundup_gws[1], roundup_gws[2]),
+          cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event);
+    }
+    MACE_CHECK_CL_SUCCESS(error);
 
     if (runtime->IsOutOfRangeCheckEnabled()) {
       kernel_error_->Map(nullptr);
       char *kerror_code = kernel_error_->mutable_data<char>();
       MACE_CHECK(*kerror_code == 0) << "Kernel error code: " << *kerror_code;
       kernel_error_->UnMap();
     }
+    if (runtime->is_profiling_enabled()) {
+      CallStats tmp_stats;
+      runtime->GetCallStats(event, &tmp_stats);
+      call_stats.start_micros = std::min(tmp_stats.start_micros,
+                                         call_stats.start_micros);
+      call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
+    }
+  }
+  if (future != nullptr) {
+    future->wait_fn = [runtime, event, call_stats](CallStats *stats) {
+      event.wait();
+      if (stats != nullptr) {
+        stats->start_micros = call_stats.start_micros;
+        stats->end_micros = stats->start_micros + call_stats.end_micros;
+      }
+    };
   }
 }
diff --git a/mace/ops/concat_test.cc b/mace/ops/concat_test.cc
index efc8bd7fc1bb4e59e92c0f14c11736011432586a..d45939577ec8b837cc4424159bc95891d3ee54f4 100644
--- a/mace/ops/concat_test.cc
+++ b/mace/ops/concat_test.cc
@@ -151,12 +151,17 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
   int num_inputs = shapes.size();
   int concat_axis_size = 0;
   // Construct graph
+  std::vector<std::vector<float>> inputs(num_inputs, std::vector<float>());
+  std::vector<const float *> input_ptrs(num_inputs);
   OpsTestNet net;
   for (int i = 0; i < num_inputs; ++i) {
     const std::string input_name = MakeString("Input", i);
     const std::string image_name = MakeString("InputImage", i);
     concat_axis_size += shapes[i][axis];
-    net.AddRandomInput(input_name, shapes[i]);
+    GenerateRandomRealTypeData(shapes[i], &inputs[i]);
+    input_ptrs[i] = inputs[i].data();
+    net.AddInputFromArray(input_name,
+                          shapes[i], inputs[i]);
     BufferToImage(&net, input_name, image_name,
                   kernels::BufferType::IN_OUT_CHANNEL);
   }
@@ -186,17 +191,15 @@ void OpenclRandomTest(const std::vector<std::vector<index_t>> &shapes,
   Tensor::MappingGuard output_mapper(output);
   const float *output_ptr = output->data<float>();
+  const float *output_ptr_end = output_ptr + output->size();
   int k = 0;
-  while (output_ptr != (output->data<float>() + output->size())) {
+  while (output_ptr != output_ptr_end) {
     for (int i = 0; i < num_inputs; ++i) {
       index_t num_elements =
           std::accumulate(shapes[i].begin() + axis, shapes[i].end(), 1,
                           std::multiplies<index_t>());
-      const std::string input_name = MakeString("Input", i);
-      const Tensor *input_tensor = net.GetTensor(input_name.data());
-      Tensor::MappingGuard input_guard(input_tensor);
-      const float *input_ptr = input_tensor->data<float>() + k * num_elements;
+      const float *input_ptr = input_ptrs[i] + k * num_elements;
       for (int j = 0; j < num_elements; ++j) {
         EXPECT_NEAR(*(input_ptr + j), *output_ptr++, 1e-2)
             << "With index: " << i << ", " << j;
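
Note: the concat and slice changes above follow the same pattern: each output slice is enqueued as its own kernel, and the per-kernel profiling times are folded into a single CallStats (earliest start, summed durations) that the future's wait_fn later reports. The stand-alone sketch below models only that accumulation step; the hard-coded per_kernel timings stand in for what runtime->GetCallStats() would read from each cl::Event when CL_QUEUE_PROFILING_ENABLE is set, and are illustrative values, not part of the patch.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors mace::CallStats: start and end timestamps in microseconds.
struct CallStats {
  int64_t start_micros;
  int64_t end_micros;
};

int main() {
  // Stand-ins for the per-kernel timings of three enqueued kernels
  // (made-up numbers for illustration).
  const std::vector<CallStats> per_kernel = {
      {1000, 1400}, {1450, 1700}, {1720, 2100}};

  // Same accumulation as in ConcatN / SliceFunctor: keep the earliest start,
  // and add up each kernel's duration.
  CallStats call_stats{INT64_MAX, 0};
  for (const CallStats &tmp_stats : per_kernel) {
    call_stats.start_micros =
        std::min(tmp_stats.start_micros, call_stats.start_micros);
    call_stats.end_micros += tmp_stats.end_micros - tmp_stats.start_micros;
  }

  // What the future's wait_fn reports: start is the earliest start, end is
  // that start plus the summed kernel time.
  const int64_t reported_start = call_stats.start_micros;
  const int64_t reported_end = reported_start + call_stats.end_micros;
  std::cout << "start=" << reported_start << "us end=" << reported_end
            << "us total=" << (reported_end - reported_start) << "us\n";
  return 0;
}

Because durations are summed rather than taking the last end minus the first start, host-side gaps between the enqueued kernels do not inflate the reported GPU time.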