From a9dce8ec1e809ee48769f9044b6b6495ffd52e3b Mon Sep 17 00:00:00 2001
From: liuqi
Date: Wed, 24 Jan 2018 15:53:15 +0800
Subject: [PATCH] Add block tuning to limit each kernel's execution time to
 less than 1 ms.

---
 mace/core/runtime/opencl/opencl_runtime.cc    | 15 ++++
 mace/core/runtime/opencl/opencl_runtime.h     | 28 +++---
 mace/kernels/opencl/activation_opencl.cc      | 83 +++++++++++------
 mace/kernels/opencl/addn.cc                   | 73 ++++++++++-----
 mace/kernels/opencl/batch_norm_opencl.cc      | 83 +++++++++++------
 mace/kernels/opencl/concat.cc                 | 87 ++++++++++++------
 mace/kernels/opencl/conv_2d_opencl_1x1.cc     | 83 +++++++++++------
 mace/kernels/opencl/conv_2d_opencl_3x3.cc     | 84 ++++++++++++------
 mace/kernels/opencl/conv_2d_opencl_general.cc | 84 ++++++++++++------
 mace/kernels/opencl/helper.h                  |  2 +
 mace/kernels/opencl/pooling_opencl.cc         | 86 ++++++++++++------
 mace/kernels/opencl/resize_bilinear_opencl.cc | 82 ++++++++++++-----
 mace/kernels/opencl/softmax_opencl.cc         | 88 +++++++++++++------
 mace/kernels/opencl/space_to_batch_opencl.cc  | 82 ++++++++++++-----
 mace/utils/timer.h                            | 61 ++++++++-----
 mace/utils/tuner.h                            | 22 +++--
 16 files changed, 724 insertions(+), 319 deletions(-)

diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index 3c8b013a..5b7ccdd8 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -50,6 +50,21 @@ double OpenCLProfilingTimer::ElapsedMicros() {
   return (stop_nanos_ - start_nanos_) / 1000.0;
 }
 
+double OpenCLProfilingTimer::AccumulatedMicros() {
+  return accumulated_micros_;
+}
+
+void OpenCLProfilingTimer::AccumulateTiming() {
+  StopTiming();
+  accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0;
+}
+
+void OpenCLProfilingTimer::ClearTiming() {
+  start_nanos_ = 0;
+  stop_nanos_ = 0;
+  accumulated_micros_ = 0;
+}
+
 OpenCLRuntime *OpenCLRuntime::Global() {
   static OpenCLRuntime instance;
   return &instance;
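Note on the new Timer methods: AccumulateTiming() folds the most recent
(start, stop) pair into a running total, so a kernel that is split into N
enqueues reports the sum of the N per-block times rather than only the last
one. A minimal sketch of the intended call pattern (illustrative only;
EnqueueBlock is a hypothetical stand-in for the enqueue calls added below):

    // Sum per-block execution times across N enqueues of one kernel.
    timer->ClearTiming();                 // reset the running total
    for (uint32_t i = 0; i < num_blocks; ++i) {
      EnqueueBlock(i);                    // one cl::CommandQueue enqueue
      timer->AccumulateTiming();          // total += (stop - start) / 1000
    }
    double total_micros = timer->AccumulatedMicros();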
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index 7245b926..ff596459 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -18,16 +18,20 @@ namespace mace {
 
 class OpenCLProfilingTimer : public Timer {
- public:
-  explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event) {};
-  void StartTiming() override;
-  void StopTiming() override;
-  double ElapsedMicros() override;
+ public:
+  explicit OpenCLProfilingTimer(const cl::Event *event)
+      : event_(event), accumulated_micros_(0) {};
+  void StartTiming() override;
+  void StopTiming() override;
+  void AccumulateTiming() override;
+  void ClearTiming() override;
+  double ElapsedMicros() override;
+  double AccumulatedMicros() override;
 
- private:
-  const cl::Event *event_;
-  double start_nanos_;
-  double stop_nanos_;
+ private:
+  const cl::Event *event_;
+  double start_nanos_;
+  double stop_nanos_;
+  double accumulated_micros_;
 };
 
 class OpenCLRuntime {
@@ -40,15 +44,15 @@ class OpenCLRuntime {
   void GetCallStats(const cl::Event &event, CallStats *stats);
   uint32_t GetDeviceMaxWorkGroupSize();
-  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel);
+  uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   cl::Kernel BuildKernel(const std::string &program_name,
                          const std::string &kernel_name,
                          const std::set<std::string> &build_options);
 
  private:
   OpenCLRuntime();
   ~OpenCLRuntime();
-  OpenCLRuntime(const OpenCLRuntime&) = delete;
-  OpenCLRuntime &operator=(const OpenCLRuntime&) = delete;
+  OpenCLRuntime(const OpenCLRuntime &) = delete;
+  OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
 
   void BuildProgram(const std::string &program_file_name,
                     const std::string &binary_file_name,

diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc
index 44eaa47e..473e5fb5 100644
--- a/mace/kernels/opencl/activation_opencl.cc
+++ b/mace/kernels/opencl/activation_opencl.cc
@@ -63,7 +63,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(activation_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
@@ -73,33 +73,66 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     local_ws[2] = std::min<uint32_t>(height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
     return {
-        {local_ws[0], local_ws[1], local_ws[2]},
-        {kwg_size / 16, 4, 4},
-        {kwg_size / 32, 4, 8},
-        {kwg_size / 32, 8, 4},
-        {kwg_size / 64, 8, 8},
-        {kwg_size / 64, 16, 4},
-        {kwg_size / 128, 8, 16},
-        {kwg_size / 128, 16, 8},
-        {kwg_size / 128, 32, 4},
-        {1, kwg_size / 32, 32},
-        {1, kwg_size / 64, 64},
-        {1, kwg_size / 128, 128},
-        {3, 15, 9},
-        {7, 15, 9},
-        {9, 7, 15},
-        {15, 7, 9},
-        {1, kwg_size, 1},
-        {4, 15, 8},  // SNPE size
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            activation_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            activation_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::string tuning_key =
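The same split-and-enqueue rule recurs in every kernel below; distilled into
a standalone helper it looks like the sketch that follows (illustrative only;
SplitDim is not a function in this patch). Note that params.back() stores the
pre-increment divisor, so the replay path recomputes the same remainder block:

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Returns {offset, size} of each block along one global-work dimension.
    std::vector<std::pair<uint32_t, uint32_t>> SplitDim(uint32_t gws_dim,
                                                        uint32_t num_blocks) {
      const uint32_t block_size = gws_dim / num_blocks;  // truncating divide
      if (gws_dim % num_blocks > 0) num_blocks++;        // extra remainder block
      std::vector<std::pair<uint32_t, uint32_t>> blocks;
      for (uint32_t i = 0; i < num_blocks; ++i) {
        uint32_t size =
            (i == num_blocks - 1) ? (gws_dim - i * block_size) : block_size;
        blocks.emplace_back(i * block_size, size);
      }
      return blocks;  // e.g. gws_dim=10, num_blocks=3 -> (0,3)(3,3)(6,3)(9,1)
    }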
diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc
index b4079dc3..946e74cf 100644
--- a/mace/kernels/opencl/addn.cc
+++ b/mace/kernels/opencl/addn.cc
@@ -50,33 +50,66 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
       static_cast<uint32_t>(batch_height_pixels)
   };
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
-  std::vector<uint32_t> lws = {64, 16};
+  std::vector<uint32_t> lws = {64, 16, 1};
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     uint32_t local_ws[2];
     local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
     local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
-    return {{local_ws[0], local_ws[1]},
-            {local_ws[1], local_ws[0]},
-            {kwg_size / 4, 4},
-            {kwg_size / 16, 16},
-            {kwg_size / 32, 32},
-            {kwg_size / 64, 64},
-            {kwg_size / 128, 128},
-            {kwg_size / 256, 256},
-            {kwg_size / 512, 512},
-            {kwg_size, 1},
-            {1, kwg_size}
-    };
+    return {{local_ws[0], local_ws[1], 1},
+            {local_ws[1], local_ws[0], 1},
+            {kwg_size / 4, 4, 1},
+            {kwg_size / 16, 16, 1},
+            {kwg_size / 32, 32, 1},
+            {kwg_size / 64, 64, 1},
+            {kwg_size / 128, 128, 1},
+            {kwg_size / 256, 256, 1},
+            {kwg_size / 512, 512, 1},
+            {kwg_size, 1, 1},
+            {1, kwg_size, 1}
+    };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        addn_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1]),
-        cl::NDRange(params[0], params[1]),
-        nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[1] / num_blocks;
+      if (gws[1] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws1 =
+            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            addn_kernel,
+            cl::NDRange(0, i * block_size),
+            cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]),
+            nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          addn_kernel, cl::NullRange,
+          cl::NDRange(gws[0], gws[1]),
+          cl::NDRange(params[0], params[1]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[1] / num_blocks;
+      if (gws[1] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws1 =
+            (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            addn_kernel,
+            cl::NDRange(0, i * block_size),
+            cl::NDRange(gws[0], gws1),
+            cl::NDRange(params[0], params[1]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::stringstream ss;
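AddN enqueues a 2-D range, so the split runs along gws[1] (the batch*height
axis) with a 2-D offset; the logic is otherwise identical to the 3-D kernels
(illustrative excerpt):

    cl::NDRange offset(0, i * block_size);  // vs. NDRange(0, 0, i * block_size)
    cl::NDRange global(gws[0], gws1);       // only the last dimension is split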
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 2d6c95a3..29a5f2fa 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -83,7 +83,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
@@ -92,33 +92,66 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
     local_ws[2] = std::min<uint32_t>(height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
     return {
-        {local_ws[0], local_ws[1], local_ws[2]},
-        {kwg_size / 16, 4, 4},
-        {kwg_size / 32, 4, 8},
-        {kwg_size / 32, 8, 4},
-        {kwg_size / 64, 8, 8},
-        {kwg_size / 64, 16, 4},
-        {kwg_size / 128, 8, 16},
-        {kwg_size / 128, 16, 8},
-        {kwg_size / 128, 32, 4},
-        {1, kwg_size / 32, 32},
-        {1, kwg_size / 64, 64},
-        {1, kwg_size / 128, 128},
-        {3, 15, 9},
-        {7, 15, 9},
-        {9, 7, 15},
-        {15, 7, 9},
-        {1, kwg_size, 1},
-        {8, 128, 1},  // SNPE size
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {8, 128, 1, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            bm_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            bm_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::string tuning_key =
diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc
index a02b1552..23082529 100644
--- a/mace/kernels/opencl/concat.cc
+++ b/mace/kernels/opencl/concat.cc
@@ -51,42 +51,73 @@ static void Concat2(const Tensor *input0,
       static_cast<uint32_t>(batch * height),
   };
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
-  std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
     local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
     local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
-            {kwg_size / 16, 4, 4},
-            {kwg_size / 32, 4, 8},
-            {kwg_size / 32, 8, 4},
-            {kwg_size / 64, 8, 8},
-            {kwg_size / 64, 16, 4},
-            {kwg_size / 128, 8, 16},
-            {kwg_size / 128, 16, 8},
-            {kwg_size / 128, 32, 4},
-            {1, kwg_size / 32, 32},
-            {1, kwg_size / 64, 64},
-            {1, kwg_size / 128, 128},
-            {3, 15, 9},
-            {7, 15, 9},
-            {9, 7, 15},
-            {15, 7, 9},
-            {1, kwg_size, 1},
-            {4, 15, 8},  //SNPE size
+    return {{local_ws[0], local_ws[1], local_ws[2], 1},
+            {local_ws[2], local_ws[1], local_ws[0], 1},
+            {kwg_size / 16, 4, 4, 1},
+            {kwg_size / 32, 4, 8, 1},
+            {kwg_size / 32, 8, 4, 1},
+            {kwg_size / 64, 8, 8, 1},
+            {kwg_size / 64, 16, 4, 1},
+            {kwg_size / 128, 8, 16, 1},
+            {kwg_size / 128, 16, 8, 1},
+            {kwg_size / 128, 32, 4, 1},
+            {1, kwg_size / 32, 32, 1},
+            {1, kwg_size / 64, 64, 1},
+            {1, kwg_size / 128, 128, 1},
+            {3, 15, 9, 1},
+            {7, 15, 9, 1},
+            {9, 7, 15, 1},
+            {15, 7, 9, 1},
+            {1, kwg_size, 1, 1},
+            {4, 15, 8, 1},  //SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        concat_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]),
-        nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            concat_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          concat_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            concat_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::stringstream ss;
diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
index a8e9192d..e4b4ab93 100644
--- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc
@@ -96,7 +96,7 @@ void Conv1x1(const Tensor *input,
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 15, 8};
+  std::vector<uint32_t> lws = {8, 15, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
@@ -105,33 +105,66 @@ void Conv1x1(const Tensor *input,
     local_ws[2] = std::min<uint32_t>(height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
     return {
-        {local_ws[0], local_ws[1], local_ws[2]},
-        {kwg_size / 16, 4, 4},
-        {kwg_size / 32, 4, 8},
-        {kwg_size / 32, 8, 4},
-        {kwg_size / 64, 8, 8},
-        {kwg_size / 64, 16, 4},
-        {kwg_size / 128, 8, 16},
-        {kwg_size / 128, 16, 8},
-        {kwg_size / 128, 32, 4},
-        {1, kwg_size / 32, 32},
-        {1, kwg_size / 64, 64},
-        {1, kwg_size / 128, 128},
-        {3, 15, 9},
-        {7, 15, 9},
-        {9, 7, 15},
-        {15, 7, 9},
-        {1, kwg_size, 1},
-        {4, 15, 8},  // SNPE size
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::string tuning_key =
diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
index 9779107b..a374ea51 100644
--- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc
@@ -94,7 +94,7 @@ static void Conv2d3x3S12(const Tensor *input,
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {4, 15, 8};
+  std::vector<uint32_t> lws = {4, 15, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
@@ -103,34 +103,66 @@ static void Conv2d3x3S12(const Tensor *input,
     local_ws[2] = std::min<uint32_t>(height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
     return {
-        {local_ws[0], local_ws[1], local_ws[2]},
-        {local_ws[2], local_ws[1], local_ws[0]},
-        {kwg_size / 16, 4, 4},
-        {kwg_size / 32, 4, 8},
-        {kwg_size / 32, 8, 4},
-        {kwg_size / 64, 8, 8},
-        {kwg_size / 64, 16, 4},
-        {kwg_size / 128, 8, 16},
-        {kwg_size / 128, 16, 8},
-        {kwg_size / 128, 32, 4},
-        {1, kwg_size / 32, 32},
-        {1, kwg_size / 64, 64},
-        {1, kwg_size / 128, 128},
-        {3, 15, 9},
-        {7, 15, 9},
-        {9, 7, 15},
-        {15, 7, 9},
-        {1, kwg_size, 1},
-        {4, 15, 8},  // SNPE size
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::string tuning_key =
diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc
index 89295799..d671d4d8 100644
--- a/mace/kernels/opencl/conv_2d_opencl_general.cc
+++ b/mace/kernels/opencl/conv_2d_opencl_general.cc
@@ -96,7 +96,7 @@ void Conv2dOpencl(const Tensor *input,
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width_blocks),
                            static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
@@ -105,34 +105,66 @@ void Conv2dOpencl(const Tensor *input,
     local_ws[2] = std::min<uint32_t>(height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
     return {
-        {local_ws[0], local_ws[1], local_ws[2]},
-        {local_ws[2], local_ws[1], local_ws[0]},
-        {kwg_size / 16, 4, 4},
-        {kwg_size / 32, 4, 8},
-        {kwg_size / 32, 8, 4},
-        {kwg_size / 64, 8, 8},
-        {kwg_size / 64, 16, 4},
-        {kwg_size / 128, 8, 16},
-        {kwg_size / 128, 16, 8},
-        {kwg_size / 128, 32, 4},
-        {1, kwg_size / 32, 32},
-        {1, kwg_size / 64, 64},
-        {1, kwg_size / 128, 128},
-        {3, 15, 9},
-        {7, 15, 9},
-        {9, 7, 15},
-        {15, 7, 9},
-        {1, kwg_size, 1},
-        {4, 15, 8},  // SNPE size
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            conv_2d_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::string tuning_key =
diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h
index 2927dbff..34e787a8 100644
--- a/mace/kernels/opencl/helper.h
+++ b/mace/kernels/opencl/helper.h
@@ -14,6 +14,8 @@ namespace mace {
 namespace kernels {
 
+const float kMaxKernelExeTime = 1000.0;  // microseconds
+
 enum BufferType {
   FILTER = 0,
   IN_OUT= 1,
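The 1000 µs budget converts one measured single-shot time into a block count
by truncating integer division, so a kernel already under 1 ms keeps a single
block and stays unsplit. A worked example (values illustrative; gws2_dim is a
hypothetical stand-in for gws[2]):

    // Measured 3400 us: 3400 / 1000 truncates to 3, +1 => 4 blocks,
    // each covering roughly gws[2] / 4 of the last global dimension.
    double elapse_time = 3400.0;  // microseconds
    uint32_t num_blocks = std::min<uint32_t>(
        static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws2_dim);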
diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc
index 79a6f102..194ee133 100644
--- a/mace/kernels/opencl/pooling_opencl.cc
+++ b/mace/kernels/opencl/pooling_opencl.cc
@@ -60,7 +60,7 @@ static void Pooling(const Tensor *input,
       static_cast<uint32_t>(batch * out_height),
   };
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
-  std::vector<uint32_t> lws(3, 0);
+  std::vector<uint32_t> lws(4, 1);
   lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
   lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
   lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
@@ -69,35 +69,67 @@ static void Pooling(const Tensor *input,
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(out_height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2]},
-            {kwg_size / 16, 4, 4},
-            {kwg_size / 32, 4, 8},
-            {kwg_size / 32, 8, 4},
-            {kwg_size / 64, 8, 8},
-            {kwg_size / 64, 16, 4},
-            {kwg_size / 128, 8, 16},
-            {kwg_size / 128, 16, 8},
-            {kwg_size / 128, 32, 4},
-            {1, kwg_size / 32, 32},
-            {1, kwg_size / 64, 64},
-            {1, kwg_size / 128, 128},
-            {3, 15, 9},
-            {7, 15, 9},
-            {9, 7, 15},
-            {15, 7, 9},
-            {1, kwg_size, 1},
-            {4, 15, 8},  //SNPE size
+    return {
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        pooling_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]),
-        nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            pooling_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            pooling_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::stringstream ss;
diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc
index dc0d8cd0..0ad87eea 100644
--- a/mace/kernels/opencl/resize_bilinear_opencl.cc
+++ b/mace/kernels/opencl/resize_bilinear_opencl.cc
@@ -59,38 +59,74 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(out_width),
                            static_cast<uint32_t>(out_height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(out_height * batch,
                                      kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2]},
-            {kwg_size / 16, 4, 4},
-            {kwg_size / 32, 4, 8},
-            {kwg_size / 32, 8, 4},
-            {kwg_size / 64, 8, 8},
-            {kwg_size / 64, 16, 4},
-            {kwg_size / 128, 8, 16},
-            {kwg_size / 128, 16, 8},
-            {kwg_size / 128, 32, 4},
-            {1, kwg_size / 32, 32},
-            {1, kwg_size / 64, 64},
-            {1, kwg_size / 128, 128},
-            {1, kwg_size, 1},
-            {4, 15, 8},  //SNPE size
+    return {
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
     };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        rb_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]),
-        nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            rb_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          rb_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            rb_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::stringstream ss;
diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc
index bfc75e73..ca9c5fdb 100644
--- a/mace/kernels/opencl/softmax_opencl.cc
+++ b/mace/kernels/opencl/softmax_opencl.cc
@@ -41,42 +41,74 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
   const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
                            static_cast<uint32_t>(width),
                            static_cast<uint32_t>(height * batch)};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
     local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
     local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
    local_ws[2] = std::min<uint32_t>(height * batch,
                                     kwg_size / (local_ws[0] * local_ws[1]));
-    return {{4, 15, 8},  //SNPE size
-            {local_ws[0], local_ws[1], local_ws[2]},
-            {local_ws[2], local_ws[1], local_ws[0]},
-            {kwg_size / 16, 4, 4},
-            {kwg_size / 32, 4, 8},
-            {kwg_size / 32, 8, 4},
-            {kwg_size / 64, 8, 8},
-            {kwg_size / 64, 16, 4},
-            {kwg_size / 128, 8, 16},
-            {kwg_size / 128, 16, 8},
-            {kwg_size / 128, 32, 4},
-            {1, kwg_size / 32, 32},
-            {1, kwg_size / 64, 64},
-            {1, kwg_size / 128, 128},
-            {3, 15, 9},
-            {7, 15, 9},
-            {9, 7, 15},
-            {15, 7, 9},
-            {1, kwg_size, 1}};
+    return {
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
+    };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        softmax_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]),
-        nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            softmax_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          softmax_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            softmax_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::stringstream ss;
diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc
index 1fd5bf1a..cf4762fc 100644
--- a/mace/kernels/opencl/space_to_batch_opencl.cc
+++ b/mace/kernels/opencl/space_to_batch_opencl.cc
@@ -61,36 +61,74 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor,
   const uint32_t gws[3] = {chan_blk,
                            static_cast<uint32_t>(batch_tensor->dim(2)),
                            static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
-  const std::vector<uint32_t> lws = {8, 16, 8};
+  std::vector<uint32_t> lws = {8, 16, 8, 1};
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(s2b_kernel);
   auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
     std::vector<uint32_t> local_ws(3, 0);
     local_ws[0] = std::min<uint32_t>(chan_blk, kwg_size);
     local_ws[1] = std::min<uint32_t>(32, kwg_size / local_ws[0]);
     local_ws[2] = std::min<uint32_t>(32, kwg_size / (local_ws[0] * local_ws[1]));
-    return {{local_ws[0], local_ws[1], local_ws[2]},
-            {4, 32, 8},
-            {4, 64, 4},
-            {4, 128, 2},
-            {8, 16, 8},
-            {8, 32, 4},
-            {8, 64, 2},
-            {16, 8, 8},
-            {16, 16, 4},
-            {16, 32, 2},
-            {32, 8, 4},
-            {32, 16, 2},
-            {64, 4, 4}};
+    return {
+        {local_ws[0], local_ws[1], local_ws[2], 1},
+        {kwg_size / 16, 4, 4, 1},
+        {kwg_size / 32, 4, 8, 1},
+        {kwg_size / 32, 8, 4, 1},
+        {kwg_size / 64, 8, 8, 1},
+        {kwg_size / 64, 16, 4, 1},
+        {kwg_size / 128, 8, 16, 1},
+        {kwg_size / 128, 16, 8, 1},
+        {kwg_size / 128, 32, 4, 1},
+        {1, kwg_size / 32, 32, 1},
+        {1, kwg_size / 64, 64, 1},
+        {1, kwg_size / 128, 128, 1},
+        {3, 15, 9, 1},
+        {7, 15, 9, 1},
+        {9, 7, 15, 1},
+        {15, 7, 9, 1},
+        {1, kwg_size, 1, 1},
+        {4, 15, 8, 1},  // SNPE size
+    };
   };
   cl::Event event;
-  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
-    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-        s2b_kernel, cl::NullRange,
-        cl::NDRange(gws[0], gws[1], gws[2]),
-        cl::NDRange(params[0], params[1], params[2]),
-        nullptr, &event);
-
-    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
+    cl_int error = CL_SUCCESS;
+    if (timer == nullptr) {
+      uint32_t num_blocks = params.back();
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            s2b_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      }
+    } else {
+      timer->StartTiming();
+      error = runtime->command_queue().enqueueNDRangeKernel(
+          s2b_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
+          cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+      timer->StopTiming();
+      double elapse_time = timer->ElapsedMicros();
+      timer->ClearTiming();
+      uint32_t num_blocks = std::min<uint32_t>(
+          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
+      params.back() = num_blocks;
+      const uint32_t block_size = gws[2] / num_blocks;
+      if (gws[2] % num_blocks > 0) num_blocks++;
+      for (uint32_t i = 0; i < num_blocks; ++i) {
+        uint32_t gws2 =
+            (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
+        error = runtime->command_queue().enqueueNDRangeKernel(
+            s2b_kernel,
+            cl::NDRange(0, 0, i * block_size),
+            cl::NDRange(gws[0], gws[1], gws2),
+            cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
+        MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
+        timer->AccumulateTiming();
+      }
+    }
     return error;
   };
   std::stringstream ss;
diff --git a/mace/utils/timer.h b/mace/utils/timer.h
index cee4411e..ca0c2b3c 100644
--- a/mace/utils/timer.h
+++ b/mace/utils/timer.h
@@ -10,29 +10,50 @@ namespace mace {
 
 class Timer {
- public:
-  virtual void StartTiming() = 0;
-  virtual void StopTiming() = 0;
-  virtual double ElapsedMicros() = 0;
+ public:
+  virtual void StartTiming() = 0;
+  virtual void StopTiming() = 0;
+  virtual void AccumulateTiming() = 0;
+  virtual void ClearTiming() = 0;
+  virtual double ElapsedMicros() = 0;
+  virtual double AccumulatedMicros() = 0;
 };
 
 class WallClockTimer : public Timer {
- public:
-  void StartTiming() override {
-    start_micros_ = mace::utils::NowMicros();
-  }
-
-  void StopTiming() override {
-    stop_micros_ = mace::utils::NowMicros();
-  }
-
-  double ElapsedMicros() override {
-    return stop_micros_ - start_micros_;
-  }
-
- private:
-  double start_micros_;
-  double stop_micros_;
+ public:
+  WallClockTimer() : accumulated_micros_(0) {}
+
+  void StartTiming() override {
+    start_micros_ = mace::utils::NowMicros();
+  }
+
+  void StopTiming() override {
+    stop_micros_ = mace::utils::NowMicros();
+  }
+
+  void AccumulateTiming() override {
+    StopTiming();
+    accumulated_micros_ += stop_micros_ - start_micros_;
+  }
+
+  void ClearTiming() override {
+    start_micros_ = 0;
+    stop_micros_ = 0;
+    accumulated_micros_ = 0;
+  }
+
+  double ElapsedMicros() override {
+    return stop_micros_ - start_micros_;
+  }
+
+  double AccumulatedMicros() override {
+    return accumulated_micros_;
+  }
+
+ private:
+  double start_micros_;
+  double stop_micros_;
+  double accumulated_micros_;
 };
 
 }  // namespace mace
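One subtlety worth noting: WallClockTimer::AccumulateTiming() measures from
the last StartTiming() call, so a caller must restart the clock before each
accumulation; the OpenCL profiling timer presumably derives both timestamps
from the cl::Event instead, which is why the kernel loops above can call
AccumulateTiming() alone per block. Wall-clock usage sketch (illustrative;
DoWork is hypothetical):

    WallClockTimer timer;
    timer.ClearTiming();
    for (int i = 0; i < n; ++i) {
      timer.StartTiming();       // required before each accumulation
      DoWork(i);
      timer.AccumulateTiming();  // adds stop_micros_ - start_micros_
    }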
+ + private: + double start_micros_; + double stop_micros_; + double accumulated_micros_; }; } // namespace mace diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h index b7364e66..e2797fa9 100644 --- a/mace/utils/tuner.h +++ b/mace/utils/tuner.h @@ -41,10 +41,10 @@ class Tuner { template RetType TuneOrRun( const std::string param_key, - const std::vector &default_param, + std::vector &default_param, const std::function>()> ¶m_generator, - const std::function &)> &func, + const std::function &, Timer *)> &func, Timer *timer) { std::string obfucated_param_key = MACE_OBFUSCATE_SYMBOL(param_key); if (IsTuning() && param_generator != nullptr) { @@ -60,12 +60,12 @@ class Tuner { if (param_table_.find(obfucated_param_key) != param_table_.end()) { VLOG(1) << param_key << ": " << internal::MakeString(param_table_[obfucated_param_key]); - return func(param_table_[obfucated_param_key]); + return func(param_table_[obfucated_param_key], nullptr); } else { #ifndef MACE_DISABLE_NO_TUNING_WARNING LOG(WARNING) << "Fallback to default parameter: " << param_key; #endif - return func(default_param); + return func(default_param, nullptr); } } } @@ -119,18 +119,16 @@ class Tuner { template inline RetType Run( - const std::function &)> &func, - const std::vector ¶ms, + const std::function &, Timer *)> &func, + std::vector ¶ms, Timer *timer, int num_runs, double *time_us) { RetType res; int64_t total_time_us = 0; for (int i = 0; i < num_runs; ++i) { - timer->StartTiming(); - res = func(params); - timer->StopTiming(); - total_time_us += timer->ElapsedMicros(); + res = func(params, timer); + total_time_us += timer->AccumulatedMicros(); } *time_us = total_time_us * 1.0 / num_runs; @@ -141,13 +139,13 @@ class Tuner { inline RetType Tune( const std::function>()> ¶m_generator, - const std::function &)> &func, + const std::function &, Timer *)> &func, Timer *timer, std::vector *opt_params) { RetType res; double opt_time = std::numeric_limits::max(); auto params = param_generator(); - for (const auto ¶m : params) { + for (auto param : params) { double tmp_time = 0.0; // warm up Run(func, param, timer, 2, &tmp_time); -- GitLab