diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 4f24249d448409feabda8f002ed3beae491146ed..23bedc6eb30a2753ba67c966c8102385dd293dd0 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -44,42 +44,25 @@ void BatchNormFunctor::operator()( bm_kernel.setArg(idx++, lws[1] * sizeof(float), nullptr); bm_kernel.setArg(idx++, lws[1] * sizeof(float), nullptr); - std::function>()> params_generator = nullptr; - std::function& params)> func; - if (Tuning()) { - params_generator = [&kwg_size]()->std::vector> { - return {{1, 1, 64}, - {1, 1, 128}, - {1, kwg_size/32, 32}, - {1, kwg_size/64, 64}, - {1, kwg_size/128, 128}, - {1, 1, kwg_size}, - {1, kwg_size, 1}}; - }; - func = [&](const std::vector& params)->cl_int { - cl::Event event; - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - bm_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2]), - nullptr, - &event); + auto params_generator = [&kwg_size]()->std::vector> { + return {{1, 1, 64}, + {1, 1, 128}, + {1, kwg_size/16, 16}, + {1, kwg_size/32, 32}, + {1, kwg_size/64, 64}, + {1, kwg_size/128, 128}, + {1, 1, kwg_size}, + {1, kwg_size, 1}}; + }; + auto func = [&](const std::vector& params)->cl_int { + cl_int error = runtime->command_queue().enqueueNDRangeKernel( + bm_kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2])); - MACE_CHECK(error == CL_SUCCESS); - event.wait(); - return error; - }; - } else { - func = [&](const std::vector& params)->cl_int { - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - bm_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(params[0], params[1], params[2])); - - MACE_CHECK(error == CL_SUCCESS); - return error; - }; - } + MACE_CHECK(error == CL_SUCCESS); + return error; + }; std::stringstream ss; ss << "batch_norm_opencl_kernel_" << 
input->dim(0) << "_" diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h index 79a3cf3e49ca56030173b97398c5909e08cd633f..de96f87e27402ba757d3cc713a7b0146c7d5bb49 100644 --- a/mace/utils/tuner.h +++ b/mace/utils/tuner.h @@ -10,18 +10,14 @@ #include #include #include -#include #include #include "mace/core/logging.h" #include "mace/utils/utils.h" +#include "mace/core/runtime/opencl/opencl_runtime.h" namespace mace { -bool Tuning() { - const char *tuning = getenv("MACE_TUNING"); - return tuning != nullptr && tuning[0] == '1'; -} template class Tuner { @@ -34,22 +30,22 @@ class Tuner { template RetType TuneOrRun(const std::string param_key, const std::vector &default_param, - const std::function>()> param_generator, - const std::function &)>& func) { + const std::function>()> &param_generator, + const std::function &)> &func) { - if (param_generator == nullptr) { + if (IsTuning()) { + // tune + std::vector opt_param = default_param; + RetType res = Tune(param_generator, func, opt_param); + param_table_[param_key] = opt_param; + return res; + } else { // run if (param_table_.find(param_key) != param_table_.end()) { return func(param_table_[param_key]); } else { return func(default_param); } - } else { - // tune - std::vector opt_param = default_param; - RetType res = Tune(param_generator, func, opt_param); - param_table_[param_key] = opt_param; - return res; } } @@ -66,6 +62,11 @@ class Tuner { Tuner(const Tuner&) = delete; Tuner& operator=(const Tuner&) = delete; + inline bool IsTuning() { + const char *tuning = getenv("MACE_TUNING"); + return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1'; + } + inline void WriteRunParameters() { VLOG(0) << path_; if (path_ != nullptr) { @@ -127,24 +128,18 @@ class Tuner { inline RetType Run(const std::function &)> &func, const std::vector &params, int num_runs, - int64_t sleep_millisecond, double &time_us) { RetType res; int64_t total_time_us = 0; - int64_t actual_num_runs = 0; - bool util_max_time = (num_runs <= 0); - for (int 
i = 0; util_max_time || i < num_runs; ++i) { - const int64_t start_time = NowInMicroSec(); + const int64_t start_time = NowInMicroSec(); + for (int i = 0; i < num_runs; ++i) { res = func(params); - const int64_t end_time = NowInMicroSec(); - total_time_us += end_time - start_time; - ++(actual_num_runs); - - if (sleep_millisecond > 0) { - std::this_thread::sleep_for(std::chrono::milliseconds(sleep_millisecond)); - } } - time_us = total_time_us * 1.0 / actual_num_runs; + OpenCLRuntime::Get()->command_queue().finish(); + const int64_t end_time = NowInMicroSec(); + total_time_us += end_time - start_time; + + time_us = total_time_us * 1.0 / num_runs; return res; } @@ -158,10 +153,10 @@ class Tuner { for (const auto &param: params) { double tmp_time = 0.0; // warm up - Run(func, param, 2, 10, tmp_time); + Run(func, param, 2, tmp_time); // run - RetType tmp_res = Run(func, param, 10, 10, tmp_time); + RetType tmp_res = Run(func, param, 10, tmp_time); // Check the execution time if (tmp_time < opt_time) { diff --git a/mace/utils/tuner_test.cc b/mace/utils/tuner_test.cc index 10982415fddbc620105b7291babd93d6454ebb5c..bcb5c620f3b553d3d6f8572fd88573d159d0a6cd 100644 --- a/mace/utils/tuner_test.cc +++ b/mace/utils/tuner_test.cc @@ -1,6 +1,7 @@ // // Copyright (c) 2017 XiaoMi All rights reserved. // +#include #include "gtest/gtest.h"