diff --git a/mace/kernels/BUILD b/mace/kernels/BUILD
index b6711a0f8f8dcc7b3bc9a885b578703b93f5f8b0..e002b1418576be3ddbb6f7feaa3584dff18a17ae 100644
--- a/mace/kernels/BUILD
+++ b/mace/kernels/BUILD
@@ -22,6 +22,7 @@ cc_library(
         "//mace/core",
         "//mace/core:opencl_runtime",
         "//mace/utils",
+        "//mace/utils:tuner",
     ],
 )
 
diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc
index 4d0771f15819e71496100e441799801f35b191f2..23bedc6eb30a2753ba67c966c8102385dd293dd0 100644
--- a/mace/kernels/opencl/batch_norm_opencl.cc
+++ b/mace/kernels/opencl/batch_norm_opencl.cc
@@ -5,6 +5,7 @@
 #include "mace/kernels/batch_norm.h"
 #include "mace/core/runtime/opencl/cl2.hpp"
 #include "mace/core/runtime/opencl/opencl_runtime.h"
+#include "mace/utils/tuner.h"
 
 namespace mace {
 namespace kernels {
@@ -29,7 +30,7 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
   auto bm_kernel = cl::Kernel(program, "batch_norm");
 
   const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
-  const uint32_t lws[3] = {1, kwg_size/128, 128};
+  const std::vector<uint32_t> lws = {1, 1, kwg_size};
 
   uint32_t idx = 0;
   bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
@@ -43,11 +44,35 @@
   bm_kernel.setArg(idx++, lws[1] * sizeof(float), nullptr);
   bm_kernel.setArg(idx++, lws[1] * sizeof(float), nullptr);
 
-  cl_int error = runtime->command_queue().enqueueNDRangeKernel(
-      bm_kernel, cl::NullRange,
-      cl::NDRange(gws[0], gws[1], gws[2]),
-      cl::NDRange(lws[0], lws[1], lws[2]));
-  MACE_CHECK(error == CL_SUCCESS);
+  // Candidate local work-group shapes to explore while tuning.
+  auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
+    return {{1, 1, 64},
+            {1, 1, 128},
+            {1, kwg_size/16, 16},
+            {1, kwg_size/32, 32},
+            {1, kwg_size/64, 64},
+            {1, kwg_size/128, 128},
+            {1, 1, kwg_size},
+            {1, kwg_size, 1}};
+  };
+  auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
+    cl_int error = runtime->command_queue().enqueueNDRangeKernel(
+        bm_kernel, cl::NullRange,
+        cl::NDRange(gws[0], gws[1], gws[2]),
+        cl::NDRange(params[0], params[1], params[2]));
+
+    MACE_CHECK(error == CL_SUCCESS);
+    return error;
+  };
+  // Key the tuned parameters by op name + input shape.
+  std::stringstream ss;
+  ss << "batch_norm_opencl_kernel_"
+     << input->dim(0) << "_"
+     << input->dim(1) << "_"
+     << input->dim(2) << "_"
+     << input->dim(3);
+  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
+                                                     lws,
+                                                     params_generator,
+                                                     func);
 }
 
 }  // namespace kernels
diff --git a/mace/ops/batch_norm_benchmark.cc b/mace/ops/batch_norm_benchmark.cc
index 3d17aca7ec153060629a551e4eba3829c22d4529..c1ac84ef60e7c89ab2042f3815f18f1fbaf63da4 100644
--- a/mace/ops/batch_norm_benchmark.cc
+++ b/mace/ops/batch_norm_benchmark.cc
@@ -31,17 +31,22 @@ static void BatchNorm(
   net.AddRandomInput<D, float>("Var", {channels}, true);
   net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
 
+  // tuning
+  setenv("MACE_TUNING", "1", 1);
+  net.RunOp(D);
+  unsetenv("MACE_TUNING");
+
   // Warm-up
   for (int i = 0; i < 5; ++i) {
     net.RunOp(D);
-    net.Sync();
   }
+  net.Sync();
 
   mace::testing::StartTiming();
   while (iters--) {
     net.RunOp(D);
-    net.Sync();
   }
+  net.Sync();
 }
 
@@ -56,7 +61,7 @@ static void BatchNorm(
 
 #define BM_BATCH_NORM(N, C, H, W, TYPE)                \
   BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, CPU);          \
-  BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, NEON);         \
+  BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, NEON);\
   BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, OPENCL);
 
 BM_BATCH_NORM(1, 1, 512, 512, float);
diff --git a/mace/ops/batch_norm_test.cc b/mace/ops/batch_norm_test.cc
index 4c5d73bbe3981ec2af972a5c370ec2794c22ac2d..39b9ff1c9358fd07ec3c775ecced3b1c8ffd0228 100644
--- a/mace/ops/batch_norm_test.cc
+++ b/mace/ops/batch_norm_test.cc
@@ -165,7 +165,12 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
   net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
   net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
 
-  // Run NEON
+  // tuning
+  setenv("MACE_TUNING", "1", 1);
+  net.RunOp(DeviceType::OPENCL);
+  unsetenv("MACE_TUNING");
+
+  // Run on opencl
   net.RunOp(DeviceType::OPENCL);
 
   // Check
@@ -206,7 +211,12 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
   net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
   net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
 
-  // Run NEON
+  // tuning
+  setenv("MACE_TUNING", "1", 1);
+  net.RunOp(DeviceType::OPENCL);
+  unsetenv("MACE_TUNING");
+
+  // Run on opencl
   net.RunOp(DeviceType::OPENCL);
   net.Sync();
 
diff --git a/mace/ops/ops_test_util.h b/mace/ops/ops_test_util.h
index 678f855fc7b98b5c7ec66f13e07e75ab121e067e..4abde486559c3140820d5d144303d4b72ee73b82 100644
--- a/mace/ops/ops_test_util.h
+++ b/mace/ops/ops_test_util.h
@@ -137,13 +137,11 @@ class OpsTestNet {
   Workspace *ws() { return &ws_; }
 
   bool RunOp(DeviceType device) {
-    if (!net_ || device_ != device) {
-      NetDef net_def;
-      net_def.add_op()->CopyFrom(op_def_);
-      VLOG(3) << net_def.DebugString();
-      net_ = CreateNet(net_def, &ws_, device);
-      device_ = device;
-    }
+    NetDef net_def;
+    net_def.add_op()->CopyFrom(op_def_);
+    VLOG(3) << net_def.DebugString();
+    net_ = CreateNet(net_def, &ws_, device);
+    device_ = device;
     return net_->Run();
   }
 
@@ -154,7 +152,7 @@ class OpsTestNet {
   }
 
   void Sync() {
-    if (net_) {
+    if (net_ && device_ == DeviceType::OPENCL) {
       OpenCLRuntime::Get()->command_queue().finish();
     }
   }
diff --git a/mace/utils/BUILD b/mace/utils/BUILD
index 06e2ccc490aef5f1a75920bd7cb0afeb9172f64c..50f65f4e9010686de01f97298ba7ef75d7abcdad 100644
--- a/mace/utils/BUILD
+++ b/mace/utils/BUILD
@@ -7,6 +7,8 @@ package(
 
 licenses(["notice"])  # Apache 2.0
 
+load("//mace:mace.bzl", "if_android")
+
 cc_library(
     name = "command_line_flags",
     srcs = [
@@ -28,3 +30,30 @@ cc_library(
     ],
     copts = ["-std=c++11"],
 )
+
+cc_library(
+    name = "tuner",
+    hdrs = [
+        "tuner.h",
+    ],
+    copts = ["-std=c++11"],
+    deps = [
+        "//mace/core",
+    ],
+)
+
+cc_test(
+    name = "tuner_test",
+    testonly = 1,
+    srcs = [
+        "tuner_test.cc",
+    ],
+    copts = ["-std=c++11"],
+    linkopts = if_android(["-lm", "-ldl"]),
+    linkstatic = 1,
+    deps = [
+        ":tuner",
+        "@gtest//:gtest",
+        "@gtest//:gtest_main",
+    ],
+)
diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h
new file mode 100644
index 0000000000000000000000000000000000000000..de96f87e27402ba757d3cc713a7b0146c7d5bb49
--- /dev/null
+++ b/mace/utils/tuner.h
@@ -0,0 +1,177 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+
+#ifndef MACE_UTILS_TUNER_H_
+#define MACE_UTILS_TUNER_H_
+
+#include <stdlib.h>
+#include <cstring>
+#include <fstream>
+#include <functional>
+#include <limits>
+#include <unordered_map>
+#include <vector>
+
+#include "mace/core/logging.h"
+#include "mace/utils/utils.h"
+#include "mace/core/runtime/opencl/opencl_runtime.h"
+
+namespace mace {
+
+// Process-wide singleton that either tunes a parameterized kernel (when
+// MACE_TUNING=1) or runs it with previously-tuned parameters loaded from
+// the file named by MACE_RUN_PARAMETER_PATH.
+template <typename param_type>
+class Tuner {
+ public:
+  static Tuner *Get() {
+    static Tuner tuner;
+    return &tuner;
+  }
+
+  // Tune `func` over `param_generator()` candidates (tuning mode) or run it
+  // with the cached/default parameters (normal mode). Returns func's result.
+  template <typename RetType>
+  RetType TuneOrRun(const std::string param_key,
+                    const std::vector<param_type> &default_param,
+                    const std::function<std::vector<std::vector<param_type>>()> &param_generator,
+                    const std::function<RetType(const std::vector<param_type> &)> &func) {
+    if (IsTuning()) {
+      // tune
+      std::vector<param_type> opt_param = default_param;
+      RetType res = Tune<RetType>(param_generator, func, opt_param);
+      param_table_[param_key] = opt_param;
+      return res;
+    } else {
+      // run
+      if (param_table_.find(param_key) != param_table_.end()) {
+        return func(param_table_[param_key]);
+      } else {
+        return func(default_param);
+      }
+    }
+  }
+
+ private:
+  Tuner() {
+    path_ = getenv("MACE_RUN_PARAMETER_PATH");
+    ReadRunParamters();
+  }
+
+  ~Tuner() {
+    WriteRunParameters();
+  }
+
+  Tuner(const Tuner &) = delete;
+  Tuner &operator=(const Tuner &) = delete;
+
+  inline bool IsTuning() {
+    const char *tuning = getenv("MACE_TUNING");
+    return tuning != nullptr && strlen(tuning) == 1 && tuning[0] == '1';
+  }
+
+  // Persist param_table_ as: num_params, then per entry
+  // {key_size, key bytes, params byte size, raw params}.
+  inline void WriteRunParameters() {
+    if (path_ != nullptr) {
+      VLOG(0) << path_;
+      std::ofstream ofs(path_, std::ios::binary | std::ios::out);
+      if (ofs.is_open()) {
+        size_t num_params = param_table_.size();
+        ofs.write(reinterpret_cast<char *>(&num_params), sizeof(num_params));
+        for (auto &kp : param_table_) {
+          int32_t key_size = kp.first.size();
+          ofs.write(reinterpret_cast<char *>(&key_size), sizeof(key_size));
+          ofs.write(kp.first.c_str(), key_size);
+          VLOG(0) << kp.first.c_str();
+
+          auto &params = kp.second;
+          int32_t params_size = params.size() * sizeof(param_type);
+          ofs.write(reinterpret_cast<char *>(&params_size), sizeof(params_size));
+          for (auto &param : params) {
+            // NOTE: write sizeof(param_type) bytes per param so the layout
+            // matches what ReadRunParamters() expects.
+            ofs.write(reinterpret_cast<char *>(&param), sizeof(param_type));
+            VLOG(0) << param;
+          }
+        }
+        ofs.close();
+      } else {
+        LOG(WARNING) << "Write run parameter file failed.";
+      }
+    }
+  }
+
+  // Inverse of WriteRunParameters(); silently skips when no path is set.
+  inline void ReadRunParamters() {
+    if (path_ != nullptr) {
+      std::ifstream ifs(path_, std::ios::binary | std::ios::in);
+      if (ifs.is_open()) {
+        int32_t key_size = 0;
+        int32_t params_size = 0;
+        int32_t params_count = 0;
+        size_t num_params = 0;
+        ifs.read(reinterpret_cast<char *>(&num_params), sizeof(num_params));
+        while (num_params--) {
+          ifs.read(reinterpret_cast<char *>(&key_size), sizeof(key_size));
+          std::string key(key_size, ' ');
+          ifs.read(&key[0], key_size);
+
+          ifs.read(reinterpret_cast<char *>(&params_size), sizeof(params_size));
+          params_count = params_size / sizeof(param_type);
+          std::vector<param_type> params(params_count);
+          for (int i = 0; i < params_count; ++i) {
+            ifs.read(reinterpret_cast<char *>(&params[i]), sizeof(param_type));
+          }
+          param_table_.emplace(key, params);
+        }
+        ifs.close();
+      } else {
+        LOG(WARNING) << "Read run parameter file failed.";
+      }
+    }
+  }
+
+  // Run func `num_runs` times, report mean wall time in `time_us`.
+  template <typename RetType>
+  inline RetType Run(const std::function<RetType(const std::vector<param_type> &)> &func,
+                     const std::vector<param_type> &params,
+                     int num_runs,
+                     double &time_us) {
+    RetType res;
+    int64_t total_time_us = 0;
+    const int64_t start_time = NowInMicroSec();
+    for (int i = 0; i < num_runs; ++i) {
+      res = func(params);
+    }
+    OpenCLRuntime::Get()->command_queue().finish();
+    const int64_t end_time = NowInMicroSec();
+    total_time_us += end_time - start_time;
+
+    time_us = total_time_us * 1.0 / num_runs;
+    return res;
+  }
+
+  // Exhaustively time every candidate and keep the fastest in `opt_params`.
+  template <typename RetType>
+  inline RetType Tune(std::function<std::vector<std::vector<param_type>>()> param_generator,
+                      const std::function<RetType(const std::vector<param_type> &)> &func,
+                      std::vector<param_type> &opt_params) {
+    RetType res;
+    double opt_time = std::numeric_limits<double>::max();
+    auto params = param_generator();
+    for (const auto &param : params) {
+      double tmp_time = 0.0;
+      // warm up
+      Run<RetType>(func, param, 2, tmp_time);
+
+      // run
+      RetType tmp_res = Run<RetType>(func, param, 10, tmp_time);
+
+      // Check the execution time
+      if (tmp_time < opt_time) {
+        opt_time = tmp_time;
+        opt_params = param;
+        res = tmp_res;
+      }
+    }
+    return res;
+  }
+
+ private:
+  const char *path_;
+  std::unordered_map<std::string, std::vector<param_type>> param_table_;
+};
+
+}  // namespace mace
+#endif  // MACE_UTILS_TUNER_H_
diff --git a/mace/utils/tuner_test.cc b/mace/utils/tuner_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bcb5c620f3b553d3d6f8572fd88573d159d0a6cd
--- /dev/null
+++ b/mace/utils/tuner_test.cc
@@ -0,0 +1,64 @@
+//
+// Copyright (c) 2017 XiaoMi All rights reserved.
+//
+#include <thread>
+
+#include "gtest/gtest.h"
+
+#include "mace/utils/tuner.h"
+
+namespace mace {
+
+class TunerTest : public ::testing::Test {
+ protected:
+  virtual void SetUp() {
+    remove("/data/local/tmp/mace.config");
+    // Must match the variable Tuner's constructor reads.
+    setenv("MACE_RUN_PARAMETER_PATH", "/data/local/tmp/mace.config", 1);
+  }
+};
+
+TEST_F(TunerTest, SimpleRun) {
+  int expect = 1;
+  auto TunerFunc = [&](const std::vector<unsigned int> &params) -> int {
+    if (params.front() == 1) {
+      return expect;
+    } else {
+      return expect + 1;
+    }
+  };
+
+  std::vector<unsigned int> default_params(1, 1);
+  int res = Tuner<unsigned int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc);
+
+  EXPECT_EQ(expect, res);
+
+  default_params[0] = 2;
+  res = Tuner<unsigned int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc);
+  EXPECT_EQ(expect + 1, res);
+}
+
+TEST_F(TunerTest, SimpleTune) {
+  int expect = 3;
+  auto TunerFunc = [&](const std::vector<unsigned int> &params) -> int {
+    if (params.front() == expect) {
+      return expect;
+    } else {
+      std::this_thread::sleep_for(std::chrono::milliseconds(10));
+      return params.front();
+    }
+  };
+
+  std::vector<unsigned int> default_params(1, 1);
+  auto params_generator = []() -> std::vector<std::vector<unsigned int>> {
+    return {{1}, {2}, {3}, {4}};
+  };
+  // tune
+  int res = Tuner<unsigned int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, *params_generator, TunerFunc);
+  EXPECT_EQ(expect, res);
+
+  // run
+  res = Tuner<unsigned int>::Get()->template TuneOrRun<int>("SimpleRun", default_params, nullptr, TunerFunc);
+  EXPECT_EQ(expect, res);
+}
+
+}  // namespace mace