提交 5e103649 编写于 作者: 刘琦

Merge branch 'batch_norm_opencl' into 'master'

Add auto-tuning.

See merge request !82
......@@ -22,6 +22,7 @@ cc_library(
"//mace/core",
"//mace/core:opencl_runtime",
"//mace/utils",
"//mace/utils:tuner",
],
)
......
......@@ -5,6 +5,7 @@
#include "mace/kernels/batch_norm.h"
#include "mace/core/runtime/opencl/cl2.hpp"
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
......@@ -29,7 +30,7 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
auto bm_kernel = cl::Kernel(program, "batch_norm");
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
const uint32_t lws[3] = {1, kwg_size/128, 128};
const std::vector<uint32_t> lws = {1, 1, kwg_size};
uint32_t idx = 0;
bm_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
......@@ -43,11 +44,35 @@ void BatchNormFunctor<DeviceType::OPENCL, float>::operator()(
bm_kernel.setArg(idx++, lws[1] * sizeof(float), nullptr);
bm_kernel.setArg(idx++, lws[1] * sizeof(float), nullptr);
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]));
MACE_CHECK(error == CL_SUCCESS);
auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
return {{1, 1, 64},
{1, 1, 128},
{1, kwg_size/16, 16},
{1, kwg_size/32, 32},
{1, kwg_size/64, 64},
{1, kwg_size/128, 128},
{1, 1, kwg_size},
{1, kwg_size, 1}};
};
auto func = [&](const std::vector<uint32_t>& params)->cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]));
MACE_CHECK(error == CL_SUCCESS);
return error;
};
std::stringstream ss;
ss << "batch_norm_opencl_kernel_"
<< input->dim(0) << "_"
<< input->dim(1) << "_"
<< input->dim(2) << "_"
<< input->dim(3);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
}
} // namespace kernels
......
......@@ -31,17 +31,22 @@ static void BatchNorm(
net.AddRandomInput<D, T>("Var", {channels}, true);
net.AddInputFromArray<D, float>("Epsilon", {}, {1e-3});
// tuning
setenv("MACE_TUNING", "1", 1);
net.RunOp(D);
unsetenv("MACE_TUNING");
// Warm-up
for (int i = 0; i < 5; ++i) {
net.RunOp(D);
net.Sync();
}
net.Sync();
mace::testing::StartTiming();
while (iters--) {
net.RunOp(D);
net.Sync();
}
net.Sync();
}
#define BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, DEVICE) \
......@@ -56,7 +61,7 @@ static void BatchNorm(
#define BM_BATCH_NORM(N, C, H, W, TYPE) \
BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, CPU); \
BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, NEON); \
BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, NEON);\
BM_BATCH_NORM_MACRO(N, C, H, W, TYPE, OPENCL);
BM_BATCH_NORM(1, 1, 512, 512, float);
......
......@@ -165,7 +165,12 @@ TEST_F(BatchNormOpTest, SimpleRandomOPENCL) {
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
// Run NEON
// tuning
setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL);
unsetenv("MACE_TUNING");
// Run on opencl
net.RunOp(DeviceType::OPENCL);
// Check
......@@ -206,7 +211,12 @@ TEST_F(BatchNormOpTest, ComplexRandomOPENCL) {
net.AddRandomInput<DeviceType::OPENCL, float>("Var", {channels}, true);
net.AddInputFromArray<DeviceType::OPENCL, float>("Epsilon", {}, {1e-3});
// Run NEON
// tuning
setenv("MACE_TUNING", "1", 1);
net.RunOp(DeviceType::OPENCL);
unsetenv("MACE_TUNING");
// Run on opencl
net.RunOp(DeviceType::OPENCL);
net.Sync();
......
......@@ -137,13 +137,11 @@ class OpsTestNet {
Workspace *ws() { return &ws_; }
bool RunOp(DeviceType device) {
if (!net_ || device_ != device) {
NetDef net_def;
net_def.add_op()->CopyFrom(op_def_);
VLOG(3) << net_def.DebugString();
net_ = CreateNet(net_def, &ws_, device);
device_ = device;
}
NetDef net_def;
net_def.add_op()->CopyFrom(op_def_);
VLOG(3) << net_def.DebugString();
net_ = CreateNet(net_def, &ws_, device);
device_ = device;
return net_->Run();
}
......@@ -154,7 +152,7 @@ class OpsTestNet {
}
void Sync() {
if (net_) {
if (net_ && device_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
}
}
......
......@@ -7,6 +7,8 @@ package(
licenses(["notice"]) # Apache 2.0
load("//mace:mace.bzl", "if_android")
cc_library(
name = "command_line_flags",
srcs = [
......@@ -28,3 +30,30 @@ cc_library(
],
copts = ["-std=c++11"],
)
# Header-only auto-tuning utility (tuner.h); used by OpenCL kernels to
# search for fast work-group-size parameters at runtime.
cc_library(
name = "tuner",
hdrs = [
"tuner.h",
],
copts = ["-std=c++11"],
deps = [
"//mace/core",
],
)
# Unit tests for :tuner. Linked statically, with -lm/-ldl added when
# cross-building for Android (see if_android).
cc_test(
name = "tuner_test",
testonly = 1,
srcs = [
"tuner_test.cc",
],
copts = ["-std=c++11"],
linkopts = if_android(["-lm", "-ldl"]),
linkstatic = 1,
deps = [
":tuner",
"@gtest//:gtest",
"@gtest//:gtest_main",
],
)
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#ifndef MACE_UTILS_TUNER_H_
#define MACE_UTILS_TUNER_H_
#include <stdlib.h>
#include <vector>
#include <functional>
#include <string>
#include <unordered_map>
#include <fstream>
#include <limits>
#include "mace/core/logging.h"
#include "mace/utils/utils.h"
#include "mace/core/runtime/opencl/opencl_runtime.h"
namespace mace {
template<typename param_type>
class Tuner {
 public:
  // Process-wide singleton accessor. Parameters are loaded from the file
  // named by MACE_RUN_PARAMETER_PATH on first use and written back at exit.
  static Tuner* Get() {
    static Tuner tuner;
    return &tuner;
  }

  // If MACE_TUNING=1: benchmark every candidate produced by param_generator,
  // cache the fastest parameters under param_key, and return func's result.
  // Otherwise: run func with the cached parameters for param_key, falling
  // back to default_param when no entry exists.
  template <typename RetType>
  RetType TuneOrRun(const std::string &param_key,
                    const std::vector<param_type> &default_param,
                    const std::function<std::vector<std::vector<param_type>>()> &param_generator,
                    const std::function<RetType(const std::vector<param_type> &)> &func) {
    if (IsTuning()) {
      // tune
      std::vector<param_type> opt_param = default_param;
      RetType res = Tune<RetType>(param_generator, func, opt_param);
      param_table_[param_key] = opt_param;
      return res;
    } else {
      // run: single lookup instead of find() followed by operator[]
      auto iter = param_table_.find(param_key);
      if (iter != param_table_.end()) {
        return func(iter->second);
      }
      return func(default_param);
    }
  }

 private:
  Tuner() {
    // May be unset (nullptr), in which case nothing is persisted or restored.
    path_ = getenv("MACE_RUN_PARAMETER_PATH");
    ReadRunParameters();
  }

  ~Tuner() {
    // Persist whatever was tuned during this process's lifetime.
    WriteRunParameters();
  }

  Tuner(const Tuner&) = delete;
  Tuner& operator=(const Tuner&) = delete;

  // True iff the MACE_TUNING environment variable is exactly "1".
  inline bool IsTuning() {
    const char *tuning = getenv("MACE_TUNING");
    // Checking the second byte replaces the strlen() call (which relied on
    // <cstring> being pulled in transitively).
    return tuning != nullptr && tuning[0] == '1' && tuning[1] == '\0';
  }

  // File format (binary, native endianness):
  //   size_t entry count, then per entry:
  //   int32_t key length, key bytes, int32_t payload bytes, param values.
  inline void WriteRunParameters() {
    if (path_ != nullptr) {
      // Moved inside the null check: streaming a null char* into a log
      // stream is undefined behavior.
      VLOG(0) << path_;
      std::ofstream ofs(path_, std::ios::binary | std::ios::out);
      if (ofs.is_open()) {
        size_t num_params = param_table_.size();
        ofs.write(reinterpret_cast<char *>(&num_params), sizeof(num_params));
        for (auto &kp : param_table_) {
          int32_t key_size = kp.first.size();
          ofs.write(reinterpret_cast<char *>(&key_size), sizeof(key_size));
          ofs.write(kp.first.c_str(), key_size);
          VLOG(0) << kp.first.c_str();

          auto &params = kp.second;
          int32_t params_size = params.size() * sizeof(param_type);
          ofs.write(reinterpret_cast<char *>(&params_size), sizeof(params_size));
          for (auto &param : params) {
            // Bug fix: write sizeof(param_type) bytes per element. The old
            // code wrote sizeof(params_size) (i.e. sizeof(int32_t)) bytes,
            // which is only accidentally correct for 4-byte param types.
            ofs.write(reinterpret_cast<char *>(&param), sizeof(param_type));
            VLOG(0) << param;
          }
        }
        ofs.close();
      } else {
        LOG(WARNING) << "Write run parameter file failed.";
      }
    }
  }

  // Inverse of WriteRunParameters; silently does nothing when no path is set.
  inline void ReadRunParameters() {
    if (path_ != nullptr) {
      std::ifstream ifs(path_, std::ios::binary | std::ios::in);
      if (ifs.is_open()) {
        int32_t key_size = 0;
        int32_t params_size = 0;
        int32_t params_count = 0;
        size_t num_params = 0;
        ifs.read(reinterpret_cast<char *>(&num_params), sizeof(num_params));
        while (num_params--) {
          ifs.read(reinterpret_cast<char *>(&key_size), sizeof(key_size));
          std::string key(key_size, ' ');
          ifs.read(&key[0], key_size);

          ifs.read(reinterpret_cast<char *>(&params_size), sizeof(params_size));
          params_count = params_size / sizeof(param_type);
          std::vector<param_type> params(params_count);
          for (int i = 0; i < params_count; ++i) {
            ifs.read(reinterpret_cast<char *>(&params[i]), sizeof(param_type));
          }
          param_table_.emplace(key, params);
        }
        ifs.close();
      } else {
        LOG(WARNING) << "Read run parameter file failed.";
      }
    }
  }

  // Runs func num_runs times and reports the average wall time per run in
  // time_us. Waits on the OpenCL command queue so that asynchronously
  // enqueued device work is included in the measurement.
  template <typename RetType>
  inline RetType Run(const std::function<RetType(const std::vector<param_type> &)> &func,
                     const std::vector<param_type> &params,
                     int num_runs,
                     double &time_us) {
    RetType res{};  // value-init so an early return can never read garbage
    const int64_t start_time = NowInMicroSec();
    for (int i = 0; i < num_runs; ++i) {
      res = func(params);
    }
    OpenCLRuntime::Get()->command_queue().finish();
    const int64_t end_time = NowInMicroSec();
    time_us = (end_time - start_time) * 1.0 / num_runs;
    return res;
  }

  // Benchmarks every candidate parameter set and leaves the fastest one in
  // opt_params. Returns func's result for the winning candidate.
  template <typename RetType>
  inline RetType Tune(const std::function<std::vector<std::vector<param_type>>()> &param_generator,
                      const std::function<RetType(const std::vector<param_type> &)> &func,
                      std::vector<param_type> &opt_params) {
    // Robustness: a null generator used to throw std::bad_function_call;
    // with nothing to search, just run with the current (default) params.
    if (!param_generator) {
      return func(opt_params);
    }
    RetType res{};  // value-init: the candidate list could be empty
    double opt_time = std::numeric_limits<double>::max();
    auto params = param_generator();
    for (const auto &param : params) {
      double tmp_time = 0.0;
      // warm up
      Run<RetType>(func, param, 2, tmp_time);
      // run
      RetType tmp_res = Run<RetType>(func, param, 10, tmp_time);
      // Keep the parameters with the lowest average execution time.
      if (tmp_time < opt_time) {
        opt_time = tmp_time;
        opt_params = param;
        res = tmp_res;
      }
    }
    return res;
  }

 private:
  const char* path_;
  std::unordered_map<std::string, std::vector<param_type>> param_table_;
};
} // namespace mace
#endif // MACE_UTILS_TUNER_H_
//
// Copyright (c) 2017 XiaoMi All rights reserved.
//
#include <thread>
#include "gtest/gtest.h"
#include "mace/utils/tuner.h"
namespace mace {
class TunerTest: public ::testing::Test {
 protected:
  virtual void SetUp() {
    // Start each test from a clean slate: remove any previously persisted
    // parameter file, then point the tuner at a writable location.
    remove( "/data/local/tmp/mace.config" );
    // Bug fix: the variable was misspelled "MACE_RUN_PARAMTER_PATH", but the
    // Tuner constructor reads "MACE_RUN_PARAMETER_PATH", so the configured
    // path was never seen and parameters were never persisted.
    setenv("MACE_RUN_PARAMETER_PATH", "/data/local/tmp/mace.config", 1);
  }
};
TEST_F(TunerTest, SimpleRun) {
  const int expect = 1;
  // Returns `expect` only when the leading parameter equals 1; otherwise
  // returns a distinguishable value so we can tell which params were used.
  auto tuner_func = [&](const std::vector<int>& params) -> int {
    return params.front() == 1 ? expect : expect + 1;
  };

  // No tuning mode and no cached entry yet: default params are used.
  std::vector<int> default_params{1};
  int res = Tuner<int>::Get()->template TuneOrRun<int>(
      "SimpleRun", default_params, nullptr, tuner_func);
  EXPECT_EQ(expect, res);

  // Changing the defaults changes the outcome, proving they were consulted.
  default_params[0] = 2;
  res = Tuner<int>::Get()->template TuneOrRun<int>(
      "SimpleRun", default_params, nullptr, tuner_func);
  EXPECT_EQ(expect + 1, res);
}
TEST_F(TunerTest, SimpleTune) {
  const int expect = 3;
  // Every candidate except `expect` sleeps 10 ms, so the timing-based search
  // must select {3} as the fastest parameter set.
  auto tuner_func = [&](const std::vector<int>& params) -> int {
    if (params.front() != expect) {
      std::this_thread::sleep_for(std::chrono::milliseconds(10));
      return params.front();
    }
    return expect;
  };

  std::vector<int> default_params(1, 1);
  auto params_generator = []() -> std::vector<std::vector<int>> {
    return {{1}, {2}, {3}, {4}};
  };

  // tune: benchmark all candidates and cache the winner under "SimpleRun".
  int res = Tuner<int>::Get()->template TuneOrRun<int>(
      "SimpleRun", default_params, params_generator, tuner_func);
  EXPECT_EQ(expect, res);

  // run: the cached winner is used; default_params are ignored.
  res = Tuner<int>::Get()->template TuneOrRun<int>(
      "SimpleRun", default_params, nullptr, tuner_func);
  EXPECT_EQ(expect, res);
}
} // namespace mace
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册