Commit 0279acae authored by Liangliang He

Merge branch 'minor-kernel' into 'master'

Add block tuning to limit the kernel execution time to less than 1 ms.

See merge request !225
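
The change, in brief: each OpenCL kernel launch is tuned not only for its local work-group size but also for a block count; when MACE_LIMIT_OPENCL_KERNEL_TIME is enabled and one enqueue of the full global work size exceeds kMaxKernelExeTime (1000 microseconds), the last dimension of the global work size is split into several smaller enqueues. Below is a minimal standalone C++ sketch of that block-splitting logic (an illustration only, not the MACE code; EnqueueRange is a hypothetical stand-in for the real clEnqueueNDRangeKernel call):

// Sketch of the block-splitting idea behind this commit, assuming EnqueueRange()
// stands in for enqueuing the kernel over a slice [offset, offset + size) of the
// last global-work-size dimension.
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr double kMaxKernelExeTime = 1000.0;  // microseconds, same budget as the patch

void EnqueueRange(uint32_t offset, uint32_t size) {
  std::printf("enqueue offset=%u size=%u\n", offset, size);  // placeholder for clEnqueueNDRangeKernel
}

void RunInBlocks(uint32_t gws2, double measured_micros) {
  // One block per kMaxKernelExeTime of measured time (plus one), capped by gws2,
  // mirroring the num_blocks formula used in TuningOrRun3DKernel in this diff.
  uint32_t num_blocks = std::min(
      static_cast<uint32_t>(measured_micros / kMaxKernelExeTime) + 1, gws2);
  const uint32_t block_size = gws2 / num_blocks;
  if (gws2 % num_blocks > 0) ++num_blocks;  // one extra enqueue for the remainder
  for (uint32_t i = 0; i < num_blocks; ++i) {
    const uint32_t size =
        (i == num_blocks - 1) ? (gws2 - i * block_size) : block_size;
    EnqueueRange(i * block_size, size);
  }
}

int main() {
  RunInBlocks(/*gws2=*/96, /*measured_micros=*/3500.0);  // 4 enqueues of 24 items each
  return 0;
}

With a measured time of 3.5 ms and a 1 ms budget, the example splits a 96-item dimension into 4 enqueues of 24 items, so no single enqueue should take much longer than roughly 1 ms.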
@@ -50,6 +50,21 @@ double OpenCLProfilingTimer::ElapsedMicros() {
return (stop_nanos_ - start_nanos_) / 1000.0;
}
double OpenCLProfilingTimer::AccumulatedMicros() {
return accumulated_micros_;
}
void OpenCLProfilingTimer::AccumulateTiming(){
StopTiming();
accumulated_micros_ += (stop_nanos_ - start_nanos_) / 1000.0;
}
void OpenCLProfilingTimer::ClearTiming() {
start_nanos_ = 0;
stop_nanos_ = 0;
accumulated_micros_ = 0;
}
OpenCLRuntime *OpenCLRuntime::Global() {
static OpenCLRuntime instance;
return &instance;
...
@@ -18,16 +18,20 @@
namespace mace {
class OpenCLProfilingTimer : public Timer {
public:
explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event), accumulated_micros_(0) {};
void StartTiming() override;
void StopTiming() override;
void AccumulateTiming() override;
void ClearTiming() override;
double ElapsedMicros() override;
double AccumulatedMicros() override;
private:
const cl::Event *event_;
double start_nanos_;
double stop_nanos_;
double accumulated_micros_;
};
class OpenCLRuntime {
@@ -40,15 +44,15 @@ class OpenCLRuntime {
void GetCallStats(const cl::Event &event, CallStats *stats);
uint32_t GetDeviceMaxWorkGroupSize();
uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name,
const std::set<std::string> &build_options);
private:
OpenCLRuntime();
~OpenCLRuntime();
OpenCLRuntime(const OpenCLRuntime &) = delete;
OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
void BuildProgram(const std::string &program_file_name,
const std::string &binary_file_name,
...
@@ -63,52 +63,11 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size =
runtime->GetKernelMaxWorkGroupSize(activation_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1},
{4, 15, 8}, // SNPE size
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::string tuning_key =
Concat("relu_opencl_kernel_", activation_, output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
TuningOrRun3DKernel(activation_kernel, tuning_key, gws, lws, future);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
SetFuture(future, event);
}
template struct ActivationFunctor<DeviceType::OPENCL, float>;
...
@@ -49,56 +49,14 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)
};
std::vector<uint32_t> lws = {64, 16, 1};
std::vector<uint32_t> lws = {64, 16};
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
uint32_t local_ws[2];
local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
return {{local_ws[0], local_ws[1]},
{local_ws[1], local_ws[0]},
{kwg_size / 4, 4},
{kwg_size / 16, 16},
{kwg_size / 32, 32},
{kwg_size / 64, 64},
{kwg_size / 128, 128},
{kwg_size / 256, 256},
{kwg_size / 512, 512},
{kwg_size, 1},
{1, kwg_size}
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "addn_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun2DKernel(addn_kernel, ss.str(), gws, lws, future);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
}
template <typename T>
...
@@ -83,51 +83,11 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1},
{8, 128, 1}, // SNPE size
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::string tuning_key =
Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
TuningOrRun3DKernel(bm_kernel, tuning_key, gws, lws, future);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
SetFuture(future, event);
}
template struct BatchNormFunctor<DeviceType::OPENCL, float>;
...
@@ -50,65 +50,14 @@ static void Concat2(const Tensor *input0,
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
std::vector<uint32_t> lws = {8, 16, 8, 1};
std::vector<uint32_t> lws = {8, 16, 8};
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{local_ws[0], local_ws[1], local_ws[2]},
{local_ws[2], local_ws[1], local_ws[0]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1},
{4, 15, 8}, //SNPE size
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
concat_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "concat_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(concat_kernel, ss.str(), gws, lws, future);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
}
template<typename T>
...
@@ -96,51 +96,11 @@ void Conv1x1(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 15, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1},
{4, 15, 8}, // SNPE size
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
SetFuture(future, event);
}
extern void Conv2dOpenclK1x1S1(const Tensor *input,
...
@@ -94,52 +94,11 @@ static void Conv2d3x3S12(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {4, 15, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2]},
{local_ws[2], local_ws[1], local_ws[0]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1},
{4, 15, 8}, // SNPE size
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
SetFuture(future, event);
}
void Conv2dOpenclK3x3S1(const Tensor *input,
const Tensor *filter,
...
@@ -96,52 +96,11 @@ void Conv2dOpencl(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2]},
{local_ws[2], local_ws[1], local_ws[0]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1},
{4, 15, 8}, // SNPE size
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::string tuning_key =
Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
SetFuture(future, event);
}
} // namespace kernels
...
@@ -4,13 +4,14 @@
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
// [(c+3)/4*W, N * H]
void CalInOutputImageShape(const std::vector<index_t> &shape, /* NHWC */
std::vector<size_t> &image_shape) {
MACE_CHECK(shape.size() == 4);
image_shape.resize(2);
image_shape[0] = RoundUpDiv4(shape[3]) * shape[2];
@@ -39,41 +40,30 @@ void CalImage2DShape(const std::vector<index_t> &shape, /* NHWC */
const BufferType type,
std::vector<size_t> &image_shape) {
switch (type) {
case FILTER:
CalFilterImageShape(shape, image_shape);
break;
case IN_OUT:
CalInOutputImageShape(shape, image_shape);
break;
case ARGUMENT:
CalArgImageShape(shape, image_shape);
break;
default:
LOG(FATAL) << "Mace not supported yet.";
}
}
std::string DtToCLDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return "float";
case DT_HALF:
return "half";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
std::string DtToCLCMDDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
return "f";
case DT_HALF:
return "h";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
@@ -81,10 +71,8 @@ std::string DtToCLCMDDt(const DataType dt) {
std::string DtToUpstreamCLDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "float";
default:
LOG(FATAL) << "Unsupported data type";
return "";
}
}
@@ -92,13 +80,200 @@ std::string DtToUpstreamCLDt(const DataType dt) {
std::string DtToUpstreamCLCMDDt(const DataType dt) {
switch (dt) {
case DT_FLOAT:
case DT_HALF:
return "f";
default:
LOG(FATAL) << "Not supported data type for opencl cmd data type";
return "";
}
}
void TuningOrRun3DKernel(cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
std::vector<uint32_t> &lws,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(gws[2],
kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{4, 15, 8, 1}, // SNPE size
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params,
Timer *timer,
std::vector<uint32_t> *tuning_result) -> cl_int {
MACE_CHECK(params.size() == 4) << "Tuning parameters of 3D kernel must be 4D";
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params[3];
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->ClearTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
tuning_result->assign(params.begin(), params.end());
if (LimitKernelTime()) {
double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
(*tuning_result)[3] = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
}
return error;
};
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
if (future != nullptr) {
future->wait_fn = [event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
OpenCLRuntime::Global()->GetCallStats(event, stats);
}
};
}
}
void TuningOrRun2DKernel(cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
std::vector<uint32_t> &lws,
StatsFuture *future) {
auto runtime = OpenCLRuntime::Global();
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
uint32_t local_ws[2];
local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
return {{local_ws[0], local_ws[1], 1},
{local_ws[1], local_ws[0], 1},
{kwg_size / 4, 4, 1},
{kwg_size / 16, 16, 1},
{kwg_size / 32, 32, 1},
{kwg_size / 64, 64, 1},
{kwg_size / 128, 128, 1},
{kwg_size / 256, 256, 1},
{kwg_size / 512, 512, 1},
{kwg_size, 1, 1},
{1, kwg_size, 1}
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params,
Timer *timer,
std::vector<uint32_t> *tuning_result) -> cl_int {
MACE_CHECK(params.size() == 3) << "Tuning parameters of 2D kernel must be 3d";
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params[2];
const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
kernel,
cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->ClearTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
tuning_result->assign(params.begin(), params.end());
if (LimitKernelTime()) {
double elapse_time = timer->AccumulatedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
(*tuning_result)[2] = num_blocks;
const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
kernel,
cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
}
return error;
};
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(tuning_key,
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
}
} // namespace kernels
} // namespace mace
@@ -14,9 +14,11 @@
namespace mace {
namespace kernels {
const float kMaxKernelExeTime = 1000.0; // microseconds
enum BufferType {
FILTER = 0,
IN_OUT = 1,
ARGUMENT = 2
};
@@ -32,6 +34,19 @@ std::string DtToCLDt(const DataType dt);
std::string DtToUpstreamCLDt(const DataType dt);
void TuningOrRun3DKernel(cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
std::vector<uint32_t> &lws,
StatsFuture *future);
void TuningOrRun2DKernel(cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
std::vector<uint32_t> &lws,
StatsFuture *future);
inline void SetFuture(StatsFuture *future, const cl::Event &event) {
if (future != nullptr) {
future->wait_fn = [event](CallStats *stats) {
@@ -43,10 +58,15 @@ inline void SetFuture(StatsFuture *future, const cl::Event &event) {
}
}
inline bool LimitKernelTime() {
const char *flag = getenv("MACE_LIMIT_OPENCL_KERNEL_TIME");
return flag != nullptr && strlen(flag) == 1 && flag[0] == '1';
}
namespace {
template<typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
(*ss) << v;
}
template<typename T, typename... Args>
@@ -54,8 +74,8 @@ void AppendToStream(std::stringstream *ss,
const std::string &delimiter,
T first,
Args... args) {
(*ss) << first << delimiter;
AppendToStream(ss, delimiter, args...);
}
} // namespace
...
@@ -60,67 +60,17 @@ static void Pooling(const Tensor *input,
static_cast<uint32_t>(batch * out_height),
};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
std::vector<uint32_t> lws(4, 1);
lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1},
{4, 15, 8}, //SNPE size
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "pooling_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(pooling_kernel, ss.str(), gws, lws, future);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
}
template<typename T>
...
@@ -59,60 +59,14 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{local_ws[0], local_ws[1], local_ws[2]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{1, kwg_size, 1},
{4, 15, 8}, //SNPE size
};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
rb_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "resize_bilinear_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(rb_kernel, ss.str(), gws, lws, future);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
}
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
...
@@ -41,64 +41,14 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{4, 15, 8}, //SNPE size
{local_ws[0], local_ws[1], local_ws[2]},
{local_ws[2], local_ws[1], local_ws[0]},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
softmax_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "softmax_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
TuningOrRun3DKernel(softmax_kernel, ss.str(), gws, lws, future);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
}
template
...
@@ -61,58 +61,14 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
const uint32_t gws[3] = {chan_blk,
static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(s2b_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(chan_blk, kwg_size);
local_ws[1] = std::min<uint32_t>(32, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(32, kwg_size / (local_ws[0] * local_ws[1]));
return {{local_ws[0], local_ws[1], local_ws[2]},
{4, 32, 8},
{4, 64, 4},
{4, 128, 2},
{8, 16, 8},
{8, 32, 4},
{8, 64, 2},
{16, 8, 8},
{16, 16, 4},
{16, 32, 2},
{32, 8, 4},
{32, 16, 2},
{64, 4, 4}};
};
cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
s2b_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << kernel_name << "_"
<< batch_tensor->dim(0) << "_"
<< batch_tensor->dim(1) << "_"
<< batch_tensor->dim(2) << "_"
<< batch_tensor->dim(3);
TuningOrRun3DKernel(s2b_kernel, ss.str(), gws, lws, future);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
}
template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>;
...
@@ -10,29 +10,50 @@
namespace mace {
class Timer {
public:
virtual void StartTiming() = 0;
virtual void StopTiming() = 0;
virtual void AccumulateTiming() = 0;
virtual void ClearTiming() = 0;
virtual double ElapsedMicros() = 0;
virtual double AccumulatedMicros() = 0;
};
class WallClockTimer : public Timer {
public:
WallClockTimer() : accumulated_micros_(0) {}
void StartTiming() override {
start_micros_ = mace::utils::NowMicros();
}
void StopTiming() override {
stop_micros_ = mace::utils::NowMicros();
}
void AccumulateTiming() override {
StopTiming();
accumulated_micros_ += stop_micros_ - start_micros_;
}
void ClearTiming() override {
start_micros_ = 0;
stop_micros_ = 0;
accumulated_micros_ = 0;
}
double ElapsedMicros() override {
return stop_micros_ - start_micros_;
}
double AccumulatedMicros() override {
return accumulated_micros_;
}
private:
double start_micros_;
double stop_micros_;
double accumulated_micros_;
};
} // namespace mace
...
@@ -41,10 +41,10 @@ class Tuner {
template <typename RetType>
RetType TuneOrRun(
const std::string param_key,
std::vector<param_type> &default_param,
const std::function<std::vector<std::vector<param_type>>()>
&param_generator,
const std::function<RetType(const std::vector<param_type> &, Timer *, std::vector<param_type> *)> &func,
Timer *timer) {
std::string obfucated_param_key = MACE_OBFUSCATE_SYMBOL(param_key);
if (IsTuning() && param_generator != nullptr) {
@@ -60,12 +60,12 @@ class Tuner {
if (param_table_.find(obfucated_param_key) != param_table_.end()) {
VLOG(1) << param_key << ": "
<< internal::MakeString(param_table_[obfucated_param_key]);
return func(param_table_[obfucated_param_key], nullptr, nullptr);
} else {
#ifndef MACE_DISABLE_NO_TUNING_WARNING
LOG(WARNING) << "Fallback to default parameter: " << param_key;
#endif
return func(default_param, nullptr, nullptr);
}
}
}
@@ -119,18 +119,17 @@ class Tuner {
template <typename RetType>
inline RetType Run(
const std::function<RetType(const std::vector<param_type> &, Timer *, std::vector<param_type> *)> &func,
std::vector<param_type> &params,
Timer *timer,
int num_runs,
double *time_us,
std::vector<param_type> *tuning_result) {
RetType res;
int64_t total_time_us = 0;
for (int i = 0; i < num_runs; ++i) {
res = func(params, timer, tuning_result);
total_time_us += timer->AccumulatedMicros();
}
*time_us = total_time_us * 1.0 / num_runs;
@@ -141,24 +140,25 @@ class Tuner {
inline RetType Tune(
const std::function<std::vector<std::vector<param_type>>()>
&param_generator,
const std::function<RetType(const std::vector<param_type> &, Timer *, std::vector<param_type> *)> &func,
Timer *timer,
std::vector<param_type> *opt_params) {
RetType res;
double opt_time = std::numeric_limits<double>::max();
auto params = param_generator();
std::vector<param_type> tuning_result;
for (auto param : params) {
double tmp_time = 0.0;
// warm up
Run<RetType>(func, param, timer, 2, &tmp_time, &tuning_result);
// run
RetType tmp_res = Run<RetType>(func, param, timer, 10, &tmp_time, &tuning_result);
// Check the execution time
if (tmp_time < opt_time) {
opt_time = tmp_time;
*opt_params = tuning_result;
res = tmp_res;
}
}
...
@@ -68,7 +68,6 @@ build_target()
--copt="-D_GLIBCXX_USE_C99_MATH_TR1" \
--copt="-Werror=return-type" \
--copt="-DMACE_OBFUSCATE_LITERALS" \
$TUNING_MODE_BUILD_FLAGS \
$DSP_MODE_BUILD_FLAGS || exit -1
}
...
import numpy as np
import math
import tensorflow as tf
A_T = np.array([[1, 1, 1, 0], [0, 1, -1, -1]]).astype(np.float32)
A = np.transpose(A_T)
B_T = np.array([
[1, 0, -1, 0],
[0, 1, 1, 0],
[0, -1, 1, 0],
[0, 1, 0, -1]
]).astype(np.float32)
B = np.transpose(B_T)
G = np.array([
[1, 0, 0],
[0.5, 0.5, 0.5],
[0.5, -0.5, 0.5],
[0, 0, 1],
]).astype(np.float32)
G_T = np.transpose(G)
def output_shape(input_shape, filter_shape):
out_shape = np.zeros(4).astype(np.int32)
out_shape[0] = input_shape[0]
out_shape[1] = filter_shape[0]
out_shape[2] = input_shape[2] - 2
out_shape[3] = input_shape[3] - 2
return out_shape
def winog_conv(input, filter):
m = 2
r = 3
alpha = m + r - 1
input_shape = input.shape
filter_shape = filter.shape
out_shape = output_shape(input_shape, filter_shape)
K = filter_shape[0]
C = input_shape[1]
U = np.zeros((K * 16, C))
for k in range(K):
for c in range(C):
u = np.dot(np.dot(G, filter[k, c, :, :]), G_T)
for i in range(4):
for j in range(4) :
U[(i * 4 + j) * K + k, c] = u[i, j]
print 'filter out: ', U.shape
print U[0, 0]
U.astype(np.float32).tofile("filter_out")
rounded_h = int(math.ceil(out_shape[2] / 2.0))
rounded_w = int(math.ceil(out_shape[3] / 2.0))
P = input_shape[0] * rounded_h * rounded_w
V = np.zeros((C * 16, P))
for p in range(P):
for c in range(C):
n = p / (rounded_w * rounded_h)
t = p % (rounded_h * rounded_w)
h_idx = t / rounded_w
w_idx = t % rounded_w
h_start = h_idx * 2
w_start = w_idx * 2
h_end = min(h_start+4, input_shape[2])
w_end = min(w_start+4, input_shape[3])
d = np.zeros((4, 4))
d[0:h_end-h_start, 0:w_end-w_start] = input[n, c, h_start:h_end, w_start:w_end]
v = np.dot(np.dot(B_T, d), B)
for i in range(4):
for j in range(4):
V[(i*4+j)*C + c, p] = v[i, j]
tmp = V.reshape(16, C, P, 1)
print 'input out: ', tmp.shape
tmp.astype(np.float32).tofile("C")
M = np.zeros((16 * K, P))
for i in range(alpha * alpha):
u = U[i * K : (i+1) * K, :]
v = V[i * C : (i+1) * C, :]
M[i * K : (i+1) * K, :] = np.dot(u, v)
print 'M shape: ', M.shape
M.astype(np.float32).tofile("gemm")
res = np.zeros((out_shape[0], out_shape[2], out_shape[3], out_shape[1]))
for k in range(K):
for b in range(P):
m = np.zeros((4, 4))
for i in range(4):
for j in range(4):
m[i][j] = M[(i*4+j) * K + k, b]
y = np.dot(np.dot(A_T, m), A)
for i in range(2):
for j in range(2):
n = b / (rounded_h * rounded_w)
t = b % (rounded_h * rounded_w)
p = (t / rounded_w) * 2 + i
q = (t % rounded_w) * 2 + j
if p >= out_shape[2] or q >= out_shape[3]:
continue
res[n, p, q, k] = y[i, j]
print 'Res shape: ', res.shape
res.astype(np.float32).tofile("res")
return res
def tf_conv(input, filter):
conv_op = tf.nn.conv2d(input, filter, [1, 1, 1, 1], 'VALID')
with tf.Session() as sess:
res = sess.run(conv_op)
return res
def main():
input = np.random.random([7, 61, 71, 31]).astype(np.float32)
# input = np.fromfile(file="A", dtype=np.float32)
# input = input.reshape(1, 3, 3, 5)
print 'input shape: ', input.shape
input.tofile("A")
filter = np.random.random([3, 3, 31, 31]).astype(np.float32)
tf_out = tf_conv(input, filter)
input = input.transpose((0, 3, 1, 2))
filter = filter.transpose((3, 2, 0, 1))
print 'filter shape: ', filter.shape
filter.tofile("filter_in")
winog_out = winog_conv(input, filter)
res = np.allclose(tf_out, winog_out)
if res:
print "=========Pass========="
else:
print "=========Failed========="
print "TF: ", tf_out
print "Winograd: ", winog_out
if __name__ == '__main__':
main()