提交 a9dce8ec 编写于 作者: L liuqi

Add block tuning to limit kernel execution time to less than 1 ms.

上级 537b4600
...@@ -50,6 +50,21 @@ double OpenCLProfilingTimer::ElapsedMicros() { ...@@ -50,6 +50,21 @@ double OpenCLProfilingTimer::ElapsedMicros() {
return (stop_nanos_ - start_nanos_) / 1000.0; return (stop_nanos_ - start_nanos_) / 1000.0;
} }
double OpenCLProfilingTimer::AccumulatedMicros() {
return accumulated_micros_;
}
// Stops the current measurement and folds it into the running total.
//
// StopTiming() is invoked first; the span between start_nanos_ and
// stop_nanos_ is then divided by 1000.0 (the same conversion ElapsedMicros()
// applies) and added to accumulated_micros_.
// NOTE(review): presumably StopTiming() refreshes start_nanos_/stop_nanos_
// from the profiled event -- confirm against its definition.
void OpenCLProfilingTimer::AccumulateTiming() {
  StopTiming();
  const double elapsed_micros = (stop_nanos_ - start_nanos_) / 1000.0;
  accumulated_micros_ += elapsed_micros;
}
// Resets the timer state: both recorded timestamps and the accumulated total
// are set back to zero. The associated event pointer is left untouched.
void OpenCLProfilingTimer::ClearTiming() {
  accumulated_micros_ = 0;
  stop_nanos_ = 0;
  start_nanos_ = 0;
}
OpenCLRuntime *OpenCLRuntime::Global() { OpenCLRuntime *OpenCLRuntime::Global() {
static OpenCLRuntime instance; static OpenCLRuntime instance;
return &instance; return &instance;
......
...@@ -18,16 +18,20 @@ ...@@ -18,16 +18,20 @@
namespace mace { namespace mace {
class OpenCLProfilingTimer : public Timer { class OpenCLProfilingTimer : public Timer {
public: public:
explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event) {}; explicit OpenCLProfilingTimer(const cl::Event *event) : event_(event), accumulated_micros_(0) {};
void StartTiming() override; void StartTiming() override;
void StopTiming() override; void StopTiming() override;
double ElapsedMicros() override; void AccumulateTiming() override;
void ClearTiming() override;
double ElapsedMicros() override;
double AccumulatedMicros() override;
private: private:
const cl::Event *event_; const cl::Event *event_;
double start_nanos_; double start_nanos_;
double stop_nanos_; double stop_nanos_;
double accumulated_micros_;
}; };
class OpenCLRuntime { class OpenCLRuntime {
...@@ -40,15 +44,15 @@ class OpenCLRuntime { ...@@ -40,15 +44,15 @@ class OpenCLRuntime {
void GetCallStats(const cl::Event &event, CallStats *stats); void GetCallStats(const cl::Event &event, CallStats *stats);
uint32_t GetDeviceMaxWorkGroupSize(); uint32_t GetDeviceMaxWorkGroupSize();
uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel& kernel); uint32_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
cl::Kernel BuildKernel(const std::string &program_name, cl::Kernel BuildKernel(const std::string &program_name,
const std::string &kernel_name, const std::string &kernel_name,
const std::set<std::string> &build_options); const std::set<std::string> &build_options);
private: private:
OpenCLRuntime(); OpenCLRuntime();
~OpenCLRuntime(); ~OpenCLRuntime();
OpenCLRuntime(const OpenCLRuntime&) = delete; OpenCLRuntime(const OpenCLRuntime &) = delete;
OpenCLRuntime &operator=(const OpenCLRuntime&) = delete; OpenCLRuntime &operator=(const OpenCLRuntime &) = delete;
void BuildProgram(const std::string &program_file_name, void BuildProgram(const std::string &program_file_name,
const std::string &binary_file_name, const std::string &binary_file_name,
......
...@@ -63,7 +63,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -63,7 +63,7 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8}; std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = const uint32_t kwg_size =
runtime->GetKernelMaxWorkGroupSize(activation_kernel); runtime->GetKernelMaxWorkGroupSize(activation_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
...@@ -73,33 +73,66 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -73,33 +73,66 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
local_ws[2] = std::min<uint32_t>(height * batch, local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1])); kwg_size / (local_ws[0] * local_ws[1]));
return { return {
{local_ws[0], local_ws[1], local_ws[2]}, {local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4}, {kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8}, {kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4}, {kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8}, {kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4}, {kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16}, {kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8}, {kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4}, {kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32}, {1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64}, {1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128}, {1, kwg_size / 128, 128, 1},
{3, 15, 9}, {3, 15, 9, 1},
{7, 15, 9}, {7, 15, 9, 1},
{9, 7, 15}, {9, 7, 15, 1},
{15, 7, 9}, {15, 7, 9, 1},
{1, kwg_size, 1}, {1, kwg_size, 1, 1},
{4, 15, 8}, // SNPE size {4, 15, 8, 1}, // SNPE size
}; };
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int { auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = CL_SUCCESS;
activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), if (timer == nullptr) {
cl::NDRange(params[0], params[1], params[2]), nullptr, &event); uint32_t num_blocks = params.back();
const uint32_t block_size = gws[2] / num_blocks;
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
activation_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
activation_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error; return error;
}; };
std::string tuning_key = std::string tuning_key =
......
...@@ -50,33 +50,66 @@ static void AddN(const std::vector<const Tensor *> &input_tensors, ...@@ -50,33 +50,66 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
static_cast<uint32_t>(batch_height_pixels) static_cast<uint32_t>(batch_height_pixels)
}; };
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
std::vector<uint32_t> lws = {64, 16}; std::vector<uint32_t> lws = {64, 16, 1};
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
uint32_t local_ws[2]; uint32_t local_ws[2];
local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size); local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]); local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
return {{local_ws[0], local_ws[1]}, return {{local_ws[0], local_ws[1], 1},
{local_ws[1], local_ws[0]}, {local_ws[1], local_ws[0], 1},
{kwg_size / 4, 4}, {kwg_size / 4, 4, 1},
{kwg_size / 16, 16}, {kwg_size / 16, 16, 1},
{kwg_size / 32, 32}, {kwg_size / 32, 32, 1},
{kwg_size / 64, 64}, {kwg_size / 64, 64, 1},
{kwg_size / 128, 128}, {kwg_size / 128, 128, 1},
{kwg_size / 256, 256}, {kwg_size / 256, 256, 1},
{kwg_size / 512, 512}, {kwg_size / 512, 512, 1},
{kwg_size, 1}, {kwg_size, 1, 1},
{1, kwg_size} {1, kwg_size, 1}
}; };
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int { auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = CL_SUCCESS;
addn_kernel, cl::NullRange, if (timer == nullptr) {
cl::NDRange(gws[0], gws[1]), uint32_t num_blocks = params.back();
cl::NDRange(params[0], params[1]), const uint32_t block_size = gws[1] / num_blocks;
nullptr, &event); if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel,
cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
params.back() = num_blocks;
const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel,
cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error; return error;
}; };
std::stringstream ss; std::stringstream ss;
......
...@@ -83,7 +83,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -83,7 +83,7 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8}; std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0); std::vector<uint32_t> local_ws(3, 0);
...@@ -92,33 +92,66 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input, ...@@ -92,33 +92,66 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
local_ws[2] = std::min<uint32_t>(height * batch, local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1])); kwg_size / (local_ws[0] * local_ws[1]));
return { return {
{local_ws[0], local_ws[1], local_ws[2]}, {local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4}, {kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8}, {kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4}, {kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8}, {kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4}, {kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16}, {kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8}, {kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4}, {kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32}, {1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64}, {1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128}, {1, kwg_size / 128, 128, 1},
{3, 15, 9}, {3, 15, 9, 1},
{7, 15, 9}, {7, 15, 9, 1},
{9, 7, 15}, {9, 7, 15, 1},
{15, 7, 9}, {15, 7, 9, 1},
{1, kwg_size, 1}, {1, kwg_size, 1, 1},
{8, 128, 1}, // SNPE size {8, 128, 1, 1}, // SNPE size
}; };
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int { auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = CL_SUCCESS;
bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), if (timer == nullptr) {
cl::NDRange(params[0], params[1], params[2]), nullptr, &event); uint32_t num_blocks = params.back();
const uint32_t block_size = gws[2] / num_blocks;
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error; return error;
}; };
std::string tuning_key = std::string tuning_key =
......
...@@ -51,42 +51,73 @@ static void Concat2(const Tensor *input0, ...@@ -51,42 +51,73 @@ static void Concat2(const Tensor *input0,
static_cast<uint32_t>(batch * height), static_cast<uint32_t>(batch * height),
}; };
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
std::vector<uint32_t> lws = {8, 16, 8}; std::vector<uint32_t> lws = {8, 16, 8, 1};
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0); std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size); local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]); local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1])); local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{local_ws[0], local_ws[1], local_ws[2]}, return {{local_ws[0], local_ws[1], local_ws[2], 1},
{local_ws[2], local_ws[1], local_ws[0]}, {local_ws[2], local_ws[1], local_ws[0], 1},
{kwg_size / 16, 4, 4}, {kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8}, {kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4}, {kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8}, {kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4}, {kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16}, {kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8}, {kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4}, {kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32}, {1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64}, {1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128}, {1, kwg_size / 128, 128, 1},
{3, 15, 9}, {3, 15, 9, 1},
{7, 15, 9}, {7, 15, 9, 1},
{9, 7, 15}, {9, 7, 15, 1},
{15, 7, 9}, {15, 7, 9, 1},
{1, kwg_size, 1}, {1, kwg_size, 1, 1},
{4, 15, 8}, //SNPE size {4, 15, 8, 1}, //SNPE size
}; };
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int { auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = CL_SUCCESS;
concat_kernel, cl::NullRange, if (timer == nullptr) {
cl::NDRange(gws[0], gws[1], gws[2]), uint32_t num_blocks = params.back();
cl::NDRange(params[0], params[1], params[2]), const uint32_t block_size = gws[2] / num_blocks;
nullptr, &event); if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
concat_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
concat_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
concat_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error; return error;
}; };
std::stringstream ss; std::stringstream ss;
......
...@@ -96,7 +96,7 @@ void Conv1x1(const Tensor *input, ...@@ -96,7 +96,7 @@ void Conv1x1(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 15, 8}; std::vector<uint32_t> lws = {8, 15, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0); std::vector<uint32_t> local_ws(3, 0);
...@@ -105,33 +105,66 @@ void Conv1x1(const Tensor *input, ...@@ -105,33 +105,66 @@ void Conv1x1(const Tensor *input,
local_ws[2] = std::min<uint32_t>(height * batch, local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1])); kwg_size / (local_ws[0] * local_ws[1]));
return { return {
{local_ws[0], local_ws[1], local_ws[2]}, {local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4}, {kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8}, {kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4}, {kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8}, {kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4}, {kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16}, {kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8}, {kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4}, {kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32}, {1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64}, {1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128}, {1, kwg_size / 128, 128, 1},
{3, 15, 9}, {3, 15, 9, 1},
{7, 15, 9}, {7, 15, 9, 1},
{9, 7, 15}, {9, 7, 15, 1},
{15, 7, 9}, {15, 7, 9, 1},
{1, kwg_size, 1}, {1, kwg_size, 1, 1},
{4, 15, 8}, // SNPE size {4, 15, 8, 1}, // SNPE size
}; };
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int { auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = CL_SUCCESS;
conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), if (timer == nullptr) {
cl::NDRange(params[0], params[1], params[2]), nullptr, &event); uint32_t num_blocks = params.back();
const uint32_t block_size = gws[2] / num_blocks;
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error; return error;
}; };
std::string tuning_key = std::string tuning_key =
......
...@@ -94,7 +94,7 @@ static void Conv2d3x3S12(const Tensor *input, ...@@ -94,7 +94,7 @@ static void Conv2d3x3S12(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {4, 15, 8}; std::vector<uint32_t> lws = {4, 15, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0); std::vector<uint32_t> local_ws(3, 0);
...@@ -103,34 +103,66 @@ static void Conv2d3x3S12(const Tensor *input, ...@@ -103,34 +103,66 @@ static void Conv2d3x3S12(const Tensor *input,
local_ws[2] = std::min<uint32_t>(height * batch, local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1])); kwg_size / (local_ws[0] * local_ws[1]));
return { return {
{local_ws[0], local_ws[1], local_ws[2]}, {local_ws[0], local_ws[1], local_ws[2], 1},
{local_ws[2], local_ws[1], local_ws[0]}, {kwg_size / 16, 4, 4, 1},
{kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 4, 8}, {kwg_size / 32, 8, 4, 1},
{kwg_size / 32, 8, 4}, {kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 8, 8}, {kwg_size / 64, 16, 4, 1},
{kwg_size / 64, 16, 4}, {kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 8, 16}, {kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 16, 8}, {kwg_size / 128, 32, 4, 1},
{kwg_size / 128, 32, 4}, {1, kwg_size / 32, 32, 1},
{1, kwg_size / 32, 32}, {1, kwg_size / 64, 64, 1},
{1, kwg_size / 64, 64}, {1, kwg_size / 128, 128, 1},
{1, kwg_size / 128, 128}, {3, 15, 9, 1},
{3, 15, 9}, {7, 15, 9, 1},
{7, 15, 9}, {9, 7, 15, 1},
{9, 7, 15}, {15, 7, 9, 1},
{15, 7, 9}, {1, kwg_size, 1, 1},
{1, kwg_size, 1}, {4, 15, 8, 1}, // SNPE size
{4, 15, 8}, // SNPE size
}; };
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int { auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = CL_SUCCESS;
conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), if (timer == nullptr) {
cl::NDRange(params[0], params[1], params[2]), nullptr, &event); uint32_t num_blocks = params.back();
const uint32_t block_size = gws[2] / num_blocks;
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error; return error;
}; };
std::string tuning_key = std::string tuning_key =
......
...@@ -96,7 +96,7 @@ void Conv2dOpencl(const Tensor *input, ...@@ -96,7 +96,7 @@ void Conv2dOpencl(const Tensor *input,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8}; std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0); std::vector<uint32_t> local_ws(3, 0);
...@@ -105,34 +105,66 @@ void Conv2dOpencl(const Tensor *input, ...@@ -105,34 +105,66 @@ void Conv2dOpencl(const Tensor *input,
local_ws[2] = std::min<uint32_t>(height * batch, local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1])); kwg_size / (local_ws[0] * local_ws[1]));
return { return {
{local_ws[0], local_ws[1], local_ws[2]}, {local_ws[0], local_ws[1], local_ws[2], 1},
{local_ws[2], local_ws[1], local_ws[0]}, {kwg_size / 16, 4, 4, 1},
{kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 4, 8}, {kwg_size / 32, 8, 4, 1},
{kwg_size / 32, 8, 4}, {kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 8, 8}, {kwg_size / 64, 16, 4, 1},
{kwg_size / 64, 16, 4}, {kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 8, 16}, {kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 16, 8}, {kwg_size / 128, 32, 4, 1},
{kwg_size / 128, 32, 4}, {1, kwg_size / 32, 32, 1},
{1, kwg_size / 32, 32}, {1, kwg_size / 64, 64, 1},
{1, kwg_size / 64, 64}, {1, kwg_size / 128, 128, 1},
{1, kwg_size / 128, 128}, {3, 15, 9, 1},
{3, 15, 9}, {7, 15, 9, 1},
{7, 15, 9}, {9, 7, 15, 1},
{9, 7, 15}, {15, 7, 9, 1},
{15, 7, 9}, {1, kwg_size, 1, 1},
{1, kwg_size, 1}, {4, 15, 8, 1}, // SNPE size
{4, 15, 8}, // SNPE size
}; };
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int { auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = CL_SUCCESS;
conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), if (timer == nullptr) {
cl::NDRange(params[0], params[1], params[2]), nullptr, &event); uint32_t num_blocks = params.back();
const uint32_t block_size = gws[2] / num_blocks;
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error; return error;
}; };
std::string tuning_key = std::string tuning_key =
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
namespace mace { namespace mace {
namespace kernels { namespace kernels {
const float kMaxKernelExeTime = 1000.0; // microseconds
enum BufferType { enum BufferType {
FILTER = 0, FILTER = 0,
IN_OUT= 1, IN_OUT= 1,
......
...@@ -60,7 +60,7 @@ static void Pooling(const Tensor *input, ...@@ -60,7 +60,7 @@ static void Pooling(const Tensor *input,
static_cast<uint32_t>(batch * out_height), static_cast<uint32_t>(batch * out_height),
}; };
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(pooling_kernel);
std::vector<uint32_t> lws(3, 0); std::vector<uint32_t> lws(4, 1);
lws[0] = std::min<uint32_t>(channel_blocks, kwg_size); lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]); lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1])); lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
...@@ -69,35 +69,67 @@ static void Pooling(const Tensor *input, ...@@ -69,35 +69,67 @@ static void Pooling(const Tensor *input,
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size); local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]); local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1])); local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{local_ws[0], local_ws[1], local_ws[2]}, return {
{kwg_size / 16, 4, 4}, {local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 32, 4, 8}, {kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 8, 4}, {kwg_size / 32, 4, 8, 1},
{kwg_size / 64, 8, 8}, {kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 16, 4}, {kwg_size / 64, 8, 8, 1},
{kwg_size / 128, 8, 16}, {kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 16, 8}, {kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 32, 4}, {kwg_size / 128, 16, 8, 1},
{1, kwg_size / 32, 32}, {kwg_size / 128, 32, 4, 1},
{1, kwg_size / 64, 64}, {1, kwg_size / 32, 32, 1},
{1, kwg_size / 128, 128}, {1, kwg_size / 64, 64, 1},
{3, 15, 9}, {1, kwg_size / 128, 128, 1},
{7, 15, 9}, {3, 15, 9, 1},
{9, 7, 15}, {7, 15, 9, 1},
{15, 7, 9}, {9, 7, 15, 1},
{1, kwg_size, 1}, {15, 7, 9, 1},
{4, 15, 8}, //SNPE size {1, kwg_size, 1, 1},
{4, 15, 8, 1}, // SNPE size
}; };
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int { auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = CL_SUCCESS;
pooling_kernel, cl::NullRange, if (timer == nullptr) {
cl::NDRange(gws[0], gws[1], gws[2]), uint32_t num_blocks = params.back();
cl::NDRange(params[0], params[1], params[2]), const uint32_t block_size = gws[2] / num_blocks;
nullptr, &event); if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
pooling_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error; return error;
}; };
std::stringstream ss; std::stringstream ss;
......
...@@ -59,38 +59,74 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()( ...@@ -59,38 +59,74 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(out_width), static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)}; static_cast<uint32_t>(out_height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8}; std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0); std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size); local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]); local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1])); local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{local_ws[0], local_ws[1], local_ws[2]}, return {
{kwg_size / 16, 4, 4}, {local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 32, 4, 8}, {kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 8, 4}, {kwg_size / 32, 4, 8, 1},
{kwg_size / 64, 8, 8}, {kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 16, 4}, {kwg_size / 64, 8, 8, 1},
{kwg_size / 128, 8, 16}, {kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 16, 8}, {kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 32, 4}, {kwg_size / 128, 16, 8, 1},
{1, kwg_size / 32, 32}, {kwg_size / 128, 32, 4, 1},
{1, kwg_size / 64, 64}, {1, kwg_size / 32, 32, 1},
{1, kwg_size / 128, 128}, {1, kwg_size / 64, 64, 1},
{1, kwg_size, 1}, {1, kwg_size / 128, 128, 1},
{4, 15, 8}, //SNPE size {3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{4, 15, 8, 1}, // SNPE size
}; };
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int { auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = CL_SUCCESS;
rb_kernel, cl::NullRange, if (timer == nullptr) {
cl::NDRange(gws[0], gws[1], gws[2]), uint32_t num_blocks = params.back();
cl::NDRange(params[0], params[1], params[2]), const uint32_t block_size = gws[2] / num_blocks;
nullptr, &event); if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
rb_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
rb_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
rb_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error; return error;
}; };
std::stringstream ss; std::stringstream ss;
......
...@@ -41,42 +41,74 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits, ...@@ -41,42 +41,74 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks), const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width), static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)}; static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8}; std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0); std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size); local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]); local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1])); local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{4, 15, 8}, //SNPE size return {
{local_ws[0], local_ws[1], local_ws[2]}, {local_ws[0], local_ws[1], local_ws[2], 1},
{local_ws[2], local_ws[1], local_ws[0]}, {kwg_size / 16, 4, 4, 1},
{kwg_size / 16, 4, 4}, {kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 4, 8}, {kwg_size / 32, 8, 4, 1},
{kwg_size / 32, 8, 4}, {kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 8, 8}, {kwg_size / 64, 16, 4, 1},
{kwg_size / 64, 16, 4}, {kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 8, 16}, {kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 16, 8}, {kwg_size / 128, 32, 4, 1},
{kwg_size / 128, 32, 4}, {1, kwg_size / 32, 32, 1},
{1, kwg_size / 32, 32}, {1, kwg_size / 64, 64, 1},
{1, kwg_size / 64, 64}, {1, kwg_size / 128, 128, 1},
{1, kwg_size / 128, 128}, {3, 15, 9, 1},
{3, 15, 9}, {7, 15, 9, 1},
{7, 15, 9}, {9, 7, 15, 1},
{9, 7, 15}, {15, 7, 9, 1},
{15, 7, 9}, {1, kwg_size, 1, 1},
{1, kwg_size, 1}}; {4, 15, 8, 1}, // SNPE size
};
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int { auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = CL_SUCCESS;
softmax_kernel, cl::NullRange, if (timer == nullptr) {
cl::NDRange(gws[0], gws[1], gws[2]), uint32_t num_blocks = params.back();
cl::NDRange(params[0], params[1], params[2]), const uint32_t block_size = gws[2] / num_blocks;
nullptr, &event); if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
softmax_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
softmax_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
softmax_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error; return error;
}; };
std::stringstream ss; std::stringstream ss;
......
...@@ -61,36 +61,74 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor ...@@ -61,36 +61,74 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
const uint32_t gws[3] = {chan_blk, const uint32_t gws[3] = {chan_blk,
static_cast<uint32_t>(batch_tensor->dim(2)), static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))}; static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
const std::vector<uint32_t> lws = {8, 16, 8}; std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(s2b_kernel); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(s2b_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> { auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0); std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(chan_blk, kwg_size); local_ws[0] = std::min<uint32_t>(chan_blk, kwg_size);
local_ws[1] = std::min<uint32_t>(32, kwg_size / local_ws[0]); local_ws[1] = std::min<uint32_t>(32, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(32, kwg_size / (local_ws[0] * local_ws[1])); local_ws[2] = std::min<uint32_t>(32, kwg_size / (local_ws[0] * local_ws[1]));
return {{local_ws[0], local_ws[1], local_ws[2]}, return {
{4, 32, 8}, {local_ws[0], local_ws[1], local_ws[2], 1},
{4, 64, 4}, {kwg_size / 16, 4, 4, 1},
{4, 128, 2}, {kwg_size / 32, 4, 8, 1},
{8, 16, 8}, {kwg_size / 32, 8, 4, 1},
{8, 32, 4}, {kwg_size / 64, 8, 8, 1},
{8, 64, 2}, {kwg_size / 64, 16, 4, 1},
{16, 8, 8}, {kwg_size / 128, 8, 16, 1},
{16, 16, 4}, {kwg_size / 128, 16, 8, 1},
{16, 32, 2}, {kwg_size / 128, 32, 4, 1},
{32, 8, 4}, {1, kwg_size / 32, 32, 1},
{32, 16, 2}, {1, kwg_size / 64, 64, 1},
{64, 4, 4}}; {1, kwg_size / 128, 128, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{4, 15, 8, 1}, // SNPE size
};
}; };
cl::Event event; cl::Event event;
auto func = [&](const std::vector<uint32_t> &params) -> cl_int { auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel( cl_int error = CL_SUCCESS;
s2b_kernel, cl::NullRange, if (timer == nullptr) {
cl::NDRange(gws[0], gws[1], gws[2]), uint32_t num_blocks = params.back();
cl::NDRange(params[0], params[1], params[2]), const uint32_t block_size = gws[2] / num_blocks;
nullptr, &event); if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
s2b_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
s2b_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
s2b_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error; return error;
}; };
std::stringstream ss; std::stringstream ss;
......
...@@ -10,29 +10,50 @@ ...@@ -10,29 +10,50 @@
namespace mace { namespace mace {
// Abstract stopwatch interface used to time kernel runs.
//
// A single measurement is bracketed by StartTiming()/StopTiming() and read
// with ElapsedMicros(). AccumulateTiming() adds a measurement to a running
// total that is read with AccumulatedMicros(); ClearTiming() resets the state.
class Timer {
 public:
  // Virtual so that a concrete timer can be destroyed through a Timer pointer.
  virtual ~Timer() = default;

  // Marks the beginning of a measurement.
  virtual void StartTiming() = 0;
  // Marks the end of a measurement.
  virtual void StopTiming() = 0;
  // Ends the current measurement and adds it to the accumulated total.
  virtual void AccumulateTiming() = 0;
  // Resets the current measurement and the accumulated total.
  virtual void ClearTiming() = 0;
  // Returns the duration of the last measurement in microseconds.
  virtual double ElapsedMicros() = 0;
  // Returns the accumulated total in microseconds.
  virtual double AccumulatedMicros() = 0;
};
class WallClockTimer : public Timer { class WallClockTimer : public Timer {
public: public:
void StartTiming() override { WallClockTimer() : accumulated_micros_(0) {}
start_micros_ = mace::utils::NowMicros();
} void StartTiming() override {
start_micros_ = mace::utils::NowMicros();
void StopTiming() override { }
stop_micros_ = mace::utils::NowMicros();
} void StopTiming() override {
stop_micros_ = mace::utils::NowMicros();
double ElapsedMicros() override { }
return stop_micros_ - start_micros_;
} void AccumulateTiming() override {
StopTiming();
private: accumulated_micros_ += stop_micros_ - start_micros_;
double start_micros_; }
double stop_micros_;
void ClearTiming() override {
start_micros_ = 0;
stop_micros_ = 0;
accumulated_micros_ = 0;
}
double ElapsedMicros() override {
return stop_micros_ - start_micros_;
}
double AccumulatedMicros() override {
return accumulated_micros_;
}
private:
double start_micros_;
double stop_micros_;
double accumulated_micros_;
}; };
} // namespace mace } // namespace mace
......
...@@ -41,10 +41,10 @@ class Tuner { ...@@ -41,10 +41,10 @@ class Tuner {
template <typename RetType> template <typename RetType>
RetType TuneOrRun( RetType TuneOrRun(
const std::string param_key, const std::string param_key,
const std::vector<param_type> &default_param, std::vector<param_type> &default_param,
const std::function<std::vector<std::vector<param_type>>()> const std::function<std::vector<std::vector<param_type>>()>
&param_generator, &param_generator,
const std::function<RetType(const std::vector<param_type> &)> &func, const std::function<RetType(std::vector<param_type> &, Timer *)> &func,
Timer *timer) { Timer *timer) {
std::string obfucated_param_key = MACE_OBFUSCATE_SYMBOL(param_key); std::string obfucated_param_key = MACE_OBFUSCATE_SYMBOL(param_key);
if (IsTuning() && param_generator != nullptr) { if (IsTuning() && param_generator != nullptr) {
...@@ -60,12 +60,12 @@ class Tuner { ...@@ -60,12 +60,12 @@ class Tuner {
if (param_table_.find(obfucated_param_key) != param_table_.end()) { if (param_table_.find(obfucated_param_key) != param_table_.end()) {
VLOG(1) << param_key << ": " VLOG(1) << param_key << ": "
<< internal::MakeString(param_table_[obfucated_param_key]); << internal::MakeString(param_table_[obfucated_param_key]);
return func(param_table_[obfucated_param_key]); return func(param_table_[obfucated_param_key], nullptr);
} else { } else {
#ifndef MACE_DISABLE_NO_TUNING_WARNING #ifndef MACE_DISABLE_NO_TUNING_WARNING
LOG(WARNING) << "Fallback to default parameter: " << param_key; LOG(WARNING) << "Fallback to default parameter: " << param_key;
#endif #endif
return func(default_param); return func(default_param, nullptr);
} }
} }
} }
...@@ -119,18 +119,16 @@ class Tuner { ...@@ -119,18 +119,16 @@ class Tuner {
template <typename RetType> template <typename RetType>
inline RetType Run( inline RetType Run(
const std::function<RetType(const std::vector<param_type> &)> &func, const std::function<RetType(std::vector<param_type> &, Timer *)> &func,
const std::vector<param_type> &params, std::vector<param_type> &params,
Timer *timer, Timer *timer,
int num_runs, int num_runs,
double *time_us) { double *time_us) {
RetType res; RetType res;
int64_t total_time_us = 0; int64_t total_time_us = 0;
for (int i = 0; i < num_runs; ++i) { for (int i = 0; i < num_runs; ++i) {
timer->StartTiming(); res = func(params, timer);
res = func(params); total_time_us += timer->AccumulatedMicros();
timer->StopTiming();
total_time_us += timer->ElapsedMicros();
} }
*time_us = total_time_us * 1.0 / num_runs; *time_us = total_time_us * 1.0 / num_runs;
...@@ -141,13 +139,13 @@ class Tuner { ...@@ -141,13 +139,13 @@ class Tuner {
inline RetType Tune( inline RetType Tune(
const std::function<std::vector<std::vector<param_type>>()> const std::function<std::vector<std::vector<param_type>>()>
&param_generator, &param_generator,
const std::function<RetType(const std::vector<param_type> &)> &func, const std::function<RetType(std::vector<param_type> &, Timer *)> &func,
Timer *timer, Timer *timer,
std::vector<param_type> *opt_params) { std::vector<param_type> *opt_params) {
RetType res; RetType res;
double opt_time = std::numeric_limits<double>::max(); double opt_time = std::numeric_limits<double>::max();
auto params = param_generator(); auto params = param_generator();
for (const auto &param : params) { for (auto param : params) {
double tmp_time = 0.0; double tmp_time = 0.0;
// warm up // warm up
Run<RetType>(func, param, timer, 2, &tmp_time); Run<RetType>(func, param, timer, 2, &tmp_time);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册