提交 faa8459b 编写于 作者: L liuqi

Refactor tuning code.

上级 a9dce8ec
......@@ -64,84 +64,10 @@ void ActivationFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size =
runtime->GetKernelMaxWorkGroupSize(activation_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{4, 15, 8, 1}, // SNPE size
};
};
cl::Event event;
auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params.back();
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
activation_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
activation_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
activation_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error;
};
std::string tuning_key =
Concat("relu_opencl_kernel_", activation_, output->dim(0), output->dim(1),
output->dim(2), output->dim(3));
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
SetFuture(future, event);
TuningOrRun3DKernel(activation_kernel, tuning_key, gws, lws, future);
}
template struct ActivationFunctor<DeviceType::OPENCL, float>;
......
......@@ -49,89 +49,14 @@ static void AddN(const std::vector<const Tensor *> &input_tensors,
static_cast<uint32_t>(width_pixels),
static_cast<uint32_t>(batch_height_pixels)
};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(addn_kernel);
std::vector<uint32_t> lws = {64, 16, 1};
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
uint32_t local_ws[2];
local_ws[0] = std::min<uint32_t>(width_pixels, kwg_size);
local_ws[1] = std::min<uint32_t>(batch_height_pixels, kwg_size / local_ws[0]);
return {{local_ws[0], local_ws[1], 1},
{local_ws[1], local_ws[0], 1},
{kwg_size / 4, 4, 1},
{kwg_size / 16, 16, 1},
{kwg_size / 32, 32, 1},
{kwg_size / 64, 64, 1},
{kwg_size / 128, 128, 1},
{kwg_size / 256, 256, 1},
{kwg_size / 512, 512, 1},
{kwg_size, 1, 1},
{1, kwg_size, 1}
};
};
cl::Event event;
auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params.back();
const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel,
cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]),
nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1]),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
params.back() = num_blocks;
const uint32_t block_size = gws[1] / num_blocks;
if (gws[1] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
addn_kernel,
cl::NDRange(0, i * block_size),
cl::NDRange(gws[0], gws1),
cl::NDRange(params[0], params[1]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error;
};
std::stringstream ss;
ss << "addn_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
TuningOrRun2DKernel(addn_kernel, ss.str(), gws, lws, future);
}
template <typename T>
......
......@@ -84,83 +84,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{8, 128, 1, 1}, // SNPE size
};
};
cl::Event event;
auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params.back();
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error;
};
std::string tuning_key =
Concat("batch_norm_opencl_kernel_", activation_, output->dim(0),
output->dim(1), output->dim(2), output->dim(3), folded_constant_);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
SetFuture(future, event);
TuningOrRun3DKernel(bm_kernel, tuning_key, gws, lws, future);
}
template struct BatchNormFunctor<DeviceType::OPENCL, float>;
......
......@@ -50,96 +50,14 @@ static void Concat2(const Tensor *input0,
static_cast<uint32_t>(width),
static_cast<uint32_t>(batch * height),
};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
std::vector<uint32_t> lws = {8, 16, 8, 1};
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blk, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {{local_ws[0], local_ws[1], local_ws[2], 1},
{local_ws[2], local_ws[1], local_ws[0], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{4, 15, 8, 1}, //SNPE size
};
};
cl::Event event;
auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params.back();
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
concat_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
concat_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
concat_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error;
};
std::stringstream ss;
ss << "concat_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
TuningOrRun3DKernel(concat_kernel, ss.str(), gws, lws, future);
}
template<typename T>
......
......@@ -97,83 +97,10 @@ void Conv1x1(const Tensor *input,
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 15, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{4, 15, 8, 1}, // SNPE size
};
};
cl::Event event;
auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params.back();
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error;
};
std::string tuning_key =
Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
SetFuture(future, event);
TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
}
extern void Conv2dOpenclK1x1S1(const Tensor *input,
......
......@@ -95,83 +95,10 @@ static void Conv2d3x3S12(const Tensor *input,
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {4, 15, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{4, 15, 8, 1}, // SNPE size
};
};
cl::Event event;
auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params.back();
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error;
};
std::string tuning_key =
Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
SetFuture(future, event);
TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
}
void Conv2dOpenclK3x3S1(const Tensor *input,
const Tensor *filter,
......
......@@ -97,83 +97,10 @@ void Conv2dOpencl(const Tensor *input,
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width_blocks, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch,
kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{4, 15, 8, 1}, // SNPE size
};
};
cl::Event event;
auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
cl_int error = CL_SUCCESS;
if (timer == nullptr) {
uint32_t num_blocks = params.back();
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
}
} else {
timer->StartTiming();
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->StopTiming();
double elapse_time = timer->ElapsedMicros();
timer->ClearTiming();
uint32_t num_blocks = std::min(static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
params.back() = num_blocks;
const uint32_t block_size = gws[2] / num_blocks;
if (gws[2] % num_blocks > 0) num_blocks++;
for (uint32_t i = 0; i < num_blocks; ++i) {
uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size;
error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel,
cl::NDRange(0, 0, i * block_size),
cl::NDRange(gws[0], gws[1], gws2),
cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
timer->AccumulateTiming();
}
}
return error;
};
std::string tuning_key =
Concat("conv2d_general_opencl_kernel_", activation, output->dim(0),
output->dim(1), output->dim(2), output->dim(3));
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
tuning_key, lws, params_generator, func, &timer);
SetFuture(future, event);
TuningOrRun3DKernel(conv_2d_kernel, tuning_key, gws, lws, future);
}
} // namespace kernels
......
......@@ -4,6 +4,7 @@
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
......@@ -100,5 +101,181 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) {
}
}
// Runs (or tunes) a kernel over a 3-D global range via the global Tuner.
//
// kernel:     kernel to enqueue; its arguments must already be set.
// tuning_key: key passed to Tuner<uint32_t>::TuneOrRun.
// gws:        global work sizes; gws[0], gws[1] and gws[2] are read.
// lws:        default parameters handed to the tuner. `func` below reads
//             params[0..2] as the local work size and params.back() as the
//             number of blocks the third dimension is split into.
// future:     if non-null, its wait_fn is set to wait on the last enqueued
//             event and, when stats are requested, to collect call stats.
void TuningOrRun3DKernel(cl::Kernel &kernel,
                         const std::string tuning_key,
                         const uint32_t *gws,
                         std::vector<uint32_t> &lws,
                         StatsFuture *future) {
  auto runtime = OpenCLRuntime::Global();
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);

  // Builds the list of {lws0, lws1, lws2, num_blocks} candidates to try.
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    std::vector<uint32_t> local_ws(3, 0);
    local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
    local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
    local_ws[2] = std::min<uint32_t>(gws[2],
                                     kwg_size / (local_ws[0] * local_ws[1]));
    const std::vector<std::vector<uint32_t>> candidates = {
        {local_ws[0], local_ws[1], local_ws[2], 1},
        {kwg_size / 16, 4, 4, 1},
        {kwg_size / 32, 4, 8, 1},
        {kwg_size / 32, 8, 4, 1},
        {kwg_size / 64, 8, 8, 1},
        {kwg_size / 64, 16, 4, 1},
        {kwg_size / 128, 8, 16, 1},
        {kwg_size / 128, 16, 8, 1},
        {kwg_size / 128, 32, 4, 1},
        {1, kwg_size / 32, 32, 1},
        {1, kwg_size / 64, 64, 1},
        {1, kwg_size / 128, 128, 1},
        {3, 15, 9, 1},
        {7, 15, 9, 1},
        {9, 7, 15, 1},
        {15, 7, 9, 1},
        {1, kwg_size, 1, 1},
        {4, 15, 8, 1},  // SNPE size
    };

    // Drop candidates the device cannot launch: a zero dimension (e.g.
    // kwg_size / 128 when kwg_size < 128) or a work-group larger than the
    // kernel's maximum. Enqueueing those returns an error and would fail the
    // MACE_CHECK in `func`.
    std::vector<std::vector<uint32_t>> results;
    results.reserve(candidates.size());
    for (const auto &candidate : candidates) {
      const uint64_t work_group_size =
          static_cast<uint64_t>(candidate[0]) * candidate[1] * candidate[2];
      if (work_group_size > 0 && work_group_size <= kwg_size) {
        results.push_back(candidate);
      }
    }
    return results;
  };

  cl::Event event;
  // Enqueues the kernel with the given parameters. With a timer, a full-range
  // launch is timed first and its duration decides how many blocks to use
  // (written back to params.back()); without one, params.back() is used as is.
  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
    cl_int error = CL_SUCCESS;
    const cl::NDRange local_range(params[0], params[1], params[2]);

    if (timer != nullptr) {
      timer->StartTiming();
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]),
          local_range, nullptr, &event);
      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
      timer->StopTiming();
      const double elapse_time = timer->ElapsedMicros();
      timer->ClearTiming();
      // One block per kMaxKernelExeTime of measured run time, capped at one
      // block per slice of the third dimension.
      params.back() = std::min(
          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[2]);
    }

    // Split gws[2] into blocks of block_size; any remainder becomes one
    // extra, final block.
    uint32_t num_blocks = params.back();
    const uint32_t block_size = gws[2] / num_blocks;
    if (gws[2] % num_blocks > 0) num_blocks++;
    for (uint32_t i = 0; i < num_blocks; ++i) {
      const uint32_t offset = i * block_size;
      const uint32_t gws2 =
          (i == num_blocks - 1) ? (gws[2] - offset) : block_size;
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel,
          cl::NDRange(0, 0, offset),
          cl::NDRange(gws[0], gws[1], gws2),
          local_range, nullptr, &event);
      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
      if (timer != nullptr) timer->AccumulateTiming();
    }
    return error;
  };

  OpenCLProfilingTimer timer(&event);
  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(
      tuning_key, lws, params_generator, func, &timer);

  if (future != nullptr) {
    future->wait_fn = [event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        OpenCLRuntime::Global()->GetCallStats(event, stats);
      }
    };
  }
}
// Runs (or tunes) a kernel over a 2-D global range via the global Tuner.
//
// kernel:     kernel to enqueue; its arguments must already be set.
// tuning_key: key passed to Tuner<uint32_t>::TuneOrRun.
// gws:        global work sizes; gws[0] and gws[1] are read.
// lws:        default parameters handed to the tuner. `func` below reads
//             params[0..1] as the local work size and params.back() as the
//             number of blocks the second dimension is split into.
// future:     if non-null, its wait_fn is set to wait on the last enqueued
//             event and, when stats are requested, to collect call stats.
void TuningOrRun2DKernel(cl::Kernel &kernel,
                         const std::string tuning_key,
                         const uint32_t *gws,
                         std::vector<uint32_t> &lws,
                         StatsFuture *future) {
  auto runtime = OpenCLRuntime::Global();
  const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(kernel);

  // Builds the list of {lws0, lws1, num_blocks} candidates to try.
  auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
    uint32_t local_ws[2];
    local_ws[0] = std::min<uint32_t>(gws[0], kwg_size);
    local_ws[1] = std::min<uint32_t>(gws[1], kwg_size / local_ws[0]);
    const std::vector<std::vector<uint32_t>> candidates = {
        {local_ws[0], local_ws[1], 1},
        {local_ws[1], local_ws[0], 1},
        {kwg_size / 4, 4, 1},
        {kwg_size / 16, 16, 1},
        {kwg_size / 32, 32, 1},
        {kwg_size / 64, 64, 1},
        {kwg_size / 128, 128, 1},
        {kwg_size / 256, 256, 1},
        {kwg_size / 512, 512, 1},
        {kwg_size, 1, 1},
        {1, kwg_size, 1},
    };

    // Drop candidates the device cannot launch: a zero dimension (e.g.
    // kwg_size / 512 when kwg_size < 512) or a work-group larger than the
    // kernel's maximum. Enqueueing those returns an error and would fail the
    // MACE_CHECK in `func`.
    std::vector<std::vector<uint32_t>> results;
    results.reserve(candidates.size());
    for (const auto &candidate : candidates) {
      const uint64_t work_group_size =
          static_cast<uint64_t>(candidate[0]) * candidate[1];
      if (work_group_size > 0 && work_group_size <= kwg_size) {
        results.push_back(candidate);
      }
    }
    return results;
  };

  cl::Event event;
  // Enqueues the kernel with the given parameters. With a timer, a full-range
  // launch is timed first and its duration decides how many blocks to use
  // (written back to params.back()); without one, params.back() is used as is.
  auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
    cl_int error = CL_SUCCESS;
    const cl::NDRange local_range(params[0], params[1]);

    if (timer != nullptr) {
      timer->StartTiming();
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel, cl::NullRange,
          cl::NDRange(gws[0], gws[1]),
          local_range, nullptr, &event);
      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
      timer->StopTiming();
      const double elapse_time = timer->ElapsedMicros();
      timer->ClearTiming();
      // One block per kMaxKernelExeTime of measured run time, capped at one
      // block per row of the second dimension.
      params.back() = std::min(
          static_cast<uint32_t>(elapse_time / kMaxKernelExeTime) + 1, gws[1]);
    }

    // Split gws[1] into blocks of block_size; any remainder becomes one
    // extra, final block.
    uint32_t num_blocks = params.back();
    const uint32_t block_size = gws[1] / num_blocks;
    if (gws[1] % num_blocks > 0) num_blocks++;
    for (uint32_t i = 0; i < num_blocks; ++i) {
      const uint32_t offset = i * block_size;
      const uint32_t gws1 =
          (i == num_blocks - 1) ? (gws[1] - offset) : block_size;
      error = runtime->command_queue().enqueueNDRangeKernel(
          kernel,
          cl::NDRange(0, offset),
          cl::NDRange(gws[0], gws1),
          local_range, nullptr, &event);
      MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
      if (timer != nullptr) timer->AccumulateTiming();
    }
    return error;
  };

  OpenCLProfilingTimer timer(&event);
  Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(tuning_key,
                                                     lws,
                                                     params_generator,
                                                     func,
                                                     &timer);

  if (future != nullptr) {
    future->wait_fn = [runtime, event](CallStats *stats) {
      event.wait();
      if (stats != nullptr) {
        runtime->GetCallStats(event, stats);
      }
    };
  }
}
} // namespace kernels
} // namespace mace
......@@ -18,7 +18,7 @@ const float kMaxKernelExeTime = 1000.0; // microseconds
enum BufferType {
FILTER = 0,
IN_OUT= 1,
IN_OUT = 1,
ARGUMENT = 2
};
......@@ -34,6 +34,19 @@ std::string DtToCLDt(const DataType dt);
std::string DtToUpstreamCLDt(const DataType dt);
// Enqueues `kernel` over a 3-D global range through Tuner<uint32_t>.
//   tuning_key: key handed to the tuner.
//   gws:        global work sizes; three elements are read.
//   lws:        default tuner parameters: the local work size followed by a
//               block count in the last element.
//               NOTE(review): whether the tuner modifies `lws` is not visible
//               here; confirm against tuner.h.
//   future:     may be null; otherwise its wait_fn is set to wait on the
//               kernel's event and optionally collect call stats.
void TuningOrRun3DKernel(cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
std::vector<uint32_t> &lws,
StatsFuture *future);
// Enqueues `kernel` over a 2-D global range through Tuner<uint32_t>.
//   tuning_key: key handed to the tuner.
//   gws:        global work sizes; two elements are read.
//   lws:        default tuner parameters: the local work size followed by a
//               block count in the last element.
//               NOTE(review): whether the tuner modifies `lws` is not visible
//               here; confirm against tuner.h.
//   future:     may be null; otherwise its wait_fn is set to wait on the
//               kernel's event and optionally collect call stats.
void TuningOrRun2DKernel(cl::Kernel &kernel,
const std::string tuning_key,
const uint32_t *gws,
std::vector<uint32_t> &lws,
StatsFuture *future);
inline void SetFuture(StatsFuture *future, const cl::Event &event) {
if (future != nullptr) {
future->wait_fn = [event](CallStats *stats) {
......@@ -48,7 +61,7 @@ inline void SetFuture(StatsFuture *future, const cl::Event &event) {
namespace {
template<typename T>
void AppendToStream(std::stringstream *ss, const std::string &delimiter, T v) {
(*ss) << v;
(*ss) << v;
}
template<typename T, typename... Args>
......@@ -56,8 +69,8 @@ void AppendToStream(std::stringstream *ss,
const std::string &delimiter,
T first,
Args... args) {
(*ss) << first << delimiter;
AppendToStream(ss, delimiter, args...);
(*ss) << first << delimiter;
AppendToStream(ss, delimiter, args...);
}
} // namespace
......
......@@ -64,95 +64,13 @@ static void Pooling(const Tensor *input,
lws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
lws[1] = std::min<uint32_t>(out_width, kwg_size / lws[0]);
lws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (lws[0] * lws[1]));
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{4, 15, 8, 1}, // SNPE size
};
};
cl::Event event;
// Enqueues pooling_kernel with the local work size params[0..2], splitting
// the third global dimension into slices; params.back() holds the slice count.
//   timer == nullptr: enqueue using the slice count already in params.
//   timer != nullptr: first time one launch over the whole range, set
//     params.back() to elapsed / kMaxKernelExeTime + 1 (capped at gws[2]),
//     then enqueue the slices, calling AccumulateTiming() after each one.
// Every enqueue result goes through MACE_CHECK; the last one is returned.
auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
  cl_int error = CL_SUCCESS;
  const bool tuning = (timer != nullptr);

  if (tuning) {
    timer->StartTiming();
    error = runtime->command_queue().enqueueNDRangeKernel(
        pooling_kernel, cl::NullRange,
        cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
    timer->StopTiming();
    const double whole_run_micros = timer->ElapsedMicros();
    timer->ClearTiming();
    params.back() = std::min(
        static_cast<uint32_t>(whole_run_micros / kMaxKernelExeTime) + 1,
        gws[2]);
  }

  // When gws[2] is not a multiple of the slice count, one extra launch is
  // added; the final launch covers whatever is left of the range.
  const uint32_t slices = params.back();
  const uint32_t slice_extent = gws[2] / slices;
  const uint32_t launches = (gws[2] % slices > 0) ? slices + 1 : slices;
  for (uint32_t k = 0; k < launches; ++k) {
    const uint32_t offset = k * slice_extent;
    const uint32_t extent =
        (k + 1 == launches) ? (gws[2] - offset) : slice_extent;
    error = runtime->command_queue().enqueueNDRangeKernel(
        pooling_kernel,
        cl::NDRange(0, 0, offset),
        cl::NDRange(gws[0], gws[1], extent),
        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
    if (tuning) {
      timer->AccumulateTiming();
    }
  }
  return error;
};
std::stringstream ss;
ss << "pooling_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
TuningOrRun3DKernel(pooling_kernel, ss.str(), gws, lws, future);
}
template<typename T>
......
......@@ -60,95 +60,13 @@ void ResizeBilinearFunctor<DeviceType::OPENCL, T>::operator()(
static_cast<uint32_t>(out_width),
static_cast<uint32_t>(out_height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(rb_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(out_width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(out_height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{4, 15, 8, 1}, // SNPE size
};
};
cl::Event event;
// Enqueues rb_kernel with the local work size params[0..2], splitting the
// third global dimension into slices; params.back() holds the slice count.
//   timer == nullptr: enqueue using the slice count already in params.
//   timer != nullptr: first time one launch over the whole range, set
//     params.back() to elapsed / kMaxKernelExeTime + 1 (capped at gws[2]),
//     then enqueue the slices, calling AccumulateTiming() after each one.
// Every enqueue result goes through MACE_CHECK; the last one is returned.
auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
  cl_int error = CL_SUCCESS;
  const bool tuning = (timer != nullptr);

  if (tuning) {
    timer->StartTiming();
    error = runtime->command_queue().enqueueNDRangeKernel(
        rb_kernel, cl::NullRange,
        cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
    timer->StopTiming();
    const double whole_run_micros = timer->ElapsedMicros();
    timer->ClearTiming();
    params.back() = std::min(
        static_cast<uint32_t>(whole_run_micros / kMaxKernelExeTime) + 1,
        gws[2]);
  }

  // When gws[2] is not a multiple of the slice count, one extra launch is
  // added; the final launch covers whatever is left of the range.
  const uint32_t slices = params.back();
  const uint32_t slice_extent = gws[2] / slices;
  const uint32_t launches = (gws[2] % slices > 0) ? slices + 1 : slices;
  for (uint32_t k = 0; k < launches; ++k) {
    const uint32_t offset = k * slice_extent;
    const uint32_t extent =
        (k + 1 == launches) ? (gws[2] - offset) : slice_extent;
    error = runtime->command_queue().enqueueNDRangeKernel(
        rb_kernel,
        cl::NDRange(0, 0, offset),
        cl::NDRange(gws[0], gws[1], extent),
        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
    if (tuning) {
      timer->AccumulateTiming();
    }
  }
  return error;
};
std::stringstream ss;
ss << "resize_bilinear_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
TuningOrRun3DKernel(rb_kernel, ss.str(), gws, lws, future);
}
template struct ResizeBilinearFunctor<DeviceType::OPENCL, float>;
......
......@@ -42,95 +42,13 @@ void SoftmaxFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *logits,
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(softmax_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(channel_blocks, kwg_size);
local_ws[1] = std::min<uint32_t>(width, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(height * batch, kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{4, 15, 8, 1}, // SNPE size
};
};
cl::Event event;
// Enqueues softmax_kernel with the local work size params[0..2], splitting
// the third global dimension into slices; params.back() holds the slice count.
//   timer == nullptr: enqueue using the slice count already in params.
//   timer != nullptr: first time one launch over the whole range, set
//     params.back() to elapsed / kMaxKernelExeTime + 1 (capped at gws[2]),
//     then enqueue the slices, calling AccumulateTiming() after each one.
// Every enqueue result goes through MACE_CHECK; the last one is returned.
auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
  cl_int error = CL_SUCCESS;
  const bool tuning = (timer != nullptr);

  if (tuning) {
    timer->StartTiming();
    error = runtime->command_queue().enqueueNDRangeKernel(
        softmax_kernel, cl::NullRange,
        cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
    timer->StopTiming();
    const double whole_run_micros = timer->ElapsedMicros();
    timer->ClearTiming();
    params.back() = std::min(
        static_cast<uint32_t>(whole_run_micros / kMaxKernelExeTime) + 1,
        gws[2]);
  }

  // When gws[2] is not a multiple of the slice count, one extra launch is
  // added; the final launch covers whatever is left of the range.
  const uint32_t slices = params.back();
  const uint32_t slice_extent = gws[2] / slices;
  const uint32_t launches = (gws[2] % slices > 0) ? slices + 1 : slices;
  for (uint32_t k = 0; k < launches; ++k) {
    const uint32_t offset = k * slice_extent;
    const uint32_t extent =
        (k + 1 == launches) ? (gws[2] - offset) : slice_extent;
    error = runtime->command_queue().enqueueNDRangeKernel(
        softmax_kernel,
        cl::NDRange(0, 0, offset),
        cl::NDRange(gws[0], gws[1], extent),
        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
    if (tuning) {
      timer->AccumulateTiming();
    }
  }
  return error;
};
std::stringstream ss;
ss << "softmax_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
TuningOrRun3DKernel(softmax_kernel, ss.str(), gws, lws, future);
}
template
......
......@@ -62,95 +62,13 @@ void SpaceToBatchFunctor<DeviceType::OPENCL, T>::operator()(Tensor *space_tensor
static_cast<uint32_t>(batch_tensor->dim(2)),
static_cast<uint32_t>(batch_tensor->dim(0) * batch_tensor->dim(1))};
std::vector<uint32_t> lws = {8, 16, 8, 1};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(s2b_kernel);
auto params_generator = [&]() -> std::vector<std::vector<uint32_t>> {
std::vector<uint32_t> local_ws(3, 0);
local_ws[0] = std::min<uint32_t>(chan_blk, kwg_size);
local_ws[1] = std::min<uint32_t>(32, kwg_size / local_ws[0]);
local_ws[2] = std::min<uint32_t>(32, kwg_size / (local_ws[0] * local_ws[1]));
return {
{local_ws[0], local_ws[1], local_ws[2], 1},
{kwg_size / 16, 4, 4, 1},
{kwg_size / 32, 4, 8, 1},
{kwg_size / 32, 8, 4, 1},
{kwg_size / 64, 8, 8, 1},
{kwg_size / 64, 16, 4, 1},
{kwg_size / 128, 8, 16, 1},
{kwg_size / 128, 16, 8, 1},
{kwg_size / 128, 32, 4, 1},
{1, kwg_size / 32, 32, 1},
{1, kwg_size / 64, 64, 1},
{1, kwg_size / 128, 128, 1},
{3, 15, 9, 1},
{7, 15, 9, 1},
{9, 7, 15, 1},
{15, 7, 9, 1},
{1, kwg_size, 1, 1},
{4, 15, 8, 1}, // SNPE size
};
};
cl::Event event;
// Enqueues s2b_kernel with the local work size params[0..2], splitting the
// third global dimension into slices; params.back() holds the slice count.
//   timer == nullptr: enqueue using the slice count already in params.
//   timer != nullptr: first time one launch over the whole range, set
//     params.back() to elapsed / kMaxKernelExeTime + 1 (capped at gws[2]),
//     then enqueue the slices, calling AccumulateTiming() after each one.
// Every enqueue result goes through MACE_CHECK; the last one is returned.
auto func = [&](std::vector<uint32_t> &params, Timer *timer) -> cl_int {
  cl_int error = CL_SUCCESS;
  const bool tuning = (timer != nullptr);

  if (tuning) {
    timer->StartTiming();
    error = runtime->command_queue().enqueueNDRangeKernel(
        s2b_kernel, cl::NullRange,
        cl::NDRange(gws[0], gws[1], gws[2]),
        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
    timer->StopTiming();
    const double whole_run_micros = timer->ElapsedMicros();
    timer->ClearTiming();
    params.back() = std::min(
        static_cast<uint32_t>(whole_run_micros / kMaxKernelExeTime) + 1,
        gws[2]);
  }

  // When gws[2] is not a multiple of the slice count, one extra launch is
  // added; the final launch covers whatever is left of the range.
  const uint32_t slices = params.back();
  const uint32_t slice_extent = gws[2] / slices;
  const uint32_t launches = (gws[2] % slices > 0) ? slices + 1 : slices;
  for (uint32_t k = 0; k < launches; ++k) {
    const uint32_t offset = k * slice_extent;
    const uint32_t extent =
        (k + 1 == launches) ? (gws[2] - offset) : slice_extent;
    error = runtime->command_queue().enqueueNDRangeKernel(
        s2b_kernel,
        cl::NDRange(0, 0, offset),
        cl::NDRange(gws[0], gws[1], extent),
        cl::NDRange(params[0], params[1], params[2]), nullptr, &event);
    MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
    if (tuning) {
      timer->AccumulateTiming();
    }
  }
  return error;
};
std::stringstream ss;
ss << kernel_name << "_"
<< batch_tensor->dim(0) << "_"
<< batch_tensor->dim(1) << "_"
<< batch_tensor->dim(2) << "_"
<< batch_tensor->dim(3);
OpenCLProfilingTimer timer(&event);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func,
&timer);
if (future != nullptr) {
future->wait_fn = [runtime, event](CallStats *stats) {
event.wait();
if (stats != nullptr) {
runtime->GetCallStats(event, stats);
}
};
}
TuningOrRun3DKernel(s2b_kernel, ss.str(), gws, lws, future);
}
template struct SpaceToBatchFunctor<DeviceType::OPENCL, float>;
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册