提交 116d33db 编写于 作者: L liuqi

Add auto-tuning for opencl kernel.

上级 a66e0e35
......@@ -87,6 +87,11 @@ bool SimpleNet::Run(RunMetadata *run_metadata) {
VLOG(1) << "Op " << op->debug_def().name()
<< " has shape: " << internal::MakeString(op->Output(0)->shape());
}
#ifdef __USE_OPENCL
if (device_type_ == DeviceType::OPENCL) {
OpenCLRuntime::Get()->command_queue().finish();
}
#endif
return true;
}
......
......@@ -11,7 +11,7 @@
namespace mace {
namespace kernels {
template <typename T>
template<typename T>
void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const Tensor *input,
const Tensor *scale,
......@@ -27,10 +27,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Get();
std::set<std::string> built_options;
auto dt = DataTypeToEnum<T>::value;
......@@ -38,9 +34,6 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
const std::vector<uint32_t> lws = {1, kwg_size, 1};
uint32_t idx = 0;
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
bm_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(scale->buffer())));
......@@ -50,18 +43,31 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
bm_kernel.setArg(idx++, epsilon_);
bm_kernel.setArg(idx++, *(static_cast<cl::Image2D *>(output->buffer())));
auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel);
auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
return {{8, 128, 1}, //SNPE size
{1, 1, 64},
{1, 1, 128},
{1, kwg_size/16, 16},
{1, kwg_size/32, 32},
{1, kwg_size/64, 64},
{1, kwg_size/128, 128},
{1, 1, kwg_size},
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1}};
};
auto func = [&](const std::vector<uint32_t>& params)->cl_int {
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
bm_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
......@@ -73,10 +79,10 @@ void BatchNormFunctor<DeviceType::OPENCL, T>::operator()(
};
std::stringstream ss;
ss << "batch_norm_opencl_kernel_"
<< input->dim(0) << "_"
<< input->dim(1) << "_"
<< input->dim(2) << "_"
<< input->dim(3);
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
......
......@@ -36,7 +36,7 @@ void BiasAddFunctor<DeviceType::OPENCL, T>::operator()(
auto bias_kernel = runtime->BuildKernel("bias_add", "bias_add", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bias_kernel);
const std::vector<uint32_t> lws = {1, kwg_size, 1};
const std::vector<uint32_t> lws = {8, 16, 8};
uint32_t idx = 0;
bias_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
......
......@@ -12,19 +12,17 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */
const int w = get_global_id(0);
const int hb = get_global_id(1);
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb));
DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb));
DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb));
DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb));
DATA_TYPE4 out = in0 + in1;
#if INPUT_NUM > 2
DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb));
DATA_TYPE4 in2 = READ_IMAGET(input2, SAMPLER, (int2)(w, hb));
out = out + in2;
#endif
#if INPUT_NUM > 3
DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb));
DATA_TYPE4 in3 = READ_IMAGET(input3, SAMPLER, (int2)(w, hb));
out = out + in3;
#endif
......
......@@ -54,9 +54,8 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc
+ out_channel_idx;
if (in_channel_idx < in_channel) {
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
const int size = (out_channel - out_channel_idx);
if (size < 4) {
switch (size) {
......@@ -119,9 +118,8 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels
+ channel_idx;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
const int size = channels - channel_idx;
if (size < 4) {
switch (size) {
......@@ -169,9 +167,8 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */
int h = get_global_id(1);
const int offset = w * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int2 coord = (int2)(w, h);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord);
VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord);
const int size = count - offset;
if (size < 4) {
switch (size) {
......
......@@ -21,10 +21,9 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
const int out_hb = get_global_id(2);
const int rounded_in_ch = in_ch_blks * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
DATA_TYPE4 out0 =
READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
......@@ -71,7 +70,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 || in_width_value >= in_width)); \
in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
in##i = READ_IMAGET(input, SAMPLER, (int2)(in_width_value, in_hb_value));
READ_INPUT(0);
READ_INPUT(1);
......@@ -81,10 +80,10 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */
#undef READ_INPUT
int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch;
weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
// Will prefetch L2 improve performance? How to pretch image data?
......
......@@ -16,10 +16,8 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int out_w_blks = get_global_size(1);
const int out_hb = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
......@@ -58,16 +56,16 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
int in_x_base = 0;
for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) {
DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx));
DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx));
DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx));
DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx));
DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.x, out_hb_idx));
DATA_TYPE4 in1 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.y, out_hb_idx));
DATA_TYPE4 in2 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.z, out_hb_idx));
DATA_TYPE4 in3 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.w, out_hb_idx));
const int filter_x0 = in_ch_blk << 2;
DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk));
DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk));
DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk));
DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk));
DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0, out_ch_blk));
DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 1, out_ch_blk));
DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 2, out_ch_blk));
DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 3, out_ch_blk));
// Will prefetch L2 improve performance? How to pretch image data?
out0 += in0.x * weights0;
......
......@@ -19,10 +19,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
const int out_hb = get_global_id(2);
const int rounded_in_ch = in_ch_blks * 4;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
#ifdef BIAS
DATA_TYPE4 out0 =
READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0));
READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0));
DATA_TYPE4 out1 = out0;
DATA_TYPE4 out2 = out0;
DATA_TYPE4 out3 = out0;
......@@ -72,7 +71,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
in_width_value = select(in_idx + in_width_value, \
-1, \
(in_width_value < 0 || in_width_value >= in_width)); \
in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value));
in##i = READ_IMAGET(input, SAMPLER, (int2)(in_width_value, in_hb_value));
READ_INPUT(0);
READ_INPUT(1);
......@@ -83,10 +82,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b]
#undef READ_INPUT
int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch;
weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk));
weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk));
weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk));
weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk));
weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk));
// Will prefetch L2 improve performance? How to pretch image data?
......
......@@ -25,17 +25,16 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w *
const float h_lerp = h_in - h_lower;
const float w_lerp = w_in - w_lower;
const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
const int in_w_offset = ch_blk * in_width;
const int in_h_offset = b * in_height;
DATA_TYPE4 top_left = READ_IMAGET(input, sampler,
DATA_TYPE4 top_left = READ_IMAGET(input, SAMPLER,
(int2)(in_w_offset + w_lower, in_h_offset + h_lower));
DATA_TYPE4 top_right = READ_IMAGET(input, sampler,
DATA_TYPE4 top_right = READ_IMAGET(input, SAMPLER,
(int2)(in_w_offset + w_upper, in_h_offset + h_lower));
DATA_TYPE4 bottom_left = READ_IMAGET(input, sampler,
DATA_TYPE4 bottom_left = READ_IMAGET(input, SAMPLER,
(int2)(in_w_offset + w_lower, in_h_offset + h_upper));
DATA_TYPE4 bottom_right = READ_IMAGET(input, sampler,
DATA_TYPE4 bottom_right = READ_IMAGET(input, SAMPLER,
(int2)(in_w_offset + w_upper, in_h_offset + h_upper));
DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp;
......
......@@ -43,10 +43,10 @@ static void Concat2(const Tensor *input0,
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel);
uint32_t lws[3];
lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
uint32_t lws[3] = {8, 16, 8};
// lws[0] = std::min<uint32_t>(channel_blk, kwg_size);
// lws[1] = std::min<uint32_t>(width, kwg_size / lws[0]);
// lws[2] = std::min<uint32_t>(height * batch, kwg_size / (lws[0] * lws[1]));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
concat_kernel, cl::NullRange,
......
......@@ -7,6 +7,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
......@@ -48,7 +49,6 @@ void Conv1x1(const Tensor *input,
auto program = runtime->program();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
......@@ -63,16 +63,51 @@ void Conv1x1(const Tensor *input,
conv_2d_kernel.setArg(idx++, static_cast<int>(height));
conv_2d_kernel.setArg(idx++, static_cast<int>(width));
auto command_queue = runtime->command_queue();
cl_int error;
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)),
cl::NDRange(4, 15, 8), // TODO auto tuning
nullptr, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 15, 8};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&kwg_size]()->std::vector<std::vector<uint32_t>> {
return {{4, 15, 8}, //SNPE size
{kwg_size/16, 4, 4},
{kwg_size/32, 4, 8},
{kwg_size/32, 8, 4},
{kwg_size/64, 8, 8},
{kwg_size/64, 16, 4},
{kwg_size/128, 8, 16},
{kwg_size/128, 16, 8},
{kwg_size/128, 32, 4},
{1, kwg_size/32, 32},
{1, kwg_size/64, 64},
{1, kwg_size/128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1}};
};
auto func = [&](const std::vector<uint32_t>& params)->cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "conv2d_1x1_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
}
extern void Conv2dOpenclK1x1S1(const Tensor *input,
......
......@@ -7,6 +7,7 @@
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
......@@ -54,15 +55,50 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter,
conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2);
auto command_queue = runtime->command_queue();
cl_int error;
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)),
cl::NDRange(16, 16, 4),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {4, 15, 8};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
return {{4, 15, 8}, //SNPE size
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1}};
};
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "conv2d_3x3_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
}
void Conv2dOpenclK3x3S1(const Tensor *input,
......
......@@ -7,6 +7,7 @@
#include "mace/kernels/conv_2d.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
......@@ -38,7 +39,6 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
auto program = runtime->program();
auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
uint32_t idx = 0;
conv_2d_kernel.setArg(idx++, *(static_cast<const cl::Image2D *>(input->buffer())));
......@@ -57,15 +57,50 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter,
conv_2d_kernel.setArg(idx++, padding[0] / 2);
conv_2d_kernel.setArg(idx++, padding[1] / 2);
auto command_queue = runtime->command_queue();
cl_int error;
error = command_queue.enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(static_cast<uint32_t>(channel_blocks), static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)),
cl::NDRange(16, 16, 4),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS, error);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width_blocks),
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel);
auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
return {{4, 15, 8}, //SNPE size
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1}};
};
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
conv_2d_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "conv2d_general_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
}
......
......@@ -7,6 +7,7 @@
#include "mace/core/runtime/opencl/opencl_runtime.h"
#include "mace/kernels/opencl/helper.h"
#include "mace/utils/utils.h"
#include "mace/utils/tuner.h"
namespace mace {
namespace kernels {
......@@ -22,10 +23,6 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
const index_t channel_blocks = RoundUpDiv4(channels);
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
auto runtime = OpenCLRuntime::Get();
auto program = runtime->program();
......@@ -33,38 +30,65 @@ void ReluFunctor<DeviceType::OPENCL, T>::operator()(const Tensor *input,
auto dt = DataTypeToEnum<T>::value;
built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt));
built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt));
cl::Kernel relu_kernel;
if (max_limit_ < 0) {
auto relu_kernel = runtime->BuildKernel("relu", "relu", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
const uint32_t lws[3] = {1, kwg_size, 1};
relu_kernel = runtime->BuildKernel("relu", "relu", built_options);
uint32_t idx = 0;
relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
relu_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
} else {
auto relu_kernel = runtime->BuildKernel("relu", "relux", built_options);
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
const uint32_t lws[3] = {1, kwg_size, 1};
relu_kernel = runtime->BuildKernel("relu", "relux", built_options);
uint32_t idx = 0;
relu_kernel.setArg(idx++, *(static_cast<const cl::Buffer *>(input->buffer())));
relu_kernel.setArg(idx++, max_limit_);
relu_kernel.setArg(idx++, *(static_cast<cl::Buffer *>(output->buffer())));
}
const uint32_t gws[3] = {static_cast<uint32_t>(channel_blocks),
static_cast<uint32_t>(width),
static_cast<uint32_t>(height * batch)};
const std::vector<uint32_t> lws = {8, 16, 8};
const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel);
auto params_generator = [&kwg_size]() -> std::vector<std::vector<uint32_t>> {
return {{4, 15, 8}, //SNPE size
{kwg_size / 16, 4, 4},
{kwg_size / 32, 4, 8},
{kwg_size / 32, 8, 4},
{kwg_size / 64, 8, 8},
{kwg_size / 64, 16, 4},
{kwg_size / 128, 8, 16},
{kwg_size / 128, 16, 8},
{kwg_size / 128, 32, 4},
{1, kwg_size / 32, 32},
{1, kwg_size / 64, 64},
{1, kwg_size / 128, 128},
{3, 15, 9},
{7, 15, 9},
{9, 7, 15},
{15, 7, 9},
{1, kwg_size, 1}};
};
auto func = [&](const std::vector<uint32_t> &params) -> cl_int {
cl_int error = runtime->command_queue().enqueueNDRangeKernel(
relu_kernel, cl::NullRange,
cl::NDRange(gws[0], gws[1], gws[2]),
cl::NDRange(lws[0], lws[1], lws[2]),
cl::NDRange(params[0], params[1], params[2]),
NULL, OpenCLRuntime::Get()->GetDefaultEvent());
MACE_CHECK(error == CL_SUCCESS);
}
MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error;
return error;
};
std::stringstream ss;
ss << "relu_opencl_kernel_"
<< output->dim(0) << "_"
<< output->dim(1) << "_"
<< output->dim(2) << "_"
<< output->dim(3);
Tuner<uint32_t>::Get()->template TuneOrRun<cl_int>(ss.str(),
lws,
params_generator,
func);
}
template
......
......@@ -102,6 +102,8 @@ BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half);
BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half);
// SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8
BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half);
BM_CONV_2D(1, 3, 512, 512, 7, 7, 2, SAME, 64, half);
BM_CONV_2D(1, 512, 64, 64, 1, 1, 1, SAME, 256, half);
// Test RGB <-> YUV
//BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float);
......
......@@ -260,8 +260,6 @@ int Main(int argc, char **argv) {
DeviceType_Parse(device, &device_type);
VLOG(0) << device_type;
if (device_type == DeviceType::OPENCL)
OpenCLRuntime::EnableProfiling();
// load model
std::ifstream model_file_stream(model_file, std::ios::in | std::ios::binary);
......@@ -296,9 +294,11 @@ int Main(int argc, char **argv) {
}
}
}
auto net = CreateNet(net_def, &ws, device_type, NetMode::INIT);
net->Run();
// create net
auto net = CreateNet(net_def, &ws, device_type);
net = CreateNet(net_def, &ws, device_type);
int64_t warmup_time_us = 0;
int64_t num_warmup_runs = 0;
......
......@@ -42,6 +42,7 @@ class Tuner {
} else {
// run
if (param_table_.find(param_key) != param_table_.end()) {
VLOG(1) << param_key << ": " << internal::MakeString(param_table_[param_key]);
return func(param_table_[param_key]);
} else {
return func(default_param);
......
......@@ -18,10 +18,9 @@ from tensorflow import gfile
# --input_file input_file \
# --mace_out_file icnet.out
def generate_data(shape):
np.random.seed(FLAGS.random_seed)
data = np.random.random(shape)
data = np.random.random(shape) * -1
print FLAGS.input_file
data.astype(np.float32).tofile(FLAGS.input_file)
print "Generate input file done."
......@@ -36,12 +35,8 @@ def valid_output(out_shape, mace_out_file, tf_out_value):
mace_out_value = load_data(mace_out_file)
if mace_out_value.size != 0:
mace_out_value = mace_out_value.reshape(out_shape)
np.testing.assert_allclose(tf_out_value, mace_out_value, rtol=0, atol=1e-3)
res = np.allclose(tf_out_value, mace_out_value, rtol=0, atol=1e-3)
if res:
print '=======================Passed! Haha======================'
else:
print '=======================Failed! Oops======================'
np.testing.assert_allclose(mace_out_value, tf_out_value, rtol=0.05)
print '=======================Passed! Haha======================'
else:
print '=======================Skip empty node==================='
......
......@@ -32,7 +32,7 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \
--output=${MODEL_DIR}/${MACE_MODEL_NAME} \
--input_node=input \
--output_node=GCN/br_result_2/fcn_br \
--data_type=DT_FLOAT \
--data_type=DT_HALF\
--runtime=gpu
......@@ -50,7 +50,7 @@ adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR}
adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR}
adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR}
num_threads=${1:-1}
num_threads=${1:-4}
adb </dev/null shell MACE_RUN_PARAMETER_PATH=${PHONE_DATA_DIR}/mace_run.config \
MACE_KERNEL_PATH=$KERNEL_DIR \
......@@ -77,4 +77,3 @@ python tools/validate.py --model_file ${TF_MODEL_FILE_PATH} \
--input_node input \
--output_node GCN/br_result_2/fcn_br\
--output_shape 1,512,512,2
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册