From 116d33db77ecadad5730f0eda9318d81b5833a4f Mon Sep 17 00:00:00 2001 From: liuqi Date: Thu, 7 Dec 2017 14:00:32 +0800 Subject: [PATCH] Add auto-tuning for opencl kernel. --- mace/core/net.cc | 5 ++ mace/kernels/opencl/batch_norm_opencl.cc | 48 ++++++++------ mace/kernels/opencl/bias_add_opencl.cc | 2 +- mace/kernels/opencl/cl/addn.cl | 10 ++- mace/kernels/opencl/cl/buffer_to_image.cl | 9 +-- mace/kernels/opencl/cl/conv_2d.cl | 13 ++-- mace/kernels/opencl/cl/conv_2d_1x1.cl | 20 +++--- mace/kernels/opencl/cl/conv_2d_3x3.cl | 13 ++-- mace/kernels/opencl/cl/resize_bilinear.cl | 9 ++- mace/kernels/opencl/concat.cc | 8 +-- mace/kernels/opencl/conv_2d_opencl_1x1.cc | 57 ++++++++++++---- mace/kernels/opencl/conv_2d_opencl_3x3.cc | 54 ++++++++++++--- mace/kernels/opencl/conv_2d_opencl_general.cc | 55 +++++++++++++--- mace/kernels/opencl/relu_opencl.cc | 66 +++++++++++++------ mace/ops/conv_2d_benchmark.cc | 2 + mace/tools/benchmark/benchmark_model.cc | 6 +- mace/utils/tuner.h | 1 + tools/validate.py | 11 +--- tools/validate_gcn.sh | 5 +- 19 files changed, 261 insertions(+), 133 deletions(-) diff --git a/mace/core/net.cc b/mace/core/net.cc index 40312f3d..e1b16a03 100644 --- a/mace/core/net.cc +++ b/mace/core/net.cc @@ -87,6 +87,11 @@ bool SimpleNet::Run(RunMetadata *run_metadata) { VLOG(1) << "Op " << op->debug_def().name() << " has shape: " << internal::MakeString(op->Output(0)->shape()); } +#ifdef __USE_OPENCL + if (device_type_ == DeviceType::OPENCL) { + OpenCLRuntime::Get()->command_queue().finish(); + } +#endif return true; } diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 09b160d7..a5362262 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -11,7 +11,7 @@ namespace mace { namespace kernels { -template +template void BatchNormFunctor::operator()( const Tensor *input, const Tensor *scale, @@ -27,10 +27,6 @@ void BatchNormFunctor::operator()( const index_t channel_blocks = RoundUpDiv4(channels); - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Get(); std::set built_options; auto dt = DataTypeToEnum::value; @@ -38,9 +34,6 @@ void BatchNormFunctor::operator()( built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); auto bm_kernel = runtime->BuildKernel("batch_norm", "batch_norm", built_options); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel); - const std::vector lws = {1, kwg_size, 1}; - uint32_t idx = 0; bm_kernel.setArg(idx++, *(static_cast(input->buffer()))); bm_kernel.setArg(idx++, *(static_cast(scale->buffer()))); @@ -50,18 +43,31 @@ void BatchNormFunctor::operator()( bm_kernel.setArg(idx++, epsilon_); bm_kernel.setArg(idx++, *(static_cast(output->buffer()))); - auto params_generator = [&kwg_size]()->std::vector> { + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + const std::vector lws = {8, 16, 8}; + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bm_kernel); + auto params_generator = [&kwg_size]() -> std::vector> { return {{8, 128, 1}, //SNPE size - {1, 1, 64}, - {1, 1, 128}, - {1, kwg_size/16, 16}, - {1, kwg_size/32, 32}, - {1, kwg_size/64, 64}, - {1, kwg_size/128, 128}, - {1, 1, kwg_size}, + {kwg_size / 16, 4, 4}, + {kwg_size / 32, 4, 8}, + {kwg_size / 32, 8, 4}, + {kwg_size / 64, 8, 8}, + {kwg_size / 64, 16, 4}, + {kwg_size / 128, 8, 16}, + {kwg_size / 128, 16, 8}, + {kwg_size / 128, 32, 4}, + {1, kwg_size / 32, 32}, + {1, kwg_size / 64, 64}, + {1, kwg_size / 128, 128}, + {3, 15, 9}, + {7, 15, 9}, + {9, 7, 15}, + {15, 7, 9}, {1, kwg_size, 1}}; }; - auto func = [&](const std::vector& params)->cl_int { + auto func = [&](const std::vector ¶ms) -> cl_int { cl_int error = runtime->command_queue().enqueueNDRangeKernel( bm_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), @@ -73,10 +79,10 @@ void BatchNormFunctor::operator()( }; std::stringstream ss; ss << "batch_norm_opencl_kernel_" - << input->dim(0) << "_" - << input->dim(1) << "_" - << input->dim(2) << "_" - << input->dim(3); + << output->dim(0) << "_" + << output->dim(1) << "_" + << output->dim(2) << "_" + << output->dim(3); Tuner::Get()->template TuneOrRun(ss.str(), lws, params_generator, diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc index b778c6b7..b03bbc5e 100644 --- a/mace/kernels/opencl/bias_add_opencl.cc +++ b/mace/kernels/opencl/bias_add_opencl.cc @@ -36,7 +36,7 @@ void BiasAddFunctor::operator()( auto bias_kernel = runtime->BuildKernel("bias_add", "bias_add", built_options); const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(bias_kernel); - const std::vector lws = {1, kwg_size, 1}; + const std::vector lws = {8, 16, 8}; uint32_t idx = 0; bias_kernel.setArg(idx++, *(static_cast(input->buffer()))); diff --git a/mace/kernels/opencl/cl/addn.cl b/mace/kernels/opencl/cl/addn.cl index a9309930..9504d12a 100644 --- a/mace/kernels/opencl/cl/addn.cl +++ b/mace/kernels/opencl/cl/addn.cl @@ -12,19 +12,17 @@ __kernel void addn(__read_only image2d_t input0, /* [c%4 * w * c/4, h * b] */ const int w = get_global_id(0); const int hb = get_global_id(1); - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - - DATA_TYPE4 in0 = READ_IMAGET(input0, sampler, (int2)(w, hb)); - DATA_TYPE4 in1 = READ_IMAGET(input1, sampler, (int2)(w, hb)); + DATA_TYPE4 in0 = READ_IMAGET(input0, SAMPLER, (int2)(w, hb)); + DATA_TYPE4 in1 = READ_IMAGET(input1, SAMPLER, (int2)(w, hb)); DATA_TYPE4 out = in0 + in1; #if INPUT_NUM > 2 - DATA_TYPE4 in2 = READ_IMAGET(input2, sampler, (int2)(w, hb)); + DATA_TYPE4 in2 = READ_IMAGET(input2, SAMPLER, (int2)(w, hb)); out = out + in2; #endif #if INPUT_NUM > 3 - DATA_TYPE4 in3 = READ_IMAGET(input3, sampler, (int2)(w, hb)); + DATA_TYPE4 in3 = READ_IMAGET(input3, SAMPLER, (int2)(w, hb)); out = out + in3; #endif diff --git a/mace/kernels/opencl/cl/buffer_to_image.cl b/mace/kernels/opencl/cl/buffer_to_image.cl index b7812d6e..78e009a6 100644 --- a/mace/kernels/opencl/cl/buffer_to_image.cl +++ b/mace/kernels/opencl/cl/buffer_to_image.cl @@ -54,9 +54,8 @@ __kernel void filter_image_to_buffer(__global DATA_TYPE *output, /* h, w, ic, oc + out_channel_idx; if (in_channel_idx < in_channel) { - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; int2 coord = (int2)(w, h); - VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord); + VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord); const int size = (out_channel - out_channel_idx); if (size < 4) { switch (size) { @@ -119,9 +118,8 @@ __kernel void in_out_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ const int offset = ((batch_idx * height + height_idx) * width + width_idx) * channels + channel_idx; - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; int2 coord = (int2)(w, h); - VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord); + VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord); const int size = channels - channel_idx; if (size < 4) { switch (size) { @@ -169,9 +167,8 @@ __kernel void arg_image_to_buffer(__global DATA_TYPE *output, /* nhwc */ int h = get_global_id(1); const int offset = w * 4; - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; int2 coord = (int2)(w, h); - VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, sampler, coord); + VEC_DATA_TYPE(DATA_TYPE, 4) values = CMD_TYPE(read_image, CMD_DATA_TYPE)(input, SAMPLER, coord); const int size = count - offset; if (size < 4) { switch (size) { diff --git a/mace/kernels/opencl/cl/conv_2d.cl b/mace/kernels/opencl/cl/conv_2d.cl index e5ddb3d7..060cb39a 100644 --- a/mace/kernels/opencl/cl/conv_2d.cl +++ b/mace/kernels/opencl/cl/conv_2d.cl @@ -21,10 +21,9 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ const int out_hb = get_global_id(2); const int rounded_in_ch = in_ch_blks * 4; - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; #ifdef BIAS DATA_TYPE4 out0 = - READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0)); DATA_TYPE4 out1 = out0; DATA_TYPE4 out2 = out0; DATA_TYPE4 out3 = out0; @@ -71,7 +70,7 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ in_width_value = select(in_idx + in_width_value, \ -1, \ (in_width_value < 0 || in_width_value >= in_width)); \ - in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value)); + in##i = READ_IMAGET(input, SAMPLER, (int2)(in_width_value, in_hb_value)); READ_INPUT(0); READ_INPUT(1); @@ -81,10 +80,10 @@ __kernel void conv_2d(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] */ #undef READ_INPUT int filter_idx = (in_ch_blk << 2) + (hb_idx * filter_width + width_idx) * rounded_in_ch; - weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); - weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); - weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk)); - weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); + weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk)); + weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk)); + weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk)); + weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk)); // Will prefetch L2 improve performance? How to pretch image data? diff --git a/mace/kernels/opencl/cl/conv_2d_1x1.cl b/mace/kernels/opencl/cl/conv_2d_1x1.cl index bf384467..1249f1fc 100644 --- a/mace/kernels/opencl/cl/conv_2d_1x1.cl +++ b/mace/kernels/opencl/cl/conv_2d_1x1.cl @@ -16,10 +16,8 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const int out_w_blks = get_global_size(1); const int out_hb = get_global_id(2); - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; - #ifdef BIAS - DATA_TYPE4 out0 = READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + DATA_TYPE4 out0 = READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0)); DATA_TYPE4 out1 = out0; DATA_TYPE4 out2 = out0; DATA_TYPE4 out3 = out0; @@ -58,16 +56,16 @@ __kernel void conv_2d_1x1(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] int in_x_base = 0; for (int in_ch_blk = 0; in_ch_blk < in_ch_blks; ++in_ch_blk) { - DATA_TYPE4 in0 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.x, out_hb_idx)); - DATA_TYPE4 in1 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.y, out_hb_idx)); - DATA_TYPE4 in2 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.z, out_hb_idx)); - DATA_TYPE4 in3 = READ_IMAGET(input, sampler, (int2)(in_x_base + w.w, out_hb_idx)); + DATA_TYPE4 in0 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.x, out_hb_idx)); + DATA_TYPE4 in1 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.y, out_hb_idx)); + DATA_TYPE4 in2 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.z, out_hb_idx)); + DATA_TYPE4 in3 = READ_IMAGET(input, SAMPLER, (int2)(in_x_base + w.w, out_hb_idx)); const int filter_x0 = in_ch_blk << 2; - DATA_TYPE4 weights0 = READ_IMAGET(filter, sampler, (int2)(filter_x0, out_ch_blk)); - DATA_TYPE4 weights1 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 1, out_ch_blk)); - DATA_TYPE4 weights2 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 2, out_ch_blk)); - DATA_TYPE4 weights3 = READ_IMAGET(filter, sampler, (int2)(filter_x0 + 3, out_ch_blk)); + DATA_TYPE4 weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0, out_ch_blk)); + DATA_TYPE4 weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 1, out_ch_blk)); + DATA_TYPE4 weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 2, out_ch_blk)); + DATA_TYPE4 weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_x0 + 3, out_ch_blk)); // Will prefetch L2 improve performance? How to pretch image data? out0 += in0.x * weights0; diff --git a/mace/kernels/opencl/cl/conv_2d_3x3.cl b/mace/kernels/opencl/cl/conv_2d_3x3.cl index 08bf04d3..9d032a72 100644 --- a/mace/kernels/opencl/cl/conv_2d_3x3.cl +++ b/mace/kernels/opencl/cl/conv_2d_3x3.cl @@ -19,10 +19,9 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] const int out_hb = get_global_id(2); const int rounded_in_ch = in_ch_blks * 4; - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; #ifdef BIAS DATA_TYPE4 out0 = - READ_IMAGET(bias, sampler, (int2)(out_ch_blk, 0)); + READ_IMAGET(bias, SAMPLER, (int2)(out_ch_blk, 0)); DATA_TYPE4 out1 = out0; DATA_TYPE4 out2 = out0; DATA_TYPE4 out3 = out0; @@ -72,7 +71,7 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] in_width_value = select(in_idx + in_width_value, \ -1, \ (in_width_value < 0 || in_width_value >= in_width)); \ - in##i = READ_IMAGET(input, sampler, (int2)(in_width_value, in_hb_value)); + in##i = READ_IMAGET(input, SAMPLER, (int2)(in_width_value, in_hb_value)); READ_INPUT(0); READ_INPUT(1); @@ -83,10 +82,10 @@ __kernel void conv_2d_3x3(__read_only image2d_t input, /* [c%4 * w * c/4, h * b] #undef READ_INPUT int filter_idx = (in_ch_blk << 2) + (hb_idx * 3 + width_idx) * rounded_in_ch; - weights0 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 0, out_ch_blk)); - weights1 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 1, out_ch_blk)); - weights2 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 2, out_ch_blk)); - weights3 = READ_IMAGET(filter, sampler, (int2)(filter_idx + 3, out_ch_blk)); + weights0 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 0, out_ch_blk)); + weights1 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 1, out_ch_blk)); + weights2 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 2, out_ch_blk)); + weights3 = READ_IMAGET(filter, SAMPLER, (int2)(filter_idx + 3, out_ch_blk)); // Will prefetch L2 improve performance? How to pretch image data? diff --git a/mace/kernels/opencl/cl/resize_bilinear.cl b/mace/kernels/opencl/cl/resize_bilinear.cl index efb769d2..72f09c96 100644 --- a/mace/kernels/opencl/cl/resize_bilinear.cl +++ b/mace/kernels/opencl/cl/resize_bilinear.cl @@ -25,17 +25,16 @@ __kernel void resize_bilinear_nocache(__read_only image2d_t input, /* [c%4 * w * const float h_lerp = h_in - h_lower; const float w_lerp = w_in - w_lower; - const sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; const int in_w_offset = ch_blk * in_width; const int in_h_offset = b * in_height; - DATA_TYPE4 top_left = READ_IMAGET(input, sampler, + DATA_TYPE4 top_left = READ_IMAGET(input, SAMPLER, (int2)(in_w_offset + w_lower, in_h_offset + h_lower)); - DATA_TYPE4 top_right = READ_IMAGET(input, sampler, + DATA_TYPE4 top_right = READ_IMAGET(input, SAMPLER, (int2)(in_w_offset + w_upper, in_h_offset + h_lower)); - DATA_TYPE4 bottom_left = READ_IMAGET(input, sampler, + DATA_TYPE4 bottom_left = READ_IMAGET(input, SAMPLER, (int2)(in_w_offset + w_lower, in_h_offset + h_upper)); - DATA_TYPE4 bottom_right = READ_IMAGET(input, sampler, + DATA_TYPE4 bottom_right = READ_IMAGET(input, SAMPLER, (int2)(in_w_offset + w_upper, in_h_offset + h_upper)); DATA_TYPE4 top = top_left + (top_right - top_left) * w_lerp; diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 80a23a44..f80f370d 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -43,10 +43,10 @@ static void Concat2(const Tensor *input0, const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(concat_kernel); - uint32_t lws[3]; - lws[0] = std::min(channel_blk, kwg_size); - lws[1] = std::min(width, kwg_size / lws[0]); - lws[2] = std::min(height * batch, kwg_size / (lws[0] * lws[1])); + uint32_t lws[3] = {8, 16, 8}; +// lws[0] = std::min(channel_blk, kwg_size); +// lws[1] = std::min(width, kwg_size / lws[0]); +// lws[2] = std::min(height * batch, kwg_size / (lws[0] * lws[1])); cl_int error = runtime->command_queue().enqueueNDRangeKernel( concat_kernel, cl::NullRange, diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index d759689c..1fe00494 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -7,6 +7,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/utils.h" +#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -48,7 +49,6 @@ void Conv1x1(const Tensor *input, auto program = runtime->program(); auto conv_2d_kernel = runtime->BuildKernel("conv_2d_1x1", "conv_2d_1x1", built_options); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); uint32_t idx = 0; conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); @@ -63,16 +63,51 @@ void Conv1x1(const Tensor *input, conv_2d_kernel.setArg(idx++, static_cast(height)); conv_2d_kernel.setArg(idx++, static_cast(width)); - auto command_queue = runtime->command_queue(); - cl_int error; - error = command_queue.enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, - cl::NDRange(static_cast(channel_blocks), - static_cast(width_blocks), - static_cast(height * batch)), - cl::NDRange(4, 15, 8), // TODO auto tuning - nullptr, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS, error); + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + const std::vector lws = {8, 15, 8}; + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); + auto params_generator = [&kwg_size]()->std::vector> { + return {{4, 15, 8}, //SNPE size + {kwg_size/16, 4, 4}, + {kwg_size/32, 4, 8}, + {kwg_size/32, 8, 4}, + {kwg_size/64, 8, 8}, + {kwg_size/64, 16, 4}, + {kwg_size/128, 8, 16}, + {kwg_size/128, 16, 8}, + {kwg_size/128, 32, 4}, + {1, kwg_size/32, 32}, + {1, kwg_size/64, 64}, + {1, kwg_size/128, 128}, + {3, 15, 9}, + {7, 15, 9}, + {9, 7, 15}, + {15, 7, 9}, + {1, kwg_size, 1}}; + }; + auto func = [&](const std::vector& params)->cl_int { + cl_int error = runtime->command_queue().enqueueNDRangeKernel( + conv_2d_kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), + NULL, OpenCLRuntime::Get()->GetDefaultEvent()); + + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + return error; + }; + std::stringstream ss; + ss << "conv2d_1x1_opencl_kernel_" + << output->dim(0) << "_" + << output->dim(1) << "_" + << output->dim(2) << "_" + << output->dim(3); + Tuner::Get()->template TuneOrRun(ss.str(), + lws, + params_generator, + func); + } extern void Conv2dOpenclK1x1S1(const Tensor *input, diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 7b7453ad..858fc5fc 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -7,6 +7,7 @@ #include "mace/kernels/conv_2d.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/utils.h" +#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -54,15 +55,50 @@ static void Conv2d3x3S12(const Tensor *input, const Tensor *filter, conv_2d_kernel.setArg(idx++, padding[0] / 2); conv_2d_kernel.setArg(idx++, padding[1] / 2); - auto command_queue = runtime->command_queue(); - cl_int error; - error = command_queue.enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, - cl::NDRange(static_cast(channel_blocks), static_cast(width_blocks), - static_cast(height * batch)), - cl::NDRange(16, 16, 4), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS, error); + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + const std::vector lws = {4, 15, 8}; + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); + auto params_generator = [&kwg_size]() -> std::vector> { + return {{4, 15, 8}, //SNPE size + {kwg_size / 16, 4, 4}, + {kwg_size / 32, 4, 8}, + {kwg_size / 32, 8, 4}, + {kwg_size / 64, 8, 8}, + {kwg_size / 64, 16, 4}, + {kwg_size / 128, 8, 16}, + {kwg_size / 128, 16, 8}, + {kwg_size / 128, 32, 4}, + {1, kwg_size / 32, 32}, + {1, kwg_size / 64, 64}, + {1, kwg_size / 128, 128}, + {3, 15, 9}, + {7, 15, 9}, + {9, 7, 15}, + {15, 7, 9}, + {1, kwg_size, 1}}; + }; + auto func = [&](const std::vector ¶ms) -> cl_int { + cl_int error = runtime->command_queue().enqueueNDRangeKernel( + conv_2d_kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), + NULL, OpenCLRuntime::Get()->GetDefaultEvent()); + + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + return error; + }; + std::stringstream ss; + ss << "conv2d_3x3_opencl_kernel_" + << output->dim(0) << "_" + << output->dim(1) << "_" + << output->dim(2) << "_" + << output->dim(3); + Tuner::Get()->template TuneOrRun(ss.str(), + lws, + params_generator, + func); } void Conv2dOpenclK3x3S1(const Tensor *input, diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index e46ecbca..7a74f86b 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -7,6 +7,7 @@ #include "mace/kernels/conv_2d.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/utils.h" +#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -38,7 +39,6 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter, auto program = runtime->program(); auto conv_2d_kernel = runtime->BuildKernel("conv_2d", "conv_2d", built_options); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); uint32_t idx = 0; conv_2d_kernel.setArg(idx++, *(static_cast(input->buffer()))); @@ -57,15 +57,50 @@ void Conv2dOpencl(const Tensor *input, const Tensor *filter, conv_2d_kernel.setArg(idx++, padding[0] / 2); conv_2d_kernel.setArg(idx++, padding[1] / 2); - auto command_queue = runtime->command_queue(); - cl_int error; - error = command_queue.enqueueNDRangeKernel( - conv_2d_kernel, cl::NullRange, - cl::NDRange(static_cast(channel_blocks), static_cast(width_blocks), - static_cast(height * batch)), - cl::NDRange(16, 16, 4), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS, error); + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width_blocks), + static_cast(height * batch)}; + const std::vector lws = {8, 16, 8}; + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(conv_2d_kernel); + auto params_generator = [&kwg_size]() -> std::vector> { + return {{4, 15, 8}, //SNPE size + {kwg_size / 16, 4, 4}, + {kwg_size / 32, 4, 8}, + {kwg_size / 32, 8, 4}, + {kwg_size / 64, 8, 8}, + {kwg_size / 64, 16, 4}, + {kwg_size / 128, 8, 16}, + {kwg_size / 128, 16, 8}, + {kwg_size / 128, 32, 4}, + {1, kwg_size / 32, 32}, + {1, kwg_size / 64, 64}, + {1, kwg_size / 128, 128}, + {3, 15, 9}, + {7, 15, 9}, + {9, 7, 15}, + {15, 7, 9}, + {1, kwg_size, 1}}; + }; + auto func = [&](const std::vector ¶ms) -> cl_int { + cl_int error = runtime->command_queue().enqueueNDRangeKernel( + conv_2d_kernel, cl::NullRange, + cl::NDRange(gws[0], gws[1], gws[2]), + cl::NDRange(params[0], params[1], params[2]), + NULL, OpenCLRuntime::Get()->GetDefaultEvent()); + + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + return error; + }; + std::stringstream ss; + ss << "conv2d_general_opencl_kernel_" + << output->dim(0) << "_" + << output->dim(1) << "_" + << output->dim(2) << "_" + << output->dim(3); + Tuner::Get()->template TuneOrRun(ss.str(), + lws, + params_generator, + func); } diff --git a/mace/kernels/opencl/relu_opencl.cc b/mace/kernels/opencl/relu_opencl.cc index 46988793..28ff881b 100644 --- a/mace/kernels/opencl/relu_opencl.cc +++ b/mace/kernels/opencl/relu_opencl.cc @@ -7,6 +7,7 @@ #include "mace/core/runtime/opencl/opencl_runtime.h" #include "mace/kernels/opencl/helper.h" #include "mace/utils/utils.h" +#include "mace/utils/tuner.h" namespace mace { namespace kernels { @@ -22,10 +23,6 @@ void ReluFunctor::operator()(const Tensor *input, const index_t channel_blocks = RoundUpDiv4(channels); - const uint32_t gws[3] = {static_cast(channel_blocks), - static_cast(width), - static_cast(height * batch)}; - auto runtime = OpenCLRuntime::Get(); auto program = runtime->program(); @@ -33,38 +30,65 @@ void ReluFunctor::operator()(const Tensor *input, auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); + cl::Kernel relu_kernel; if (max_limit_ < 0) { - auto relu_kernel = runtime->BuildKernel("relu", "relu", built_options); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel); - const uint32_t lws[3] = {1, kwg_size, 1}; + relu_kernel = runtime->BuildKernel("relu", "relu", built_options); uint32_t idx = 0; relu_kernel.setArg(idx++, *(static_cast(input->buffer()))); relu_kernel.setArg(idx++, *(static_cast(output->buffer()))); - - cl_int error = runtime->command_queue().enqueueNDRangeKernel( - relu_kernel, cl::NullRange, - cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), - NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); } else { - auto relu_kernel = runtime->BuildKernel("relu", "relux", built_options); - const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel); - const uint32_t lws[3] = {1, kwg_size, 1}; + relu_kernel = runtime->BuildKernel("relu", "relux", built_options); uint32_t idx = 0; relu_kernel.setArg(idx++, *(static_cast(input->buffer()))); relu_kernel.setArg(idx++, max_limit_); relu_kernel.setArg(idx++, *(static_cast(output->buffer()))); - + } + const uint32_t gws[3] = {static_cast(channel_blocks), + static_cast(width), + static_cast(height * batch)}; + const std::vector lws = {8, 16, 8}; + const uint32_t kwg_size = runtime->GetKernelMaxWorkGroupSize(relu_kernel); + auto params_generator = [&kwg_size]() -> std::vector> { + return {{4, 15, 8}, //SNPE size + {kwg_size / 16, 4, 4}, + {kwg_size / 32, 4, 8}, + {kwg_size / 32, 8, 4}, + {kwg_size / 64, 8, 8}, + {kwg_size / 64, 16, 4}, + {kwg_size / 128, 8, 16}, + {kwg_size / 128, 16, 8}, + {kwg_size / 128, 32, 4}, + {1, kwg_size / 32, 32}, + {1, kwg_size / 64, 64}, + {1, kwg_size / 128, 128}, + {3, 15, 9}, + {7, 15, 9}, + {9, 7, 15}, + {15, 7, 9}, + {1, kwg_size, 1}}; + }; + auto func = [&](const std::vector ¶ms) -> cl_int { cl_int error = runtime->command_queue().enqueueNDRangeKernel( relu_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), - cl::NDRange(lws[0], lws[1], lws[2]), + cl::NDRange(params[0], params[1], params[2]), NULL, OpenCLRuntime::Get()->GetDefaultEvent()); - MACE_CHECK(error == CL_SUCCESS); - } + + MACE_CHECK(error == CL_SUCCESS) << "Error code: " << error; + return error; + }; + std::stringstream ss; + ss << "relu_opencl_kernel_" + << output->dim(0) << "_" + << output->dim(1) << "_" + << output->dim(2) << "_" + << output->dim(3); + Tuner::Get()->template TuneOrRun(ss.str(), + lws, + params_generator, + func); } template diff --git a/mace/ops/conv_2d_benchmark.cc b/mace/ops/conv_2d_benchmark.cc index b7f6fc73..0870f2b8 100644 --- a/mace/ops/conv_2d_benchmark.cc +++ b/mace/ops/conv_2d_benchmark.cc @@ -102,6 +102,8 @@ BM_CONV_2D(1, 32, 60, 60, 1, 1, 1, VALID, 128, half); BM_CONV_2D(1, 128, 60, 60, 3, 3, 1, VALID, 128, half); // SNPE GPU ExecutionDuration = 506us, % ALU Utilization = 106.8 BM_CONV_2D(1, 32, 60, 60, 3, 3, 1, SAME, 32, half); +BM_CONV_2D(1, 3, 512, 512, 7, 7, 2, SAME, 64, half); +BM_CONV_2D(1, 512, 64, 64, 1, 1, 1, SAME, 256, half); // Test RGB <-> YUV //BM_CONV_2D(1, 3, 2160, 1080, 1, 1, 1, VALID, 3, float); diff --git a/mace/tools/benchmark/benchmark_model.cc b/mace/tools/benchmark/benchmark_model.cc index 09ac6fd6..06816386 100644 --- a/mace/tools/benchmark/benchmark_model.cc +++ b/mace/tools/benchmark/benchmark_model.cc @@ -260,8 +260,6 @@ int Main(int argc, char **argv) { DeviceType_Parse(device, &device_type); VLOG(0) << device_type; - if (device_type == DeviceType::OPENCL) - OpenCLRuntime::EnableProfiling(); // load model std::ifstream model_file_stream(model_file, std::ios::in | std::ios::binary); @@ -296,9 +294,11 @@ int Main(int argc, char **argv) { } } } + auto net = CreateNet(net_def, &ws, device_type, NetMode::INIT); + net->Run(); // create net - auto net = CreateNet(net_def, &ws, device_type); + net = CreateNet(net_def, &ws, device_type); int64_t warmup_time_us = 0; int64_t num_warmup_runs = 0; diff --git a/mace/utils/tuner.h b/mace/utils/tuner.h index 722c9c86..32c21e9b 100644 --- a/mace/utils/tuner.h +++ b/mace/utils/tuner.h @@ -42,6 +42,7 @@ class Tuner { } else { // run if (param_table_.find(param_key) != param_table_.end()) { + VLOG(1) << param_key << ": " << internal::MakeString(param_table_[param_key]); return func(param_table_[param_key]); } else { return func(default_param); diff --git a/tools/validate.py b/tools/validate.py index 42a856b0..9edbdd24 100644 --- a/tools/validate.py +++ b/tools/validate.py @@ -18,10 +18,9 @@ from tensorflow import gfile # --input_file input_file \ # --mace_out_file icnet.out - def generate_data(shape): np.random.seed(FLAGS.random_seed) - data = np.random.random(shape) + data = np.random.random(shape) * -1 print FLAGS.input_file data.astype(np.float32).tofile(FLAGS.input_file) print "Generate input file done." @@ -36,12 +35,8 @@ def valid_output(out_shape, mace_out_file, tf_out_value): mace_out_value = load_data(mace_out_file) if mace_out_value.size != 0: mace_out_value = mace_out_value.reshape(out_shape) - np.testing.assert_allclose(tf_out_value, mace_out_value, rtol=0, atol=1e-3) - res = np.allclose(tf_out_value, mace_out_value, rtol=0, atol=1e-3) - if res: - print '=======================Passed! Haha======================' - else: - print '=======================Failed! Oops======================' + np.testing.assert_allclose(mace_out_value, tf_out_value, rtol=0.05) + print '=======================Passed! Haha======================' else: print '=======================Skip empty node===================' diff --git a/tools/validate_gcn.sh b/tools/validate_gcn.sh index f4dfc6eb..524c752b 100644 --- a/tools/validate_gcn.sh +++ b/tools/validate_gcn.sh @@ -32,7 +32,7 @@ bazel-bin/mace/python/tools/tf_converter --input=${TF_MODEL_FILE_PATH} \ --output=${MODEL_DIR}/${MACE_MODEL_NAME} \ --input_node=input \ --output_node=GCN/br_result_2/fcn_br \ - --data_type=DT_FLOAT \ + --data_type=DT_HALF\ --runtime=gpu @@ -50,7 +50,7 @@ adb push ${MODEL_DIR}/${MACE_MODEL_NAME} ${PHONE_DATA_DIR} adb push ${MODEL_DIR}/${INPUT_FILE_NAME} ${PHONE_DATA_DIR} adb push bazel-bin/mace/examples/mace_run ${PHONE_DATA_DIR} -num_threads=${1:-1} +num_threads=${1:-4} adb