diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 3477664fbd5952ecf486dc9fa7ce015170b53bd2..f9b0d5e2e99ce00d8a4e961ab7b9c24c56b458ed 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -362,6 +362,11 @@ OpenCLRuntime::OpenCLRuntime(): } } + device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, + &device_global_mem_cache_size_); + + device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, + &device_compute_units_); const char *out_of_range_check = getenv("MACE_OUT_OF_RANGE_CHECK"); if (out_of_range_check != nullptr && strlen(out_of_range_check) == 1 && out_of_range_check[0] == '1') { @@ -386,6 +391,14 @@ cl::Device &OpenCLRuntime::device() { return *device_; } cl::CommandQueue &OpenCLRuntime::command_queue() { return *command_queue_; } +const uint64_t OpenCLRuntime::device_global_mem_cache_size() const { + return device_global_mem_cache_size_; +} + +const uint32_t OpenCLRuntime::device_compute_units() const { + return device_compute_units_; +} + bool OpenCLRuntime::BuildProgramFromBinary( const std::string &built_program_key, const std::string &build_options_str, diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 8228324c04ea0a000dadeb152b9bb6b49b1427e9..521698b7fc33d4b1ee7fa8f3e19265e511870081 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -73,6 +73,8 @@ class OpenCLRuntime { cl::CommandQueue &command_queue(); const GPUType gpu_type() const; const std::string platform_info() const; + const uint64_t device_global_mem_cache_size() const; + const uint32_t device_compute_units() const; cl::Kernel BuildKernel(const std::string &program_name, const std::string &kernel_name, @@ -128,6 +130,9 @@ class OpenCLRuntime { bool program_map_changed_; std::unique_ptr storage_; bool is_profiling_enabled_; + uint64_t device_global_mem_cache_size_; + uint32_t 
device_compute_units_; + static GPUPerfHint kGPUPerfHint; static GPUPriorityHint kGPUPriorityHint; diff --git a/mace/kernels/opencl/REAEMD.md b/mace/kernels/opencl/REAEMD.md deleted file mode 100644 index c6f42fd5c04d53ac3da5476d10198aaa17244d6d..0000000000000000000000000000000000000000 --- a/mace/kernels/opencl/REAEMD.md +++ /dev/null @@ -1,58 +0,0 @@ -OpenCL Image Storage Layout -=== -Use **Image** object to optimize memory access and parallel computing based on OpenCL 2.0. - - -Design the corresponding **Image** format to optimize memory access for different Op algorithm. -Each pixel of **Image** object contains 4 elements(e.g. RGBA). - - -The Followings are the **Buffer** and **Image** format for all **Tensors**. - -Input/Output ---- -**Mace** use NHWC format Input/Output. - -| Tensor| Buffer| Image Size [Width, Height]| Explanation| -| --------- | :---------:|:--------:|:----:| -|Channel-Major Input/Output | NHWC | [W * (C+3)/4, N * H] | Default Input/Output format| -|Height-Major Input/Output | NHWC | [W * C, N * (H+3)/4] | Winograd Convolution format| -|Width-Major Input/Output | NHWC | [(W+3)/4 * C, N * H] | Winograd Convolution format| - -Each Pixel of **Image** contains 4 elements. The below table list the coordination relation -between **Image** and **Buffer**. 
- -| Tensor| Pixel Coordinate Relation| Explanation -| --------- | :---------:| :-----: | -|Channel-Major Input/Output | P[i, j] = {E[n, h, w, c] | (n=j/H, h=j%H, w=i%W, c=[i/W * 4 + k])}| k=[0, 4)| -|Height-Major Input/Output | P[i, j] = {E[n, h, w, c] | (n=j%N, h=[j/H*4 + k], w=i%W, c=i/W)}| k=[0, 4)| -|Width-Major Input/Output | P[i, j] = {E[n, h, w, c] | (n=j/H, h=j%H, w=[i%W*4 + k], c=i/W)}| k=[0, 4)| - - -Filter ---- -| Tensor| Buffer| Image Size [Width, Height]| Explanation| -| --------- | :---------:|:--------:|:----:| -|Convolution Filter | HWOI | [H * W * RoundUp<4>(I), (O+3)/4]|Convolution filter format,There is no difference compared to [H*w*I, (O+3)/4]| -|Depthwise Convlution Filter | HWIM | [H * W * M, (I+3)/4]|Depthwise-Convolution filter format| - -Each Pixel of **Image** contains 4 elements. The below table list the coordination relation -between **Image** and **Buffer**. - -| Tensor| Pixel Coordinate Relation| Explanation| -| --------- | :---------:| :-----:| -|Convolution Filter | P[m, n] = {E[h, w, o, i] | (h=T/W, w=T%W, o=[n*4+k], i=m%RI)}| RI=((I + 3) / 4) * 4, T=m/RI, k=[0, 4)| -|Depthwise Convlution Filter | P[m, n] = {E[h, w, i, 0] | (h=m/W, w=m%W, i=[n*4+k])}| only support multiplier == 1, k=[0, 4)| - -1-D Argument ---- -| Tensor| Buffer| Image Size [Width, Height]| Explanation| -| --------- | :---------:|:--------:|:----:| -|1-D Argument | W | [(W+3)/4, 1] | 1D argument format, e.g. Bias| - -Each Pixel of **Image** contains 4 elements. The below table list the coordination relation -between **Image** and **Buffer**. 
- -| Tensor| Pixel Coordinate Relation| Explanation| -| --------- | :---------:| :-----:| -|1-D Argument | P[i, 0] = {E[w] | w=i*4+k}| k=[0, 4)| diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation.cc similarity index 93% rename from mace/kernels/opencl/activation_opencl.cc rename to mace/kernels/opencl/activation.cc index 0e29e00f34f93935292304bddc0b91be8297bca8..5cee48620aa0aa6be6600bbbe331016a879c4c54 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation.cc @@ -21,7 +21,6 @@ namespace mace { namespace kernels { - template void ActivationFunctor::operator()(const Tensor *input, const Tensor *alpha, @@ -56,23 +55,23 @@ void ActivationFunctor::operator()(const Tensor *input, } switch (activation_) { case RELU: - tuning_key_prefix_ = "relu_opencl_kernel_"; + tuning_key_prefix_ = "relu_opencl_kernel"; built_options.emplace("-DUSE_RELU"); break; case RELUX: - tuning_key_prefix_ = "relux_opencl_kernel_"; + tuning_key_prefix_ = "relux_opencl_kernel"; built_options.emplace("-DUSE_RELUX"); break; case PRELU: - tuning_key_prefix_ = "prelu_opencl_kernel_"; + tuning_key_prefix_ = "prelu_opencl_kernel"; built_options.emplace("-DUSE_PRELU"); break; case TANH: - tuning_key_prefix_ = "tanh_opencl_kernel_"; + tuning_key_prefix_ = "tanh_opencl_kernel"; built_options.emplace("-DUSE_TANH"); break; case SIGMOID: - tuning_key_prefix_ = "sigmoid_opencl_kernel_"; + tuning_key_prefix_ = "sigmoid_opencl_kernel"; built_options.emplace("-DUSE_SIGMOID"); break; default: @@ -110,7 +109,7 @@ void ActivationFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; + const std::vector lws = Default3DLocalWS(gws, kwg_size_); std::string tuning_key = Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 
b4e2493f7876cc8b2d12dae66a5c70be3606ebb4..105435d5a48e912bf2c147d628d9f12581ebeea1 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -106,10 +106,10 @@ void AddNFunctor::operator()( } const std::vector lws = {kwg_size_ / 16, 16, 0}; - std::stringstream ss; - ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1] - << "_" << output_shape[2] << "_" << output_shape[3]; - TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + std::string tuning_key = + Concat("addn_opencl_kernel", output_tensor->dim(0), output_tensor->dim(1), + output_tensor->dim(2), output_tensor->dim(3)); + TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm.cc similarity index 94% rename from mace/kernels/opencl/batch_norm_opencl.cc rename to mace/kernels/opencl/batch_norm.cc index 21adfd9626f5e60f9962fc6576299a007ffc2bad..f28c9ccc6cda25ec713c108bc1eae2ad3f9a38ed 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -116,9 +116,12 @@ void BatchNormFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; + std::vector lws(4, 0); + lws[1] = std::min(gws[1], kwg_size_); + lws[0] = std::min(4, kwg_size_ / lws[1]); + lws[2] = std::min(gws[2], kwg_size_ / (lws[1] * lws[0])); std::string tuning_key = - Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), + Concat("batch_norm_opencl_kernel", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add.cc similarity index 98% rename from mace/kernels/opencl/bias_add_opencl.cc rename to mace/kernels/opencl/bias_add.cc index 
5cffe75caf94f61af804ab71f6381233df6012e1..b6d2b4b1855d2210cb25fb6b99921800d18a6cba 100644 --- a/mace/kernels/opencl/bias_add_opencl.cc +++ b/mace/kernels/opencl/bias_add.cc @@ -79,7 +79,7 @@ void BiasAddFunctor::operator()(const Tensor *input, input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8}; + const std::vector lws = Default3DLocalWS(gws, kwg_size_); cl::Event event; cl_int error; diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index b30ecb69b60cb0a12dadf63c245e606227d63ca5..7cb082544f55c2bf72711ec0fe6ec0e8448442eb 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -90,14 +90,11 @@ void ChannelShuffleFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; - std::stringstream ss; - ss << "channel_shuffle_opencl_kernel_" - << output->dim(0) << "_" - << output->dim(1) << "_" - << output->dim(2) << "_" - << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + const std::vector lws = Default3DLocalWS(gws, kwg_size_); + std::string tuning_key = + Concat("channel_shuffle_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 4dacf8cfa8bf8fba02c669e889f990831c1fc9ef..514da7840437f6ea7e3f8740d8cc2923279230e8 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -21,6 +21,23 @@ namespace mace { namespace kernels { +namespace { +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + lws[0] = 
std::min(base, kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min(base, kwg_size / lws_size); + return lws; +} + +} // namespace + + static void Concat2(cl::Kernel *kernel, const Tensor *input0, const Tensor *input1, @@ -95,11 +112,11 @@ static void Concat2(cl::Kernel *kernel, *prev_input_shape = input0->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 0}; - std::stringstream ss; - ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) - << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(*kernel, ss.str(), gws, lws, future); + const std::vector lws = LocalWS(gws, *kwg_size); + std::string tuning_key = + Concat("concat_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); @@ -149,7 +166,6 @@ static void ConcatN(cl::Kernel *kernel, index_t chan_blk_offset = 0; cl::Event event; CallStats call_stats{INT64_MAX, 0}; - const std::vector lws = {8, *kwg_size / 64, 8, 1}; for (int i = 0; i < inputs_count; ++i) { const Tensor *input = input_list[i]; index_t input_channel_blk = input->dim(3) / 4; @@ -157,6 +173,7 @@ static void ConcatN(cl::Kernel *kernel, static_cast(input_channel_blk), static_cast(width), static_cast(batch * height), }; + const std::vector lws = LocalWS(gws, *kwg_size); uint32_t idx = 0; if (runtime->IsOutOfRangeCheckEnabled()) { @@ -183,6 +200,6 @@ static void ConcatN(cl::Kernel *kernel, for (size_t j = 0; j < 3; ++j) { roundup_gws[j] = RoundUp(gws[j], lws[j]); } error = runtime->command_queue().enqueueNDRangeKernel( *kernel, cl::NullRange, diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d.cc similarity index 100% rename from mace/kernels/opencl/conv_2d_opencl.cc rename to mace/kernels/opencl/conv_2d.cc diff --git 
a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc similarity index 81% rename from mace/kernels/opencl/conv_2d_opencl_1x1.cc rename to mace/kernels/opencl/conv_2d_1x1.cc index d148edb2bbdefa587f10ac28e49ba6c8c95525b2..edce5d11715686ccb90021aa34a4bd7f1858fdcd 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_1x1.cc @@ -20,6 +20,39 @@ namespace mace { namespace kernels { +namespace { +// (inputs + weights + outputs) * array_size * sizeof(float) +const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); + } else { + lws[0] = gws[0] / 8; + if (lws[0] < base) { + lws[0] = std::max(gws[0] / 4, base); + } + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min( + (cache_size / kernel_cache_size / lws_size / compute_units) * 8, + gws[2]); + if (lws[2] == 0) { + lws[2] = std::min(gws[2], base); + } + lws[2] = std::min(lws[2], kwg_size / lws_size); + return lws; +} + +} // namespace + extern void Conv2dOpenclK1x1(cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -130,9 +163,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 0}; + std::vector lws = LocalWS(gws, *kwg_size); std::string tuning_key = - Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), + Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); diff --git 
a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc similarity index 82% rename from mace/kernels/opencl/conv_2d_opencl_3x3.cc rename to mace/kernels/opencl/conv_2d_3x3.cc index a51ff2527221509ce197209e2a8b5d2898f39077..7dcd320d43bd793e4eb9606976f6a377dd788302 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_3x3.cc @@ -22,6 +22,35 @@ namespace mace { namespace kernels { +namespace { +// (inputs + weights + outputs) * array_size * sizeof(float) +const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4; +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t compute_units = std::max( + OpenCLRuntime::Global()->device_compute_units() / 2, 1); + const uint32_t base = std::min(cache_size / kBaseGPUMemCacheSize, + 4); + lws[1] = std::min(gws[1], kwg_size); + lws[0] = std::min(std::min(gws[0], base), + kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min( + RoundUp(cache_size / kernel_cache_size / + lws_size / compute_units, base), + gws[2]); + if (lws[2] == 0) { + lws[2] = std::min(gws[2], base); + } + lws[2] = std::min(lws[2], kwg_size / lws_size); + return lws; +} + +} // namespace + extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const Tensor *input, const Tensor *filter, @@ -128,9 +157,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {4, *kwg_size / 32, 8, 0}; + const std::vector lws = LocalWS(gws, *kwg_size); std::string tuning_key = - Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), + Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_general.cc similarity 
index 79% rename from mace/kernels/opencl/conv_2d_opencl_general.cc rename to mace/kernels/opencl/conv_2d_general.cc index b8431193358909deb7fa435b1e72e3a3f843c30c..63b64dbdfa50d5f82a2d73b1191740cf99606b9f 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_general.cc @@ -21,6 +21,42 @@ namespace mace { namespace kernels { +namespace { +// (inputs + weights + outputs) * array_size * sizeof(float) +const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; +// TODO(liuqi): Fix the specific value. +const uint32_t lws_limit = 20; +std::vector LocalWS(const uint32_t *gws, + const uint32_t kernel_size, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + lws[0] = gws[0] / 4; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min( + (cache_size / kernel_cache_size / kernel_size / lws_size / compute_units) + * 8, + gws[2]); + if (lws[2] == 0) { + if (gws[2] < lws_limit) { + lws[2] = gws[2]; + } else { + lws[2] = base; + } + } + lws[2] = std::min(lws[2], kwg_size / lws_size); + return lws; +} + +} // namespace extern void Conv2dOpencl(cl::Kernel *kernel, const Tensor *input, @@ -130,10 +166,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 0}; std::string tuning_key = - Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), - output->dim(1), output->dim(2), output->dim(3)); + Concat("conv2d_general_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3), + filter->dim(0), filter->dim(1)); + std::vector lws = + LocalWS(gws, filter->dim(0) * filter->dim(1), 
*kwg_size); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space.cc similarity index 92% rename from mace/kernels/opencl/depth_to_space_opencl.cc rename to mace/kernels/opencl/depth_to_space.cc index f5f45ca94a74f91471060947e9717182e188b44a..fd25f948c355999909bcd670e41ff249dc4e5aea 100644 --- a/mace/kernels/opencl/depth_to_space_opencl.cc +++ b/mace/kernels/opencl/depth_to_space.cc @@ -33,7 +33,7 @@ void DepthToSpaceOpFunctor::operator()( const char *kernel_name = nullptr; uint32_t gws[3]; - std::stringstream ss; + std::string tuning_key; index_t output_height, output_width, output_depth; if (d2s_) { output_height = input_height * block_size_; @@ -46,8 +46,8 @@ void DepthToSpaceOpFunctor::operator()( gws[0] = static_cast(RoundUpDiv4(output_depth)); gws[1] = static_cast(output_width); gws[2] = static_cast(output_height * batch); - ss << "depth_to_space_opencl_kernel_" << batch << "_" - << output_height << "_" << output_width << "_" << output_depth; + tuning_key = Concat("depth_to_space_opencl_kernel", batch, output_height, + output_width, output_depth); } else { output_height = input_height / block_size_; output_width = input_width / block_size_; @@ -59,8 +59,8 @@ void DepthToSpaceOpFunctor::operator()( gws[0] = static_cast(RoundUpDiv4(input_depth)); gws[1] = static_cast(input_width); gws[2] = static_cast(input_height * batch); - ss << "space_to_depth_opencl_kernel_" << input->dim(0) << "_" - << input->dim(1) << "_" << input->dim(2) << "_" << input->dim(3); + tuning_key = Concat("space_to_depth_opencl_kernel", input->dim(0), + input->dim(1), input->dim(2), input->dim(3)); } const index_t input_depth_blocks = RoundUpDiv4(input_depth); const index_t output_depth_blocks = RoundUpDiv4(output_depth); @@ -134,8 +134,8 @@ void DepthToSpaceOpFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = {8, 
kwg_size_ / 64, 8, 0}; - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + const std::vector lws = Default3DLocalWS(gws, kwg_size_); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv.cc similarity index 87% rename from mace/kernels/opencl/depthwise_conv_opencl.cc rename to mace/kernels/opencl/depthwise_conv.cc index 67bfbf7a6c051c156b1db4b0e9114fcc09cdb1ce..d4aa32f3b715baa8b1118890e201a9309c32174e 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv.cc @@ -21,6 +21,37 @@ namespace mace { namespace kernels { +namespace { +// (inputs + weights + outputs) * array_size * sizeof(float) +const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4; +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= min_lws0) { + lws[0] = std::min(gws[0], min_lws0); + } else { + lws[0] = std::min(gws[0] / 8, kwg_size / lws[1]); + if (lws[0] < min_lws0) { + lws[0] = std::min(std::max(gws[0] / 4, min_lws0), + kwg_size / lws[1]); + } + } + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = std::min( + (cache_size / kernel_cache_size / lws_size) * 4, + gws[2]); + if (lws[2] == 0) { + lws[2] = gws[2]; + } + lws[2] = std::min(lws[2], kwg_size / lws_size); + return lws; +} + +} // namespace + static void DepthwiseConv2d(cl::Kernel *kernel, const Tensor *input, // NHWC const Tensor *filter, // HWIM @@ -149,9 +180,9 @@ static void DepthwiseConv2d(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 0}; - std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, - 
batch, height, width, channels, multiplier); + const std::vector lws = LocalWS(gws, *kwg_size); + std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel", + gws[0], gws[1], gws[2], multiplier); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise.cc similarity index 94% rename from mace/kernels/opencl/eltwise_opencl.cc rename to mace/kernels/opencl/eltwise.cc index d834c292c51697adacb36cb849cbb2b50b6085fc..e3f4b8f8f7db189fc1faf8d52140cd259768af9f 100644 --- a/mace/kernels/opencl/eltwise_opencl.cc +++ b/mace/kernels/opencl/eltwise.cc @@ -116,11 +116,11 @@ void EltwiseFunctor::operator()(const Tensor *input0, input_shape_ = input0->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; - std::stringstream ss; - ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) - << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + const std::vector lws = Default3DLocalWS(gws, kwg_size_); + std::string tuning_key = + Concat("eltwise_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); char *kerror_code = kernel_error_->mutable_data(); diff --git a/mace/kernels/opencl/fully_connected_opencl.cc b/mace/kernels/opencl/fully_connected.cc similarity index 98% rename from mace/kernels/opencl/fully_connected_opencl.cc rename to mace/kernels/opencl/fully_connected.cc index 378a9d835436f7f6cd8932935dec8f58d3d4abdc..e1546541a71d056d61bbaa2205a1fdd7ee55ec94 100644 --- a/mace/kernels/opencl/fully_connected_opencl.cc +++ b/mace/kernels/opencl/fully_connected.cc @@ -267,10 +267,10 @@ void FCWTXKernel(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - std::stringstream ss; - ss << "fc_opencl_kernel_" << 
output->dim(0) << "_" << output->dim(1) << "_" - << output->dim(2) << "_" << output->dim(3); - TuningOrRun2DKernel(*kernel, ss.str(), gws->data(), *lws, future); + std::string tuning_key = + Concat("fc_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun2DKernel(*kernel, tuning_key, gws->data(), *lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { (*kernel_error)->Map(nullptr); diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 7934b7209f0456edda559044b041f482b8554472..7f3e444e832a9cbe74f00610ca44ac44158de45c 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -206,6 +206,32 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { } } +std::vector Default2DLocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(3, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[0] = std::min(base, kwg_size); + lws[1] = std::min(gws[1], kwg_size / lws[0]); + return lws; + +} + +std::vector Default3DLocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + lws[2] = std::min(std::min(gws[2], base), + kwg_size / lws[1]); + const uint32_t lws_size = lws[1] * lws[2]; + lws[0] = std::min(base, kwg_size / lws_size); + return lws; +} + void TuningOrRun3DKernel(const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, @@ -216,31 +242,47 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); - std::vector local_ws(3, 0); - local_ws[0] = std::min(gws[0], kwg_size); - local_ws[1] = std::min(gws[1], kwg_size / local_ws[0]); - local_ws[2] =
std::min(gws[2], kwg_size / (local_ws[0] * local_ws[1])); - return { + std::vector> results; + std::vector> candidates = { // TODO(heliangliang): tuning these magic numbers - {local_ws[0], local_ws[1], local_ws[2], 0}, - {kwg_size / 16, 4, 4, 0}, - {kwg_size / 32, 4, 8, 0}, - {kwg_size / 32, 8, 4, 0}, - {kwg_size / 64, 8, 8, 0}, - {kwg_size / 64, 16, 4, 0}, - {kwg_size / 128, 8, 16, 0}, - {kwg_size / 128, 16, 8, 0}, - {kwg_size / 128, 32, 4, 0}, - {1, kwg_size / 32, 32, 0}, - {1, kwg_size / 64, 64, 0}, - {1, kwg_size / 128, 128, 0}, - {4, kwg_size / 16, 4, 0}, - {4, kwg_size / 28, 7, 0}, - {4, kwg_size / 32, 8, 0}, - {4, kwg_size / 56, 14, 0}, - {1, kwg_size, 1, 0}, + {gws[0], gws[1], gws[2], 0}, + {gws[0], gws[1], gws[2] / 8, 0}, + {gws[0], gws[1], gws[2] / 4, 0}, + {gws[0], gws[1], 8, 0}, + {gws[0], gws[1], 4, 0}, + {gws[0], gws[1], 1, 0}, + {gws[0] / 4, gws[1], gws[2], 0}, + {gws[0] / 4, gws[1], gws[2] / 8, 0}, + {gws[0] / 4, gws[1], gws[2] / 4, 0}, + {gws[0] / 4, gws[1], 8, 0}, + {gws[0] / 4, gws[1], 4, 0}, + {gws[0] / 4, gws[1], 1, 0}, + {gws[0] / 8, gws[1], gws[2], 0}, + {gws[0] / 8, gws[1], gws[2] / 8, 0}, + {gws[0] / 8, gws[1], gws[2] / 4, 0}, + {gws[0] / 8, gws[1], 8, 0}, + {gws[0] / 8, gws[1], 4, 0}, + {gws[0] / 8, gws[1], 1, 0}, + {4, gws[1], gws[2], 0}, + {4, gws[1], gws[2] / 8, 0}, + {4, gws[1], gws[2] / 4, 0}, + {4, gws[1], 8, 0}, + {4, gws[1], 4, 0}, + {4, gws[1], 1, 0}, + {1, gws[1], gws[2], 0}, + {1, gws[1], gws[2] / 8, 0}, + {1, gws[1], gws[2] / 4, 0}, + {1, gws[1], 8, 0}, + {1, gws[1], 4, 0}, + {1, gws[1], 1, 0}, }; + for (auto &ele : candidates) { + const uint32_t tmp = ele[0] * ele[1] * ele[2]; + if (0 < tmp && tmp <= kwg_size) { + results.push_back(ele); + } + } + return results; }; cl::Event event; auto func = [&](const std::vector ¶ms, Timer *timer, @@ -333,19 +375,26 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = 
static_cast(runtime->GetKernelMaxWorkGroupSize(kernel)); - uint32_t local_ws[2]; - local_ws[0] = std::min(gws[0], kwg_size); - local_ws[1] = std::min(gws[1], kwg_size / local_ws[0]); - return {{local_ws[0], local_ws[1], 0}, - {local_ws[1], local_ws[0], 0}, - {kwg_size / 4, 4, 0}, - {kwg_size / 16, 16, 0}, - {kwg_size / 32, 32, 0}, - {kwg_size / 64, 64, 0}, - {kwg_size / 128, 128, 0}, - {kwg_size / 256, 256, 0}, - {kwg_size, 1, 0}, - {1, kwg_size, 0}}; + std::vector> results; + std::vector> candidates = { + {kwg_size / 2, 2, 0}, + {kwg_size / 4, 4, 0}, + {kwg_size / 8, 8, 0}, + {kwg_size / 16, 16, 0}, + {kwg_size / 32, 32, 0}, + {kwg_size / 64, 64, 0}, + {kwg_size / 128, 128, 0}, + {kwg_size / 256, 256, 0}, + {kwg_size, 1, 0}, + {1, kwg_size, 0} + }; + for (auto &ele : candidates) { + const uint32_t tmp = ele[0] * ele[1] * ele[2]; + if (0 < tmp && tmp <= kwg_size) { + results.push_back(ele); + } + } + return results; }; cl::Event event; auto func = [&](const std::vector ¶ms, Timer *timer, @@ -426,5 +475,6 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, } } + } // namespace kernels } // namespace mace diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 5257ed2cabfc72e59a80f2b1e6af8a00df03cb8c..4576ba99e41156a03def60bac6d9eccb1e7ac69a 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -29,6 +29,8 @@ namespace kernels { const float kMaxKernelExeTime = 1000.0; // microseconds +const int32_t kBaseGPUMemCacheSize = 16384; + enum BufferType { CONV2D_FILTER = 0, IN_OUT_CHANNEL = 1, @@ -112,6 +114,11 @@ std::string Concat(Args... 
args) { return ss.str(); } +std::vector Default2DLocalWS(const uint32_t *gws, + const uint32_t kwg_size); +std::vector Default3DLocalWS(const uint32_t *gws, + const uint32_t kwg_size); + } // namespace kernels } // namespace mace #endif // MACE_KERNELS_OPENCL_HELPER_H_ diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index b307c44572932bbcdbb5abee14bca75714abc36b..98529547d172613bdc33c5313edc88974efeafb4 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -85,10 +85,10 @@ void MatMulFunctor::operator()(const Tensor *A, kernel_.setArg(idx++, static_cast(RoundUpDiv4(A->dim(2)))); const std::vector lws = {kwg_size_ / 64, 64, 0}; - std::stringstream ss; - ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" - << C->dim(2) << "_" << C->dim(3); - TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + std::string tuning_key = + Concat("matmul_opencl_kernel", C->dim(0), + C->dim(1), C->dim(2), C->dim(3)); + TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/pad.cc b/mace/kernels/opencl/pad.cc index 46eb496832c1536c2c0d8ee3ef645062ad3a405e..c3c90944aef80f6e971c8bbfa76045381d399786 100644 --- a/mace/kernels/opencl/pad.cc +++ b/mace/kernels/opencl/pad.cc @@ -100,7 +100,7 @@ void PadFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; + const std::vector lws = Default3DLocalWS(gws, kwg_size_); std::string tuning_key = Concat("pad", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling.cc similarity index 85% rename from mace/kernels/opencl/pooling_opencl.cc rename to mace/kernels/opencl/pooling.cc index 5d31b76f325111f289f0aa88fe286ccc93357a36..7d7fe3d86e48c44dd3a2ffc34460a055d2fd2925 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ 
b/mace/kernels/opencl/pooling.cc @@ -21,6 +21,28 @@ namespace mace { namespace kernels { +namespace { + +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + lws[2] = std::min(std::min(gws[2], base), + kwg_size / lws[1]); + const uint32_t lws_size = lws[1] * lws[2]; + lws[0] = gws[0] / 4; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + lws[0] = std::min(lws[0], kwg_size / lws_size); + return lws; +} + +} // namespace + template void PoolingFunctor::operator()(const Tensor *input, Tensor *output, @@ -134,11 +156,11 @@ void PoolingFunctor::operator()(const Tensor *input, }; } - std::vector lws = {8, kwg_size_ / 64, 8, 0}; - std::stringstream ss; - ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) - << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future); + const std::vector lws = LocalWS(gws.data(), kwg_size_); + std::string tuning_key = + Concat("pooling_opencl_kernel_", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear.cc similarity index 80% rename from mace/kernels/opencl/resize_bilinear_opencl.cc rename to mace/kernels/opencl/resize_bilinear.cc index 1b154bb1adb97657f8e625a5fe839fbc17347550..45f3b2e2a7ac14a2d4a2f1f66c2f12660a463fe0 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear.cc @@ -22,6 +22,34 @@ namespace mace { namespace kernels { +namespace { +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + std::vector lws(4, 0); + uint64_t 
cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + lws[1] = std::min(gws[1], kwg_size); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); + } else { + lws[0] = gws[0] / 8; + if (lws[0] == 0) { + lws[0] = gws[0]; + } + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + const uint32_t lws_size = lws[0] * lws[1]; + lws[2] = gws[2] / 8; + if (lws[2] == 0) { + lws[2] = gws[2]; + } + lws[2] = std::min(lws[2], kwg_size / lws_size); + return lws; +} + +} // namespace + template void ResizeBilinearFunctor::operator()( const Tensor *input, Tensor *output, StatsFuture *future) { @@ -99,11 +127,11 @@ void ResizeBilinearFunctor::operator()( input_shape_ = input->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; - std::stringstream ss; - ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" - << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + const std::vector lws = LocalWS(gws, kwg_size_); + std::string tuning_key = + Concat("resize_bilinear_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 29b5f909b28e1504f6a9c825c3e50e1a3b44e676..b7f6086814fb2f63a9a24ccbde5baca0ba904f56 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -72,7 +72,7 @@ void SliceFunctor::operator()( static_cast(input->dim(0) * input->dim(1)), }; - const std::vector lws = {8, kwg_size_ / 64, 8, 1}; + const std::vector lws = Default3DLocalWS(gws, kwg_size_); cl::Event event; CallStats call_stats{INT64_MAX, 0}; for (int i = 0; i < outputs_count; ++i) { diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax.cc similarity index 81% 
rename from mace/kernels/opencl/softmax_opencl.cc rename to mace/kernels/opencl/softmax.cc index 47c10dca9fa70e3620dd6ae52e24aff6204c806c..85ba41f274fe97f81e3573aa7c2d4a90df0539f6 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax.cc @@ -22,6 +22,27 @@ namespace mace { namespace kernels { +namespace { + +std::vector LocalWS(const uint32_t *gws, + const uint32_t kwg_size) { + uint64_t cache_size = + OpenCLRuntime::Global()->device_global_mem_cache_size(); + uint32_t base = cache_size / kBaseGPUMemCacheSize; + std::vector lws(4, 0); + lws[1] = std::min(gws[1], kwg_size); + if (gws[0] < base) { + lws[0] = gws[0]; + } else { + lws[0] = gws[0] / base; + } + lws[0] = std::min(lws[0], kwg_size / lws[1]); + lws[2] = std::min(gws[2], kwg_size / (lws[0] * lws[1])); + return lws; +} + +} // namespace + template void SoftmaxFunctor::operator()(const Tensor *logits, Tensor *output, @@ -81,11 +102,11 @@ void SoftmaxFunctor::operator()(const Tensor *logits, input_shape_ = logits->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; - std::stringstream ss; - ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) - << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + std::vector lws = LocalWS(gws, kwg_size_); + std::string tuning_key = + Concat("softmax_opencl_kernel", output->dim(0), + output->dim(1), output->dim(2), output->dim(3)); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch.cc similarity index 93% rename from mace/kernels/opencl/space_to_batch_opencl.cc rename to mace/kernels/opencl/space_to_batch.cc index 454d2d0a0d0e148618262ebb710d3a9712ec2ca2..02f76bea173e9275db4d35b05152af53518cd668 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ 
b/mace/kernels/opencl/space_to_batch.cc @@ -105,12 +105,11 @@ void SpaceToBatchFunctor::operator()( space_shape_ = space_tensor->shape(); } - const std::vector lws = {8, kwg_size_ / 64, 8, 0}; - std::stringstream ss; - ss << kernel_name << "_" << batch_tensor->dim(0) << "_" - << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" - << batch_tensor->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + const std::vector lws = Default3DLocalWS(gws, kwg_size_); + std::string tuning_key = + Concat(kernel_name, batch_tensor->dim(0), batch_tensor->dim(1), + batch_tensor->dim(2), batch_tensor->dim(3)); + TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 62a7ca601f5a29b5387f90938281c3cb73128235..497cd1000479ff1c855cd3c89199fbab8cb96ced 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -102,11 +102,11 @@ void WinogradTransformFunctor::operator()( } const std::vector lws = {kwg_size_ / 8, 8, 0}; - std::stringstream ss; - ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_" - << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" - << input_tensor->dim(3); - TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + std::string tuning_key = + Concat("winograd_transform_kernel", output_tensor->dim(0), + output_tensor->dim(1), output_tensor->dim(2), + output_tensor->dim(3)); + TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr); @@ -216,12 +216,11 @@ void WinogradInverseTransformFunctor::operator()( } const std::vector lws = {kwg_size_ / 8, 8, 0}; - - std::stringstream ss; - ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_" - << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" - << 
input_tensor->dim(3); - TuningOrRun2DKernel(kernel_, ss.str(), gws, lws, future); + std::string tuning_key = + Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), + output_tensor->dim(1), output_tensor->dim(2), + output_tensor->dim(3), input_tensor->dim(2)); + TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); if (runtime->IsOutOfRangeCheckEnabled()) { kernel_error_->Map(nullptr);