From 1794dae4a4d2334394c993da681508d6249361ca Mon Sep 17 00:00:00 2001 From: liuqi Date: Thu, 3 May 2018 11:02:26 +0800 Subject: [PATCH] Add more strategy for convolution opencl default lws. --- mace/kernels/opencl/concat.cc | 4 ++-- mace/kernels/opencl/conv_2d_1x1.cc | 8 ++++++-- mace/kernels/opencl/conv_2d_3x3.cc | 3 +-- mace/kernels/opencl/conv_2d_general.cc | 2 +- mace/kernels/opencl/depthwise_conv.cc | 2 +- mace/kernels/opencl/helper.cc | 1 - mace/kernels/opencl/helper.h | 1 - mace/kernels/opencl/matmul.cc | 2 +- mace/kernels/opencl/pooling.cc | 2 +- mace/kernels/opencl/resize_bilinear.cc | 2 +- mace/kernels/opencl/softmax.cc | 2 +- mace/kernels/opencl/winograd_transform.cc | 4 ++-- 12 files changed, 17 insertions(+), 16 deletions(-) diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 514da784..f2fda4c4 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -25,7 +25,7 @@ namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = + uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); uint32_t base = cache_size / kBaseGPUMemCacheSize; lws[1] = std::min(gws[1], kwg_size); @@ -114,7 +114,7 @@ static void Concat2(cl::Kernel *kernel, const std::vector lws = LocalWS(gws, *kwg_size); std::string tuning_key = - Concat("concat_opencl_kernel", output->dim(0), + Concat("concat_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc index edce5d11..264dce62 100644 --- a/mace/kernels/opencl/conv_2d_1x1.cc +++ b/mace/kernels/opencl/conv_2d_1x1.cc @@ -23,16 +23,20 @@ namespace kernels { namespace { // (inputs + weights + outputs) * array_size * sizeof(float) const uint32_t kernel_cache_size = (4 + 4 + 4) * 4 * 4; +// TODO(liuqi): Fix the specific value. +const uint32_t lws_limit = 128; std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); - uint64_t cache_size = + uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); uint32_t base = cache_size / kBaseGPUMemCacheSize; lws[1] = std::min(gws[1], kwg_size); if (lws[1] >= base) { lws[0] = std::min(gws[0], base); + } else if ((1 < lws[1] && lws[1] < base) && gws[0] >= lws_limit) { + lws[0] = std::min(gws[0], base); } else { lws[0] = gws[0] / 8; if (lws[0] < base) { @@ -165,7 +169,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, std::vector lws = LocalWS(gws, *kwg_size); std::string tuning_key = - Concat("conv2d_1x1_opencl_kernel", output->dim(0), + Concat("conv2d_1x1_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc index 7dcd320d..f5600883 100644 --- a/mace/kernels/opencl/conv_2d_3x3.cc +++ b/mace/kernels/opencl/conv_2d_3x3.cc @@ -21,7 +21,6 @@ namespace mace { namespace kernels { - namespace { // (inputs + weights + outputs) * array_size * sizeof(float) const uint32_t kernel_cache_size = (5 + 4 + 5) * 4 * 4; @@ -157,7 +156,7 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = LocalWS(gws, *kwg_size); + std::vector lws = LocalWS(gws, *kwg_size); std::string tuning_key = Concat("conv2d_3x3_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/conv_2d_general.cc index 63b64dbd..8eb09062 100644 --- a/mace/kernels/opencl/conv_2d_general.cc +++ b/mace/kernels/opencl/conv_2d_general.cc @@ -168,7 +168,7 @@ extern void Conv2dOpencl(cl::Kernel *kernel, std::string tuning_key = Concat("conv2d_general_opencl_kernel", output->dim(0), - output->dim(1), output->dim(2), output->dim(3), + output->dim(1), output->dim(2), output->dim(3), filter->dim(0), filter->dim(1)); std::vector lws = LocalWS(gws, filter->dim(0) * filter->dim(1), *kwg_size); diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc index d4aa32f3..fa8383d8 100644 --- a/mace/kernels/opencl/depthwise_conv.cc +++ b/mace/kernels/opencl/depthwise_conv.cc @@ -32,7 +32,7 @@ std::vector LocalWS(const uint32_t *gws, lws[1] = std::min(gws[1], kwg_size); if (lws[1] >= min_lws0) { lws[0] = std::min(gws[0], min_lws0); - } else { + } else { lws[0] = std::min(gws[0] / 8, kwg_size / lws[1]); if (lws[0] < min_lws0) { lws[0] = std::min(std::max(gws[0] / 4, min_lws0), diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index 7f3e444e..96b99d17 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -215,7 +215,6 @@ std::vector Default2DLocalWS(const uint32_t *gws, lws[0] = std::min(base, kwg_size); lws[1] = kwg_size / lws[1]; return lws; - } std::vector Default3DLocalWS(const uint32_t *gws, diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 4576ba99..2f3a44dc 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -118,7 +118,6 @@ std::vector Default2DLocalWS(const uint32_t *gws, const uint32_t kwg_size); std::vector Default3DLocalWS(const uint32_t *gws, const uint32_t kwg_size); - } // namespace kernels } // namespace mace #endif // MACE_KERNELS_OPENCL_HELPER_H_ diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 98529547..93f2bfb5 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -86,7 +86,7 @@ void MatMulFunctor::operator()(const Tensor *A, const std::vector lws = {kwg_size_ / 64, 64, 0}; std::string tuning_key = - Concat("matmul_opencl_kernel", C->dim(0), + Concat("matmul_opencl_kernel", C->dim(0), C->dim(1), C->dim(2), C->dim(3)); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc index 7d7fe3d8..0d574eb5 100644 --- a/mace/kernels/opencl/pooling.cc +++ b/mace/kernels/opencl/pooling.cc @@ -158,7 +158,7 @@ void PoolingFunctor::operator()(const Tensor *input, const std::vector lws = LocalWS(gws.data(), kwg_size_); std::string tuning_key = - Concat("pooling_opencl_kernel_", output->dim(0), + Concat("pooling_opencl_kernel_", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(kernel_, tuning_key, gws.data(), lws, future); diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc index 45f3b2e2..1c36b27e 100644 --- a/mace/kernels/opencl/resize_bilinear.cc +++ b/mace/kernels/opencl/resize_bilinear.cc @@ -129,7 +129,7 @@ void ResizeBilinearFunctor::operator()( const std::vector lws = LocalWS(gws, kwg_size_); std::string tuning_key = - Concat("resize_bilinear_opencl_kernel", output->dim(0), + Concat("resize_bilinear_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc index 85ba41f2..24329be4 100644 --- a/mace/kernels/opencl/softmax.cc +++ b/mace/kernels/opencl/softmax.cc @@ -104,7 +104,7 @@ void SoftmaxFunctor::operator()(const Tensor *logits, std::vector lws = LocalWS(gws, kwg_size_); std::string tuning_key = - Concat("softmax_opencl_kernel", output->dim(0), + Concat("softmax_opencl_kernel", output->dim(0), output->dim(1), output->dim(2), output->dim(3)); TuningOrRun3DKernel(kernel_, tuning_key, gws, lws, future); diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index 497cd100..32939703 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -103,7 +103,7 @@ void WinogradTransformFunctor::operator()( const std::vector lws = {kwg_size_ / 8, 8, 0}; std::string tuning_key = - Concat("winograd_transform_kernel", output_tensor->dim(0), + Concat("winograd_transform_kernel", output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3)); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); @@ -217,7 +217,7 @@ void WinogradInverseTransformFunctor::operator()( const std::vector lws = {kwg_size_ / 8, 8, 0}; std::string tuning_key = - Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), + Concat("winograd_inverse_transform_kernel", output_tensor->dim(0), output_tensor->dim(1), output_tensor->dim(2), output_tensor->dim(3), input_tensor->dim(2)); TuningOrRun2DKernel(kernel_, tuning_key, gws, lws, future); -- GitLab