diff --git a/mace/core/runtime/opencl/opencl_allocator.cc b/mace/core/runtime/opencl/opencl_allocator.cc index 4d0c235134729a76cb00ee1f77713fa23f93c0d9..ad5c8eaca8273ffc63f24949bc59f03731163ce5 100644 --- a/mace/core/runtime/opencl/opencl_allocator.cc +++ b/mace/core/runtime/opencl/opencl_allocator.cc @@ -56,7 +56,7 @@ MaceStatus OpenCLAllocator::New(size_t nbytes, void **result) const { nbytes, nullptr, &error); if (error != CL_SUCCESS) { LOG(WARNING) << "Allocate OpenCL Buffer with " - << nbytes << " bytes failed because of" + << nbytes << " bytes failed because of " << OpenCLErrorToString(error); delete buffer; *result = nullptr; diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc index 5235479db1455f1b7830445b6e8d3de1d56da9db..ae2f69390968841a4f7410ccec3b16be2bc15971 100644 --- a/mace/core/runtime/opencl/opencl_runtime.cc +++ b/mace/core/runtime/opencl/opencl_runtime.cc @@ -371,7 +371,8 @@ OpenCLRuntime::OpenCLRuntime(): } cl_int err; - if (gpu_type_ == GPUType::QUALCOMM_ADRENO) { + if (gpu_type_ == GPUType::QUALCOMM_ADRENO + && opencl_version_ == OpenCLVersion::CL_VER_2_0) { std::vector context_properties; context_properties.reserve(5); GetAdrenoContextProperties(&context_properties, @@ -698,7 +699,7 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) { bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() const { return (gpu_type_ == GPUType::QUALCOMM_ADRENO && - opencl_version_ == "2.0"); + opencl_version_ == OpenCLVersion::CL_VER_2_0); } GPUType OpenCLRuntime::gpu_type() const { @@ -709,13 +710,24 @@ const std::string OpenCLRuntime::platform_info() const { return platform_info_; } -const std::string OpenCLRuntime::ParseDeviceVersion( +OpenCLVersion OpenCLRuntime::ParseDeviceVersion( const std::string &device_version) { // OpenCL Device version string format: // OpenCL // auto words = Split(device_version, ' '); - return words[1]; + if (words[1] == "2.0") { + return OpenCLVersion::CL_VER_2_0; + } else if (words[1] == "1.2") { + return OpenCLVersion::CL_VER_1_2; + } else if (words[1] == "1.1") { + return OpenCLVersion::CL_VER_1_1; + } else if (words[1] == "1.0") { + return OpenCLVersion::CL_VER_1_0; + } else { + LOG(FATAL) << "Do not support OpenCL version: " << words[1]; + return OpenCLVersion::CL_VER_1_0; + } } bool OpenCLRuntime::IsOutOfRangeCheckEnabled() const { diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h index 447a2c371c14508f8023453d301a5cbe7abab5c0..31d2932823243d76fc0b15dabee2c4b8ad3af894 100644 --- a/mace/core/runtime/opencl/opencl_runtime.h +++ b/mace/core/runtime/opencl/opencl_runtime.h @@ -38,6 +38,13 @@ enum GPUType { UNKNOWN, }; +enum OpenCLVersion { + CL_VER_1_0, + CL_VER_1_1, + CL_VER_1_2, + CL_VER_2_0, +}; + const std::string OpenCLErrorToString(cl_int error); @@ -113,7 +120,7 @@ class OpenCLRuntime { const std::string &built_program_key, const std::string &build_options_str, cl::Program *program); - const std::string ParseDeviceVersion(const std::string &device_version); + OpenCLVersion ParseDeviceVersion(const std::string &device_version); private: std::unique_ptr precompiled_binary_storage_; @@ -127,7 +134,7 @@ class OpenCLRuntime { std::map built_program_map_; std::mutex program_build_mutex_; std::string platform_info_; - std::string opencl_version_; + OpenCLVersion opencl_version_; std::string precompiled_binary_platform_info_; std::string cached_binary_platform_info_; bool out_of_range_check_; diff --git a/mace/kernels/opencl/batch_norm.cc b/mace/kernels/opencl/batch_norm.cc index 76f246df010fba06d44fd32a8a4f7eb90b688918..2769e08b90c2a665ec4236103cdba8dd04c6fac4 100644 --- a/mace/kernels/opencl/batch_norm.cc +++ b/mace/kernels/opencl/batch_norm.cc @@ -118,10 +118,7 @@ MaceStatus BatchNormFunctor::operator()( input_shape_ = input->shape(); } - std::vector lws(4, 0); - lws[1] = std::min(gws[1], kwg_size_); - lws[0] = std::min(4, kwg_size_ / lws[1]); - lws[2] = std::min(gws[2], kwg_size_ / (lws[1] * lws[0])); + const std::vector lws = Default3DLocalWS(gws, kwg_size_); std::string tuning_key = Concat("batch_norm_opencl_kernel", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index 47e09450a3654b1ca4bac2c825b1017fd5fc7d45..4ccb42a167715f4a20c94095da5ca256fccf1bdc 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -25,11 +25,11 @@ namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = cache_size / kBaseGPUMemCacheSize; + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); lws[0] = std::min(base, kwg_size / lws[1]); const uint32_t lws_size = lws[0] * lws[1]; - lws[2] = std::min(base, kwg_size / lws_size); + lws[2] = std::max(std::min(base, kwg_size / lws_size), 1); return lws; } diff --git a/mace/kernels/opencl/conv_2d.cc b/mace/kernels/opencl/conv_2d.cc index d7706cb588aa5c3dac9cfd4fb5ff29eb714a8eca..6221382e7e8e9d9290777379d7d77832b17b8e40 100644 --- a/mace/kernels/opencl/conv_2d.cc +++ b/mace/kernels/opencl/conv_2d.cc @@ -80,8 +80,8 @@ MaceStatus Conv2dFunctor::operator()(const Tensor *input, std::vector *input_shape, Tensor *output, StatsFuture *future, uint32_t *kwg_size, std::unique_ptr *kernel_error); // Selection matrix: kernel_size x stride_size - static const Conv2dOpenclFunction selector[5] = { - Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; + static const Conv2dOpenclFunction selector[3] = { + Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3}; index_t kernel_h = filter->dim(2); index_t kernel_w = filter->dim(3); @@ -113,7 +113,7 @@ MaceStatus Conv2dFunctor::operator()(const Tensor *input, &output_image_shape); MACE_RETURN_IF_ERROR(output->ResizeImage(output_shape, output_image_shape)); - if (kernel_h == kernel_w && kernel_h <= 5 && + if (kernel_h == kernel_w && kernel_h <= 3 && selector[kernel_h - 1] != nullptr) { auto conv2d_func = selector[kernel_h - 1]; return conv2d_func( diff --git a/mace/kernels/opencl/conv_2d_1x1.cc b/mace/kernels/opencl/conv_2d_1x1.cc index 3f9596dfe8db68c573c50c40ab1c23e4570a7188..48ea04d3d35f792a984eaa2cef01ca2b00726e3e 100644 --- a/mace/kernels/opencl/conv_2d_1x1.cc +++ b/mace/kernels/opencl/conv_2d_1x1.cc @@ -29,7 +29,8 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); - uint32_t base = cache_size / kBaseGPUMemCacheSize; + const uint32_t base = + std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); if (lws[1] >= base) { lws[0] = std::min(gws[0], base); @@ -48,7 +49,8 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { if (lws[2] == 0) { lws[2] = std::min(gws[2], base); } - lws[2] = std::min(lws[2], kwg_size / lws_size); + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); return lws; } diff --git a/mace/kernels/opencl/conv_2d_3x3.cc b/mace/kernels/opencl/conv_2d_3x3.cc index 0da37fc1573eb0b4c3be0f7f09b2dde1ba6db27e..0fa295d0746d8903741418d3cc4177a8f5c91da2 100644 --- a/mace/kernels/opencl/conv_2d_3x3.cc +++ b/mace/kernels/opencl/conv_2d_3x3.cc @@ -30,7 +30,8 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { uint32_t compute_units = std::max( OpenCLRuntime::Global()->device_compute_units() / 2, 1); const uint32_t base = - std::min(cache_size / kBaseGPUMemCacheSize, 4); + std::max( + std::min(cache_size / kBaseGPUMemCacheSize, 4), 1); lws[1] = std::min(gws[1], kwg_size); lws[0] = std::min(std::min(gws[0], base), kwg_size / lws[1]); @@ -42,7 +43,8 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { if (lws[2] == 0) { lws[2] = std::min(gws[2], base); } - lws[2] = std::min(lws[2], kwg_size / lws_size); + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); return lws; } diff --git a/mace/kernels/opencl/conv_2d_general.cc b/mace/kernels/opencl/conv_2d_general.cc index 127df7e9c45af64fd2a316bd8880eaa1a131ff27..c3208eca5f45ad7fd8fdc255a0e59906b531e228 100644 --- a/mace/kernels/opencl/conv_2d_general.cc +++ b/mace/kernels/opencl/conv_2d_general.cc @@ -32,7 +32,8 @@ std::vector LocalWS(const uint32_t *gws, std::vector lws(4, 0); uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); uint32_t compute_units = OpenCLRuntime::Global()->device_compute_units(); - uint32_t base = cache_size / kBaseGPUMemCacheSize; + const uint32_t base = + std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); lws[0] = gws[0] / 4; if (lws[0] == 0) { @@ -51,7 +52,8 @@ std::vector LocalWS(const uint32_t *gws, lws[2] = base; } } - lws[2] = std::min(lws[2], kwg_size / lws_size); + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); return lws; } diff --git a/mace/kernels/opencl/deconv_2d_opencl.cc b/mace/kernels/opencl/deconv_2d_opencl.cc index 486bf064e592fa4b4caf8fa5f1fb51eb50cee32f..a6d65b9e3255a15235dd9421b9194934740d93bb 100644 --- a/mace/kernels/opencl/deconv_2d_opencl.cc +++ b/mace/kernels/opencl/deconv_2d_opencl.cc @@ -144,7 +144,7 @@ MaceStatus Deconv2dOpencl(cl::Kernel *kernel, *prev_input_shape = input->shape(); } - const std::vector lws = {8, *kwg_size / 64, 8, 0}; + const std::vector lws = Default3DLocalWS(gws, *kwg_size); std::string tuning_key = Concat("deconv2d_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/depthwise_conv.cc b/mace/kernels/opencl/depthwise_conv.cc index 7ca2d5d708ac6211e1136b7fdd46ba516f3251a3..ec358d092ef1da623190b55ff9b8da04a03bb1c4 100644 --- a/mace/kernels/opencl/depthwise_conv.cc +++ b/mace/kernels/opencl/depthwise_conv.cc @@ -27,24 +27,26 @@ const uint32_t kernel_cache_size = (4 + 4 + 1) * 4 * 4; std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t min_lws0 = cache_size / kBaseGPUMemCacheSize; + uint32_t base = cache_size / kBaseGPUMemCacheSize; lws[1] = std::min(gws[1], kwg_size); - if (lws[1] >= min_lws0) { - lws[0] = std::min(gws[0], min_lws0); + if (lws[1] >= base) { + lws[0] = std::min(gws[0], base); } else { lws[0] = std::min(gws[0] / 8, kwg_size / lws[1]); - if (lws[0] < min_lws0) { - lws[0] = std::min(std::max(gws[0] / 4, min_lws0), + if (lws[0] < base) { + lws[0] = std::min(std::max(gws[0] / 4, base), kwg_size / lws[1]); } } + lws[0] = std::max(lws[0], 1); const uint32_t lws_size = lws[0] * lws[1]; lws[2] = std::min((cache_size / kernel_cache_size / lws_size) * 4, gws[2]); if (lws[2] == 0) { lws[2] = gws[2]; } - lws[2] = std::min(lws[2], kwg_size / lws_size); + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); return lws; } diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index b5254800fa9358fa95e81cb9328cfcb4f01fedba..78e50871ff77d19a7c287619c75499afd00047d7 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -252,7 +252,8 @@ std::vector Default3DLocalWS(const uint32_t *gws, lws[2] = std::min(std::min(gws[2], base), kwg_size / lws[1]); const uint32_t lws_size = lws[1] * lws[2]; - lws[0] = std::min(base, kwg_size / lws_size); + lws[0] = std::max(std::min(base, kwg_size / lws_size), + 1); return lws; } diff --git a/mace/kernels/opencl/pooling.cc b/mace/kernels/opencl/pooling.cc index b408a9cdf477b416cb15b79e7619d0158ebe8440..09bc8d905f835cd48345856d434ce974638c0e29 100644 --- a/mace/kernels/opencl/pooling.cc +++ b/mace/kernels/opencl/pooling.cc @@ -26,7 +26,7 @@ namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = cache_size / kBaseGPUMemCacheSize; + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); lws[2] = std::min(std::min(gws[2], base), kwg_size / lws[1]); @@ -35,7 +35,8 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { if (lws[0] == 0) { lws[0] = gws[0]; } - lws[0] = std::min(lws[0], kwg_size / lws_size); + lws[0] = std::max(std::min(lws[0], kwg_size / lws_size), + 1); return lws; } diff --git a/mace/kernels/opencl/resize_bilinear.cc b/mace/kernels/opencl/resize_bilinear.cc index e935e9059130cd513a90543d548e24be467a83ae..ee823116877939eb7df7ba142cc9e0c763c3f289 100644 --- a/mace/kernels/opencl/resize_bilinear.cc +++ b/mace/kernels/opencl/resize_bilinear.cc @@ -26,7 +26,7 @@ namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { std::vector lws(4, 0); uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = cache_size / kBaseGPUMemCacheSize; + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); lws[1] = std::min(gws[1], kwg_size); if (lws[1] >= base) { lws[0] = std::min(gws[0], base); @@ -42,7 +42,8 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { if (lws[2] == 0) { lws[2] = gws[2]; } - lws[2] = std::min(lws[2], kwg_size / lws_size); + lws[2] = std::max(std::min(lws[2], kwg_size / lws_size), + 1); return lws; } diff --git a/mace/kernels/opencl/softmax.cc b/mace/kernels/opencl/softmax.cc index 76dc2c7f810aadf9042c5a5c6ce07875fdf799f3..38d0b8bb3eb205b73a0f66e6fe0be2fad9134bcc 100644 --- a/mace/kernels/opencl/softmax.cc +++ b/mace/kernels/opencl/softmax.cc @@ -26,7 +26,7 @@ namespace { std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { uint64_t cache_size = OpenCLRuntime::Global()->device_global_mem_cache_size(); - uint32_t base = cache_size / kBaseGPUMemCacheSize; + uint32_t base = std::max(cache_size / kBaseGPUMemCacheSize, 1); std::vector lws(4, 0); lws[1] = std::min(gws[1], kwg_size); if (gws[0] < base) { @@ -35,7 +35,9 @@ std::vector LocalWS(const uint32_t *gws, const uint32_t kwg_size) { lws[0] = gws[0] / base; } lws[0] = std::min(lws[0], kwg_size / lws[1]); - lws[2] = std::min(gws[2], kwg_size / (lws[0] * lws[1])); + lws[2] = std::max(std::min(gws[2], + kwg_size / (lws[0] * lws[1])), + 1); return lws; } diff --git a/mace/ops/buffer_to_image_test.cc b/mace/ops/buffer_to_image_test.cc index 94b28b0b1b2e74b806b853e2a015a7bd54b4cade..6b2325b725177ff203deecda7b5e55bc74d47b6a 100644 --- a/mace/ops/buffer_to_image_test.cc +++ b/mace/ops/buffer_to_image_test.cc @@ -136,7 +136,7 @@ TEST(BufferToImageTest, WeightWidthMedium) { TEST(BufferToImageTest, WeightWidthLarge) { TestBidirectionTransform(kernels::WEIGHT_WIDTH, - {64, 128, 11, 13}); + {64, 64, 11, 13}); } TEST(BufferToImageTest, WeightHeightSmall) { @@ -151,7 +151,7 @@ TEST(BufferToImageTest, WeightHeightMedium) { TEST(BufferToImageTest, WeightHeightLarge) { TestBidirectionTransform(kernels::WEIGHT_HEIGHT, - {64, 32, 11, 13}); + {64, 16, 11, 13}); } namespace {