diff --git a/mace/core/runtime/opencl/opencl_runtime.cc b/mace/core/runtime/opencl/opencl_runtime.cc
index d5ba7010c8b58c809e98ea7b1ba9afb840a948e5..dcd7fab4a019c5a49772b7794c1aa17e0bbd4e26 100644
--- a/mace/core/runtime/opencl/opencl_runtime.cc
+++ b/mace/core/runtime/opencl/opencl_runtime.cc
@@ -147,16 +147,9 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
     if (device.getInfo<CL_DEVICE_TYPE>() == CL_DEVICE_TYPE_GPU) {
       *device_ = device;
       gpu_detected = true;
+      const std::string device_name = device.getInfo<CL_DEVICE_NAME>();
-      constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
-      constexpr const char *kMaliGPUStr = "Mali";
-      if (device_name == kQualcommAdrenoGPUStr) {
-        gpu_type_ = GPU_TYPE::QUALCOMM_ADRENO;
-      } else if (device_name.find(kMaliGPUStr) != std::string::npos) {
-        gpu_type_ = GPU_TYPE::MALI;
-      } else {
-        gpu_type_ = GPU_TYPE::UNKNOWN;
-      }
+      gpu_type_ = ParseGPUTypeFromDeviceName(device_name);
 
       const std::string device_version = device.getInfo<CL_DEVICE_VERSION>();
       opencl_version_ = device_version.substr(7, 3);
@@ -178,7 +171,7 @@ OpenCLRuntime::OpenCLRuntime(GPUPerfHint gpu_perf_hint,
   }
 
   cl_int err;
-  if (gpu_type_ == GPU_TYPE::QUALCOMM_ADRENO) {
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO) {
     std::vector<cl_context_properties> context_properties;
     context_properties.reserve(5);
     GetAdrenoContextProperties(&context_properties, gpu_perf_hint,
@@ -357,12 +350,30 @@ uint64_t OpenCLRuntime::GetKernelWaveSize(const cl::Kernel &kernel) {
   return size;
 }
 
-const GPU_TYPE OpenCLRuntime::GetGPUType() const {
-  return gpu_type_;
+const bool OpenCLRuntime::IsNonUniformWorkgroupsSupported() {
+  if (gpu_type_ == GPUType::QUALCOMM_ADRENO &&
+      opencl_version_ == "2.0") {
+    return true;
+  } else {
+    return false;
+  }
 }
 
-const std::string &OpenCLRuntime::GetOpenclVersion() const {
-  return opencl_version_;
+const GPUType OpenCLRuntime::ParseGPUTypeFromDeviceName(
+    const std::string &device_name) {
+  constexpr const char *kQualcommAdrenoGPUStr = "QUALCOMM Adreno(TM)";
+  constexpr const char *kMaliGPUStr = "Mali";
+  constexpr const char *kPowerVRGPUStr = "PowerVR";
+
+  if (device_name == kQualcommAdrenoGPUStr) {
+    return GPUType::QUALCOMM_ADRENO;
+  } else if (device_name.find(kMaliGPUStr) != std::string::npos) {
+    return GPUType::MALI;
+  } else if (device_name.find(kPowerVRGPUStr) != std::string::npos) {
+    return GPUType::PowerVR;
+  } else {
+    return GPUType::UNKNOWN;
+  }
 }
 
 }  // namespace mace
diff --git a/mace/core/runtime/opencl/opencl_runtime.h b/mace/core/runtime/opencl/opencl_runtime.h
index d3cc5cc7037ddea9a717ab537e8889aa6ce50bd3..8a3ce06abb63f078efa89ca43b242e46a13e5a3e 100644
--- a/mace/core/runtime/opencl/opencl_runtime.h
+++ b/mace/core/runtime/opencl/opencl_runtime.h
@@ -18,9 +18,10 @@
 
 namespace mace {
 
-enum GPU_TYPE {
+enum GPUType {
   QUALCOMM_ADRENO,
   MALI,
+  PowerVR,
   UNKNOWN,
 };
 
@@ -55,8 +56,8 @@ class OpenCLRuntime {
   uint64_t GetDeviceMaxWorkGroupSize();
   uint64_t GetKernelMaxWorkGroupSize(const cl::Kernel &kernel);
   uint64_t GetKernelWaveSize(const cl::Kernel &kernel);
-  const GPU_TYPE GetGPUType() const;
-  const std::string &GetOpenclVersion() const;
+  const bool IsNonUniformWorkgroupsSupported();
+  const GPUType ParseGPUTypeFromDeviceName(const std::string &device_name);
   cl::Kernel BuildKernel(const std::string &program_name,
                          const std::string &kernel_name,
                          const std::set<std::string> &build_options);
@@ -82,7 +83,7 @@ class OpenCLRuntime {
   std::map<std::string, cl::Program> built_program_map_;
   std::mutex program_build_mutex_;
   std::string kernel_path_;
-  GPU_TYPE gpu_type_;
+  GPUType gpu_type_;
   std::string opencl_version_;
 
   static GPUPerfHint gpu_perf_hint_;
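Reviewer note on the runtime change above: IsNonUniformWorkgroupsSupported() keys off opencl_version_, which the constructor slices out of CL_DEVICE_VERSION with substr(7, 3). That works because the OpenCL spec fixes the string format as "OpenCL <major>.<minor> <vendor-specific info>", so the numeric version always occupies characters [7, 10). A minimal standalone sketch of the combined parse/check logic follows; the enum and function names mirror the patch, but the sample device strings are illustrative only, not taken from it.

#include <iostream>
#include <string>

enum GPUType { QUALCOMM_ADRENO, MALI, PowerVR, UNKNOWN };

// CL_DEVICE_VERSION is "OpenCL <major>.<minor> <vendor info>", so the
// numeric version sits at characters [7, 10), e.g. "2.0".
std::string ParseOpenCLVersion(const std::string &device_version) {
  return device_version.substr(7, 3);
}

GPUType ParseGPUTypeFromDeviceName(const std::string &device_name) {
  if (device_name == "QUALCOMM Adreno(TM)") return QUALCOMM_ADRENO;
  if (device_name.find("Mali") != std::string::npos) return MALI;
  if (device_name.find("PowerVR") != std::string::npos) return PowerVR;
  return UNKNOWN;
}

// Non-uniform work-group sizes are only trusted on Adreno with OpenCL 2.0.
bool IsNonUniformWorkgroupsSupported(GPUType type, const std::string &ver) {
  return type == QUALCOMM_ADRENO && ver == "2.0";
}

int main() {
  const std::string version = "OpenCL 2.0 QUALCOMM build";  // sample string
  const GPUType type = ParseGPUTypeFromDeviceName("QUALCOMM Adreno(TM)");
  std::cout << std::boolalpha
            << IsNonUniformWorkgroupsSupported(type,
                                               ParseOpenCLVersion(version))
            << std::endl;  // true
}

diff --git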
a/mace/kernels/activation.h b/mace/kernels/activation.h index 55368c3ca83c8aa7dd9e8d76efb47bde568ec4ce..5bb2fe4ac384add95b0177ba3ae1d192742481ce 100644 --- a/mace/kernels/activation.h +++ b/mace/kernels/activation.h @@ -155,6 +155,8 @@ class ActivationFunctor { ActivationType activation_; T relux_max_limit_; cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::string tuning_key_prefix_; std::vector input_shape_; }; diff --git a/mace/kernels/addn.h b/mace/kernels/addn.h index 70d9583ba798babd3a27737c9ed7487913441bf6..e2d875e9a73e9d12668e6f11388060a35454e8ec 100644 --- a/mace/kernels/addn.h +++ b/mace/kernels/addn.h @@ -90,6 +90,8 @@ struct AddNFunctor { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/batch_norm.h b/mace/kernels/batch_norm.h index 28b8d776c967e48a4af835ee55913c437aa3d3ea..f17db80a48295d1bf7a24e5775fae4a17f9a81f0 100644 --- a/mace/kernels/batch_norm.h +++ b/mace/kernels/batch_norm.h @@ -157,6 +157,8 @@ struct BatchNormFunctor : BatchNormFunctorBase { Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/bias_add.h b/mace/kernels/bias_add.h index d5372850bcf604b0f1e01e630c0c30b59e95abc0..f2f917f4f5073c3e585e70260678c95f9f13f59c 100644 --- a/mace/kernels/bias_add.h +++ b/mace/kernels/bias_add.h @@ -64,6 +64,8 @@ struct BiasAddFunctor { Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/channel_shuffle.h b/mace/kernels/channel_shuffle.h index f1e258337a2d9a871bbb3ac4aec70faf1a18edf9..b93e657837a50f658aa9c3444b99e3a0d65cf761 100644 --- a/mace/kernels/channel_shuffle.h +++ b/mace/kernels/channel_shuffle.h @@ -56,6 +56,8 @@ struct ChannelShuffleFunctor { void operator()(const Tensor *input, Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; const int groups_; std::vector input_shape_; }; diff --git a/mace/kernels/concat.h b/mace/kernels/concat.h index de34ed69fa5803f61e9f6785b9d4b7185be2cccc..7186bde6e452983b3bc5620e3b620086907e19ab 100644 --- a/mace/kernels/concat.h +++ b/mace/kernels/concat.h @@ -85,6 +85,8 @@ struct ConcatFunctor : ConcatFunctorBase { Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/conv_2d.h b/mace/kernels/conv_2d.h index 47516291d14ec21ba2202e2089bee03d6387c433..f2d3dfbb53c40ca5ff5e7753333c88300ac8b535 100644 --- a/mace/kernels/conv_2d.h +++ b/mace/kernels/conv_2d.h @@ -401,6 +401,8 @@ struct Conv2dFunctor : Conv2dFunctorBase { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/depth_to_space.h b/mace/kernels/depth_to_space.h index 3f6577f32159309bba931eaef58011902ecc2045..6b439db67ecb2c5c2f6ee2390e7900adfc90a307 100644 --- a/mace/kernels/depth_to_space.h +++ b/mace/kernels/depth_to_space.h @@ -108,6 +108,8 @@ struct DepthToSpaceOpFunctor { void operator()(const Tensor *input, Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; const int block_size_; bool d2s_; 
std::vector input_shape_; diff --git a/mace/kernels/depthwise_conv2d.h b/mace/kernels/depthwise_conv2d.h index 166ea18a644ead1d53af2a7c3b83c73c617554d6..ba4f74c8a12132b1780467b38e35f52a8e127063 100644 --- a/mace/kernels/depthwise_conv2d.h +++ b/mace/kernels/depthwise_conv2d.h @@ -437,6 +437,8 @@ struct DepthwiseConv2dFunctor StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/eltwise.h b/mace/kernels/eltwise.h index 0f9e9b40061890a62e36104746bcaf0120bfab0f..11d52bc97e8802b04058589c6eb3bdb057607f00 100644 --- a/mace/kernels/eltwise.h +++ b/mace/kernels/eltwise.h @@ -97,6 +97,8 @@ struct EltwiseFunctor : EltwiseFunctorBase { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/matmul.h b/mace/kernels/matmul.h index 62590400bf038773c9f16fae68f4c42de4ee9130..1ce9b6fd07f4a377664b03b821cf1b170dadea19 100644 --- a/mace/kernels/matmul.h +++ b/mace/kernels/matmul.h @@ -241,6 +241,8 @@ struct MatMulFunctor { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; }; } // namespace kernels diff --git a/mace/kernels/opencl/activation_opencl.cc b/mace/kernels/opencl/activation_opencl.cc index d7b89336d196ac701572a76ac23b3eedba4c46a8..d3e6c7f90748ed0061f4671b4d540cf8b7129563 100644 --- a/mace/kernels/opencl/activation_opencl.cc +++ b/mace/kernels/opencl/activation_opencl.cc @@ -26,16 +26,16 @@ void ActivationFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("activation"); built_options.emplace("-Dactivation=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } switch (activation_) { @@ -83,11 +83,12 @@ void ActivationFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::string tuning_key = Concat(tuning_key_prefix_, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/addn.cc b/mace/kernels/opencl/addn.cc index 37e6062a989f47baf8e613a9e1847c233d3061dc..d7c149a9720ea30b45c9c3745422cd19f4c7820a 100644 --- a/mace/kernels/opencl/addn.cc +++ b/mace/kernels/opencl/addn.cc @@ -26,8 +26,6 @@ void AddNFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - for (int i = 1; i < size; ++i) { MACE_CHECK_NOTNULL(input_tensors[i]); MACE_CHECK(batch == input_tensors[i]->dim(0)); @@ -37,6 +35,8 @@ void AddNFunctor::operator()( } if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); if (input_tensors.size() > 4) { 
MACE_NOT_IMPLEMENTED; } @@ -47,7 +47,7 @@ void AddNFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DINPUT_NUM=", input_tensors.size())); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } @@ -78,11 +78,12 @@ void AddNFunctor::operator()( kernel_.setArg(idx++, gws[1]); input_shape_ = input_tensors[0]->shape(); - } - const uint32_t kwg_size = + kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {kwg_size / 16, 16, 1}; + } + + const std::vector lws = {kwg_size_ / 16, 16, 1}; std::stringstream ss; ss << "addn_opencl_kernel_" << output_shape[0] << "_" << output_shape[1] << "_" << output_shape[2] << "_" << output_shape[3]; diff --git a/mace/kernels/opencl/batch_norm_opencl.cc b/mace/kernels/opencl/batch_norm_opencl.cc index 10b956de57e9715de1008940ead6a48d60a362f8..c3a1765ce83a08e446c6d27393420a18ea61d544 100644 --- a/mace/kernels/opencl/batch_norm_opencl.cc +++ b/mace/kernels/opencl/batch_norm_opencl.cc @@ -36,16 +36,17 @@ void BatchNormFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("batch_norm"); built_options.emplace("-Dbatch_norm=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } if (folded_constant_) { @@ -89,11 +90,12 @@ void BatchNormFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::string tuning_key = Concat("batch_norm_opencl_kernel_", activation_, output->dim(0), output->dim(1), output->dim(2), output->dim(3), folded_constant_); diff --git a/mace/kernels/opencl/bias_add_opencl.cc b/mace/kernels/opencl/bias_add_opencl.cc index 2fb1252b1309b72b0218396e049e3ff68d89b874..e67ebe712c9c4604566a0f4c3baddd273a205ee1 100644 --- a/mace/kernels/opencl/bias_add_opencl.cc +++ b/mace/kernels/opencl/bias_add_opencl.cc @@ -29,16 +29,16 @@ void BiasAddFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("bias_add"); built_options.emplace("-Dbias_add=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("bias_add", 
kernel_name, built_options); @@ -52,15 +52,16 @@ void BiasAddFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8}; + const std::vector lws = {8, kwg_size_ / 64, 8}; cl::Event event; cl_int error; - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { error = runtime->command_queue().enqueueNDRangeKernel( kernel_, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(lws[0], lws[1], lws[2]), nullptr, &event); diff --git a/mace/kernels/opencl/buffer_to_image.cc b/mace/kernels/opencl/buffer_to_image.cc index 0cec970aa48989d4f263c999d1f7da3ad83c7201..7a5df69d9ec43953025ee2d1f208e5aac7332ce3 100644 --- a/mace/kernels/opencl/buffer_to_image.cc +++ b/mace/kernels/opencl/buffer_to_image.cc @@ -62,14 +62,15 @@ void BufferToImageFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + const bool is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; std::stringstream kernel_name_ss; kernel_name_ss << "-D" << kernel_name << "=" << obfuscated_kernel_name; built_options.emplace(kernel_name_ss.str()); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } if (buffer->dtype() == image->dtype()) { @@ -115,7 +116,7 @@ void BufferToImageFunctor::operator()( cl::Event event; cl_int error; - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( b2f_kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(lws[0], lws[1]), nullptr, &event); diff --git a/mace/kernels/opencl/channel_shuffle.cc b/mace/kernels/opencl/channel_shuffle.cc index 9d566477eccd4b0349b2a27d3233a1d39518f030..316ae62a2217b03cf09b1b6dd92e0142fd89b3b0 100644 --- a/mace/kernels/opencl/channel_shuffle.cc +++ b/mace/kernels/opencl/channel_shuffle.cc @@ -36,16 +36,16 @@ void ChannelShuffleFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("channel_shuffle"); built_options.emplace("-Dchannel_shuffle=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("channel_shuffle", kernel_name, @@ -63,11 +63,12 @@ void ChannelShuffleFunctor::operator()( kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "channel_shuffle_opencl_kernel_" << 
output->dim(0) << "_" diff --git a/mace/kernels/opencl/concat.cc b/mace/kernels/opencl/concat.cc index ce5a77182ff74522b168300aae9e1b3b2914f6d4..111b7a9c17b130f582b5d37ca209f77ced0dc9ba 100644 --- a/mace/kernels/opencl/concat.cc +++ b/mace/kernels/opencl/concat.cc @@ -17,7 +17,9 @@ static void Concat2(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -31,13 +33,13 @@ static void Concat2(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel->get() == nullptr) { + *is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel"); built_options.emplace("-Dconcat_channel=" + kernel_name); - if (is_qualcomm_opencl200) { + if (*is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } if (input0->dtype() == output->dtype()) { @@ -66,11 +68,12 @@ static void Concat2(cl::Kernel *kernel, kernel->setArg(idx++, gws[2]); *prev_input_shape = input0->shape(); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::stringstream ss; ss << "concat_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); @@ -81,7 +84,9 @@ static void ConcatN(cl::Kernel *kernel, const std::vector &input_list, const DataType dt, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -89,15 +94,15 @@ static void ConcatN(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel->get() == nullptr) { + *is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("concat_channel_multi"); built_options.emplace("-Dconcat_channel_multi=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (*is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } *kernel = runtime->BuildKernel("concat", kernel_name, built_options); @@ -122,9 +127,9 @@ static void ConcatN(cl::Kernel *kernel, kernel->setArg(idx++, gws[2]); chan_blk_offset += input_channel_blk; - const uint32_t kwg_size = + *kwg_size = static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::stringstream ss; ss << "concat_n_opencl_kernel_" << input_channel_blk << "_" << width << "_" << batch * height; @@ -169,11 +174,13 @@ void ConcatFunctor::operator()( switch (inputs_count) { case 2: Concat2(&kernel_, input_list[0], input_list[1], DataTypeToEnum::value, - &input_shape_, output, future); + &input_shape_, 
output, future, + &is_non_uniform_work_groups_supported_, &kwg_size_); break; default: if (divisible_four) { - ConcatN(&kernel_, input_list, DataTypeToEnum::value, output, future); + ConcatN(&kernel_, input_list, DataTypeToEnum::value, output, future, + &is_non_uniform_work_groups_supported_, &kwg_size_); } else { MACE_NOT_IMPLEMENTED; } diff --git a/mace/kernels/opencl/conv_2d_opencl.cc b/mace/kernels/opencl/conv_2d_opencl.cc index 468d80f09c60bd9584225d2c263766cef6c790e5..b9fa2d4c86b259bf9f9691654a92746071cad545 100644 --- a/mace/kernels/opencl/conv_2d_opencl.cc +++ b/mace/kernels/opencl/conv_2d_opencl.cc @@ -20,7 +20,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future); + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size); extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const Tensor *input, @@ -34,7 +36,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future); + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size); extern void Conv2dOpencl(cl::Kernel *kernel, const Tensor *input, @@ -48,7 +52,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future); + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size); template void Conv2dFunctor::operator()(const Tensor *input, @@ -61,7 +67,8 @@ void Conv2dFunctor::operator()(const Tensor *input, const Tensor *bias, const int stride, const int *padding, const int *dilations, const ActivationType activation, const float relux_max_limit, const DataType dt, - std::vector *input_shape, Tensor *output, StatsFuture *future); + std::vector *input_shape, Tensor *output, StatsFuture *future, + bool *is_non_uniform_work_groups_supported, uint32_t *kwg_size); // Selection matrix: kernel_size x stride_size static const Conv2dOpenclFunction selector[5] = { Conv2dOpenclK1x1, nullptr, Conv2dOpenclK3x3, nullptr, nullptr}; @@ -101,11 +108,13 @@ void Conv2dFunctor::operator()(const Tensor *input, auto conv2d_func = selector[kernel_h - 1]; conv2d_func(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, - DataTypeToEnum::value, &input_shape_, output, future); + DataTypeToEnum::value, &input_shape_, output, future, + &is_non_uniform_work_groups_supported_, &kwg_size_); } else { Conv2dOpencl(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, - DataTypeToEnum::value, &input_shape_, output, future); + DataTypeToEnum::value, &input_shape_, output, future, + &is_non_uniform_work_groups_supported_, &kwg_size_); } } diff --git a/mace/kernels/opencl/conv_2d_opencl_1x1.cc b/mace/kernels/opencl/conv_2d_opencl_1x1.cc index ad2af5a73a1a3e682c1334bbaa92945c0d49df97..be2fd08b8c25e82d681ab67b6ca8eecb0fe431ae 100644 --- a/mace/kernels/opencl/conv_2d_opencl_1x1.cc +++ b/mace/kernels/opencl/conv_2d_opencl_1x1.cc @@ -22,7 +22,9 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -38,9 +40,9 @@ extern void 
Conv2dOpenclK1x1(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel->get() == nullptr) { + *is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); MACE_CHECK(input_batch == batch); std::set built_options; @@ -48,7 +50,7 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, built_options.emplace("-Dconv_2d_1x1=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (*is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } if (bias != nullptr) { @@ -101,11 +103,12 @@ extern void Conv2dOpenclK1x1(cl::Kernel *kernel, kernel->setArg(idx++, gws[2]); *prev_input_shape = input->shape(); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::string tuning_key = Concat("conv2d_1x1_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/conv_2d_opencl_3x3.cc b/mace/kernels/opencl/conv_2d_opencl_3x3.cc index 6ac0fa569ebe4ab1d58ca8a9a87cd1cc56564f44..cec0927fa44b79ffc897470272bd1827cd0c1308 100644 --- a/mace/kernels/opencl/conv_2d_opencl_3x3.cc +++ b/mace/kernels/opencl/conv_2d_opencl_3x3.cc @@ -24,7 +24,9 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -37,15 +39,15 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel->get() == nullptr) { + *is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d_3x3"); built_options.emplace("-Dconv_2d_3x3=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (*is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); @@ -99,11 +101,12 @@ extern void Conv2dOpenclK3x3(cl::Kernel *kernel, kernel->setArg(idx++, gws[2]); *prev_input_shape = input->shape(); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const std::vector lws = {4, kwg_size / 32, 8, 1}; + const std::vector lws = {4, *kwg_size / 32, 8, 1}; std::string tuning_key = Concat("conv2d_3x3_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/conv_2d_opencl_general.cc b/mace/kernels/opencl/conv_2d_opencl_general.cc index 0fc944422fd1a22c4b37a3cce0123158b7bee1f3..a9151b480fa5a19b010d9220e62532fe588fc85d 100644 --- a/mace/kernels/opencl/conv_2d_opencl_general.cc +++ b/mace/kernels/opencl/conv_2d_opencl_general.cc @@ -24,7 +24,9 @@ extern void Conv2dOpencl(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -37,15 +39,15 @@ extern void Conv2dOpencl(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel->get() == nullptr) { + *is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("conv_2d"); built_options.emplace("-Dconv_2d=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (*is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); @@ -101,11 +103,12 @@ extern void Conv2dOpencl(cl::Kernel *kernel, kernel->setArg(idx++, gws[2]); *prev_input_shape = input->shape(); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::string tuning_key = Concat("conv2d_general_opencl_kernel_", activation, output->dim(0), output->dim(1), output->dim(2), output->dim(3)); diff --git a/mace/kernels/opencl/depth_to_space_opencl.cc b/mace/kernels/opencl/depth_to_space_opencl.cc index 0bafecd8ccdc994754d454dbcb2807390d6c8836..4daeac61bf58589fd29676aaa9001ec37aab065d 100644 --- a/mace/kernels/opencl/depth_to_space_opencl.cc +++ b/mace/kernels/opencl/depth_to_space_opencl.cc @@ -47,9 +47,9 @@ void DepthToSpaceOpFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::stringstream kernel_name_ss; @@ -58,7 +58,7 @@ void DepthToSpaceOpFunctor::operator()( auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = @@ -93,11 +93,12 @@ void DepthToSpaceOpFunctor::operator()( kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); } diff --git a/mace/kernels/opencl/depthwise_conv_opencl.cc b/mace/kernels/opencl/depthwise_conv_opencl.cc index c43799db2d96312a63898904f5266bc8528ea810..873a16a40e937443f341119421cb85af4d1f749a 100644 --- a/mace/kernels/opencl/depthwise_conv_opencl.cc +++ b/mace/kernels/opencl/depthwise_conv_opencl.cc @@ -23,7 +23,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, const DataType dt, std::vector *prev_input_shape, Tensor *output, - StatsFuture *future) { + StatsFuture *future, + bool *is_non_uniform_work_groups_supported, + uint32_t *kwg_size) { const index_t batch = output->dim(0); const index_t height = output->dim(1); const index_t width = output->dim(2); @@ -42,9 +44,9 @@ void DepthwiseConv2d(cl::Kernel *kernel, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel->get() == nullptr) { + *is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("depthwise_conv2d"); if (stride == 1 && dilations[0] == 1 && dilations[1] == 1) { @@ -53,7 +55,7 @@ void DepthwiseConv2d(cl::Kernel *kernel, } else { built_options.emplace("-Ddepthwise_conv2d=" + kernel_name); } - if (is_qualcomm_opencl200) { + if (*is_non_uniform_work_groups_supported) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); @@ -118,12 +120,14 @@ void 
DepthwiseConv2d(cl::Kernel *kernel, kernel->setArg(idx++, gws[0]); kernel->setArg(idx++, gws[1]); kernel->setArg(idx++, gws[2]); + *prev_input_shape = input->shape(); + + *kwg_size = + static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(*kernel)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, *kwg_size / 64, 8, 1}; std::string tuning_key = Concat("depthwise_conv2d_ocl_kernel_", activation, batch, height, width, channels, multiplier); TuningOrRun3DKernel(*kernel, tuning_key, gws, lws, future); @@ -178,7 +182,8 @@ void DepthwiseConv2dFunctor::operator()( DepthwiseConv2d(&kernel_, input, filter, bias, strides_[0], paddings.data(), dilations_, activation_, relux_max_limit_, - DataTypeToEnum::value, &input_shape_, output, future); + DataTypeToEnum::value, &input_shape_, output, future, + &is_non_uniform_work_groups_supported_, &kwg_size_); } template struct DepthwiseConv2dFunctor; diff --git a/mace/kernels/opencl/eltwise_opencl.cc b/mace/kernels/opencl/eltwise_opencl.cc index e2a68396d18045e94c4697295f3f1f6c8e1ec691..13413130c485ecb1ffb68a42a079be27c543046d 100644 --- a/mace/kernels/opencl/eltwise_opencl.cc +++ b/mace/kernels/opencl/eltwise_opencl.cc @@ -29,9 +29,9 @@ void EltwiseFunctor::operator()(const Tensor *input0, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("eltwise"); @@ -39,7 +39,7 @@ void EltwiseFunctor::operator()(const Tensor *input0, built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); built_options.emplace(MakeString("-DELTWISE_TYPE=", type_)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } if (!coeff_.empty()) built_options.emplace("-DCOEFF_SUM"); @@ -56,12 +56,14 @@ void EltwiseFunctor::operator()(const Tensor *input0, kernel_.setArg(idx++, *(output->opencl_image())); kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); + input_shape_ = input0->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {kwg_size / 16, 16, 1}; + const std::vector lws = {kwg_size_ / 16, 16, 1}; std::stringstream ss; ss << "eltwise_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/helper.cc b/mace/kernels/opencl/helper.cc index b386cfc6e057fe3d82e0fb306f1291a1947bf898..ba788a26977750ae69d37f90e6661e6612cdcf08 100644 --- a/mace/kernels/opencl/helper.cc +++ b/mace/kernels/opencl/helper.cc @@ -194,24 +194,14 @@ std::string DtToUpstreamCLCMDDt(const DataType dt) { } } -const bool IsQualcommOpenCL200() { - auto runtime = OpenCLRuntime::Global(); - - if (runtime->GetGPUType() == GPU_TYPE::QUALCOMM_ADRENO && - runtime->GetOpenclVersion() == "2.0") { - return true; - } else { - return false; - } -} - void TuningOrRun3DKernel(const cl::Kernel &kernel, const std::string tuning_key, const uint32_t *gws, const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); - const bool 
is_qualcomm_opencl200 = IsQualcommOpenCL200(); + const bool is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = @@ -249,7 +239,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, << "Tuning parameters of 3D kernel must be 4D"; cl_int error = CL_SUCCESS; std::vector roundup_gws(3); - if (!is_qualcomm_opencl200) { + if (!is_non_uniform_work_groups_supported) { for (size_t i = 0; i < 3; ++i) { roundup_gws[i] = RoundUp(gws[i], params[i]); } @@ -262,7 +252,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NDRange(0, 0, i * block_size), cl::NDRange(gws[0], gws[1], gws2), @@ -278,7 +268,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, } } else { timer->ClearTiming(); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(gws[0], gws[1], gws[2]), cl::NDRange(params[0], params[1], params[2]), nullptr, &event); @@ -303,7 +293,7 @@ void TuningOrRun3DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws2 = (i == num_blocks - 1) ? (gws[2] - (i * block_size)) : block_size; - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NDRange(0, 0, i * block_size), cl::NDRange(gws[0], gws[1], gws2), @@ -342,7 +332,8 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, const std::vector &lws, StatsFuture *future) { auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); + const bool is_non_uniform_work_groups_supported = + runtime->IsNonUniformWorkgroupsSupported(); auto params_generator = [&]() -> std::vector> { const uint32_t kwg_size = @@ -368,7 +359,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, << "Tuning parameters of 2D kernel must be 3d"; cl_int error = CL_SUCCESS; std::vector roundup_gws(2); - if (!is_qualcomm_opencl200) { + if (!is_non_uniform_work_groups_supported) { for (size_t i = 0; i < 2; ++i) { roundup_gws[i] = RoundUp(gws[i], params[i]); } @@ -381,7 +372,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws1 = (i == num_blocks - 1) ? (gws[1] - (i * block_size)) : block_size; - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]), nullptr, &event); @@ -396,7 +387,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, } } else { timer->ClearTiming(); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NullRange, cl::NDRange(gws[0], gws[1]), cl::NDRange(params[0], params[1]), nullptr, &event); @@ -420,7 +411,7 @@ void TuningOrRun2DKernel(const cl::Kernel &kernel, for (uint32_t i = 0; i < num_blocks; ++i) { uint32_t gws1 = (i == num_blocks - 1) ? 
(gws[1] - (i * block_size)) : block_size; - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported) { error = runtime->command_queue().enqueueNDRangeKernel( kernel, cl::NDRange(0, i * block_size), cl::NDRange(gws[0], gws1), cl::NDRange(params[0], params[1]), diff --git a/mace/kernels/opencl/helper.h b/mace/kernels/opencl/helper.h index 5b4e028318c1487825f553dce28079d4bc2faccf..89712c9b96aa043f5019cde6eae23aa07109f6f7 100644 --- a/mace/kernels/opencl/helper.h +++ b/mace/kernels/opencl/helper.h @@ -102,8 +102,6 @@ std::string Concat(Args... args) { return ss.str(); } -const bool IsQualcommOpenCL200(); - } // namespace kernels } // namespace mace #endif // MACE_KERNELS_OPENCL_HELPER_H_ diff --git a/mace/kernels/opencl/matmul.cc b/mace/kernels/opencl/matmul.cc index 9e29306186f0714839a7c8f0763c5967bc11e21e..19769f3d3d3d37389f55de983ed58d1480ba6935 100644 --- a/mace/kernels/opencl/matmul.cc +++ b/mace/kernels/opencl/matmul.cc @@ -33,16 +33,16 @@ void MatMulFunctor::operator()(const Tensor *A, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; auto dt = DataTypeToEnum::value; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("matmul"); built_options.emplace("-Dmatmul=" + kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("matmul", kernel_name, built_options); @@ -59,9 +59,9 @@ void MatMulFunctor::operator()(const Tensor *A, kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); - const uint32_t kwg_size = + kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {kwg_size / 64, 64, 1}; + const std::vector lws = {kwg_size_ / 64, 64, 1}; std::stringstream ss; ss << "matmul_opencl_kernel_" << C->dim(0) << "_" << C->dim(1) << "_" << C->dim(2) << "_" << C->dim(3); diff --git a/mace/kernels/opencl/pooling_opencl.cc b/mace/kernels/opencl/pooling_opencl.cc index f3d4714cd325b48714f5ddf25e1b24d85aecb39b..fa9e157716919773dbc3a6d1f99beb016508a19d 100644 --- a/mace/kernels/opencl/pooling_opencl.cc +++ b/mace/kernels/opencl/pooling_opencl.cc @@ -20,9 +20,9 @@ void PoolingFunctor::operator()(const Tensor *input, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); const DataType dt = DataTypeToEnum::value; std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("pooling"); @@ -39,13 +39,13 @@ void PoolingFunctor::operator()(const Tensor *input, if (pooling_type_ == AVG) { built_options.emplace("-DPOOL_AVG"); } - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("pooling", kernel_name, built_options); } - uint32_t gws[3]; + std::vector gws; if (!IsVecEqual(input_shape_, input->shape())) { std::vector output_shape(4); std::vector filter_shape = {kernels_[0], kernels_[1], @@ -75,9 +75,10 @@ void PoolingFunctor::operator()(const Tensor *input, index_t channel_blocks = (channels + 3) / 4; - gws[0] = 
static_cast(channel_blocks); - gws[1] = static_cast(out_width); - gws[2] = static_cast(batch * out_height); + gws = { + static_cast(channel_blocks), static_cast(out_width), + static_cast(batch * out_height), + }; uint32_t idx = 0; kernel_.setArg(idx++, *(input->opencl_image())); @@ -94,26 +95,16 @@ void PoolingFunctor::operator()(const Tensor *input, kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); - } else { - index_t batch = output->dim(0); - index_t out_height = output->dim(1); - index_t out_width = output->dim(2); - index_t channels = output->dim(3); - - index_t channel_blocks = (channels + 3) / 4; - gws[0] = static_cast(channel_blocks); - gws[1] = static_cast(out_width); - gws[2] = static_cast(batch * out_height); + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - std::vector lws = {8, kwg_size / 64, 8, 1}; + std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "pooling_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); - TuningOrRun3DKernel(kernel_, ss.str(), gws, lws, future); + TuningOrRun3DKernel(kernel_, ss.str(), gws.data(), lws, future); } template struct PoolingFunctor; diff --git a/mace/kernels/opencl/resize_bilinear_opencl.cc b/mace/kernels/opencl/resize_bilinear_opencl.cc index 63c71ea7fc4eb410b68ebba3dc707b5c331809c0..5bcb53e37f4b354b950ec7eca44589b50d1a6dbd 100644 --- a/mace/kernels/opencl/resize_bilinear_opencl.cc +++ b/mace/kernels/opencl/resize_bilinear_opencl.cc @@ -30,16 +30,16 @@ void ResizeBilinearFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("resize_bilinear_nocache"); built_options.emplace("-Dresize_bilinear_nocache=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = @@ -72,11 +72,12 @@ void ResizeBilinearFunctor::operator()( kernel_.setArg(idx++, gws[2]); input_shape_ = input->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "resize_bilinear_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/slice.cc b/mace/kernels/opencl/slice.cc index 55773a521c34c47635032b2b3d2dd4b8da346189..94f541b2418afe4906710be2b5d7f89b9d61c06b 100644 --- a/mace/kernels/opencl/slice.cc +++ b/mace/kernels/opencl/slice.cc @@ -31,16 +31,16 @@ void SliceFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("slice"); built_options.emplace("-Dslice=" + 
kernel_name); built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("slice", kernel_name, built_options); @@ -53,9 +53,9 @@ void SliceFunctor::operator()( static_cast(input->dim(0) * input->dim(1)), }; - const uint32_t kwg_size = + kwg_size_ = static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "slice_opencl_kernel_" << input->dim(0) << "_" diff --git a/mace/kernels/opencl/softmax_opencl.cc b/mace/kernels/opencl/softmax_opencl.cc index 321d7c296f9e756ca671e45ab4a6d554d72f40d8..6b06cc8fb4c408a233b4799c97fb15326d695721 100644 --- a/mace/kernels/opencl/softmax_opencl.cc +++ b/mace/kernels/opencl/softmax_opencl.cc @@ -29,16 +29,16 @@ void SoftmaxFunctor::operator()(const Tensor *logits, auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::set built_options; std::string kernel_name = MACE_OBFUSCATE_SYMBOL("softmax"); built_options.emplace("-Dsoftmax=" + kernel_name); auto dt = DataTypeToEnum::value; built_options.emplace("-DDATA_TYPE=" + DtToUpstreamCLDt(dt)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(dt)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("softmax", kernel_name, built_options); @@ -52,12 +52,14 @@ void SoftmaxFunctor::operator()(const Tensor *logits, kernel_.setArg(idx++, gws[0]); kernel_.setArg(idx++, gws[1]); kernel_.setArg(idx++, gws[2]); + input_shape_ = logits->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << "softmax_opencl_kernel_" << output->dim(0) << "_" << output->dim(1) << "_" << output->dim(2) << "_" << output->dim(3); diff --git a/mace/kernels/opencl/space_to_batch_opencl.cc b/mace/kernels/opencl/space_to_batch_opencl.cc index 128164f967f3ddadd547efa3862cd79529868fee..6e00f6ea2ef726cb52b69ac580a4072f8a2d84a4 100644 --- a/mace/kernels/opencl/space_to_batch_opencl.cc +++ b/mace/kernels/opencl/space_to_batch_opencl.cc @@ -38,9 +38,9 @@ void SpaceToBatchFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL(kernel_name); std::set built_options; std::stringstream kernel_name_ss; @@ -49,7 +49,7 @@ void SpaceToBatchFunctor::operator()( built_options.emplace("-DDATA_TYPE=" + DtToCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToCLCMDDt(DataTypeToEnum::value)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = @@ -77,11 +77,12 @@ void 
SpaceToBatchFunctor::operator()( kernel_.setArg(idx++, gws[2]); space_shape_ = space_tensor->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {8, kwg_size / 64, 8, 1}; + const std::vector lws = {8, kwg_size_ / 64, 8, 1}; std::stringstream ss; ss << kernel_name << "_" << batch_tensor->dim(0) << "_" << batch_tensor->dim(1) << "_" << batch_tensor->dim(2) << "_" diff --git a/mace/kernels/opencl/winograd_transform.cc b/mace/kernels/opencl/winograd_transform.cc index c4a20a0307e34e024556a0680051a6e36774772d..905b1346a18e33f42cabfe53fd4f436d10c602bd 100644 --- a/mace/kernels/opencl/winograd_transform.cc +++ b/mace/kernels/opencl/winograd_transform.cc @@ -17,9 +17,9 @@ void WinogradTransformFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_transform_2x2"); std::set built_options; @@ -28,7 +28,7 @@ void WinogradTransformFunctor::operator()( DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } kernel_ = runtime->BuildKernel("winograd_transform", obfuscated_kernel_name, @@ -74,11 +74,12 @@ void WinogradTransformFunctor::operator()( kernel_.setArg(idx++, gws[1]); input_shape_ = input_tensor->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {kwg_size / 8, 8, 1}; + const std::vector lws = {kwg_size_ / 8, 8, 1}; std::stringstream ss; ss << "winograd_transform_kernel_" << input_tensor->dim(0) << "_" << input_tensor->dim(1) << "_" << input_tensor->dim(2) << "_" @@ -95,9 +96,9 @@ void WinogradInverseTransformFunctor::operator()( auto runtime = OpenCLRuntime::Global(); - const bool is_qualcomm_opencl200 = IsQualcommOpenCL200(); - if (kernel_.get() == nullptr) { + is_non_uniform_work_groups_supported_ = + runtime->IsNonUniformWorkgroupsSupported(); std::string obfuscated_kernel_name = MACE_OBFUSCATE_SYMBOL("winograd_inverse_transform_2x2"); std::set built_options; @@ -107,7 +108,7 @@ void WinogradInverseTransformFunctor::operator()( DtToUpstreamCLDt(DataTypeToEnum::value)); built_options.emplace("-DCMD_DATA_TYPE=" + DtToUpstreamCLCMDDt(DataTypeToEnum::value)); - if (is_qualcomm_opencl200) { + if (is_non_uniform_work_groups_supported_) { built_options.emplace("-DUSE_QUALCOMM_OPENCL_2_0"); } built_options.emplace(bias != nullptr ? 
"-DBIAS" : ""); @@ -168,11 +169,12 @@ void WinogradInverseTransformFunctor::operator()( kernel_.setArg(idx++, gws[1]); input_shape_ = input_tensor->shape(); + + kwg_size_ = + static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); } - const uint32_t kwg_size = - static_cast(runtime->GetKernelMaxWorkGroupSize(kernel_)); - const std::vector lws = {kwg_size / 8, 8, 1}; + const std::vector lws = {kwg_size_ / 8, 8, 1}; std::stringstream ss; ss << "winograd_inverse_transform_kernel_" << input_tensor->dim(0) << "_" diff --git a/mace/kernels/pooling.h b/mace/kernels/pooling.h index 15cc691e71927300bec48224a7666f1468eb74c1..52dd12342ec360c07de992d413eac509b8f5778b 100644 --- a/mace/kernels/pooling.h +++ b/mace/kernels/pooling.h @@ -185,6 +185,8 @@ struct PoolingFunctor : PoolingFunctorBase { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/resize_bilinear.h b/mace/kernels/resize_bilinear.h index 65e5121211d4d836d6d17809a843e0778defaecb..09ae3ba5075bc959e7b571db40d06dc548b0bdd4 100644 --- a/mace/kernels/resize_bilinear.h +++ b/mace/kernels/resize_bilinear.h @@ -173,6 +173,8 @@ struct ResizeBilinearFunctor void operator()(const Tensor *input, Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/slice.h b/mace/kernels/slice.h index 59d9d667b0a63da1e1d3ee471aecec9efd9be1e9..ce7431da3da8d0f2b39d6c5c38b694867c866365 100644 --- a/mace/kernels/slice.h +++ b/mace/kernels/slice.h @@ -61,6 +61,8 @@ struct SliceFunctor { const std::vector &output_list, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; }; } // namespace kernels diff --git a/mace/kernels/softmax.h b/mace/kernels/softmax.h index a1c4ea2f6e5b9200f17d54906316a83cbefaa49a..b491e2ad39249f1e66233375aaa3c904951f2b84 100644 --- a/mace/kernels/softmax.h +++ b/mace/kernels/softmax.h @@ -61,6 +61,8 @@ struct SoftmaxFunctor { void operator()(const Tensor *logits, Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/mace/kernels/space_to_batch.h b/mace/kernels/space_to_batch.h index 757f784820f90fee842fc385606db4755cb52293..6bd66cbb3e721beb254b06486b12ebb52ab184cd 100644 --- a/mace/kernels/space_to_batch.h +++ b/mace/kernels/space_to_batch.h @@ -56,6 +56,8 @@ struct SpaceToBatchFunctor : SpaceToBatchFunctorBase { StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector space_shape_; }; diff --git a/mace/kernels/winograd_transform.h b/mace/kernels/winograd_transform.h index 6f483dacb06f920c54b14930dba3fd05ff845e44..df12ab36227eab19372c53e02f0f4110c937bd00 100644 --- a/mace/kernels/winograd_transform.h +++ b/mace/kernels/winograd_transform.h @@ -51,6 +51,8 @@ struct WinogradTransformFunctor void operator()(const Tensor *input, Tensor *output, StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; @@ -108,6 +110,8 @@ struct WinogradInverseTransformFunctor StatsFuture *future); cl::Kernel kernel_; + uint32_t kwg_size_; + bool is_non_uniform_work_groups_supported_; std::vector input_shape_; }; diff --git a/tools/build_mace_run.sh b/tools/build_mace_run.sh index 
669918d28247a654a28d7792e24c218c6fd1660e..4606fde6ca4a2299200266873b831a7113134a27 100644
--- a/tools/build_mace_run.sh
+++ b/tools/build_mace_run.sh
@@ -43,6 +43,10 @@ else
     HEXAGON_MODE_BUILD_FLAG="--define hexagon=true"
   fi
 
+  if [ x"$TARGET_ABI" = x"arm64-v8a" ]; then
+    NEON_ENABLE_FLAG="--define neon=true"
+  fi
+
   bazel build --verbose_failures -c opt --strip always //mace/examples:mace_run \
     --crosstool_top=//external:android/crosstool \
     --host_crosstool_top=@bazel_tools//tools/cpp:toolchain \
@@ -54,6 +58,7 @@ else
     --copt="-DMACE_MODEL_TAG=${MODEL_TAG}" \
     --define openmp=true \
     --copt="-O3" \
+    $NEON_ENABLE_FLAG \
     $PRODUCTION_MODE_BUILD_FLAGS \
     $HEXAGON_MODE_BUILD_FLAG || exit 1
 fi
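Note on the recurring functor change in the kernels above: each OpenCL functor now snapshots runtime->IsNonUniformWorkgroupsSupported() and the kernel's max work-group size into the new is_non_uniform_work_groups_supported_ / kwg_size_ members inside the one-time kernel_.get() == nullptr branch, so repeat invocations skip both runtime queries. A condensed sketch of that caching pattern, with stand-in Runtime/Kernel types (the real ones are OpenCLRuntime and cl::Kernel; the stub return values are placeholders):

#include <cstdint>
#include <iostream>
#include <vector>

// Stand-ins for cl::Kernel / OpenCLRuntime, just enough to show the shape.
struct Kernel {
  bool built = false;
  bool get() const { return built; }  // mimics the kernel_.get() null check
};

struct Runtime {
  bool IsNonUniformWorkgroupsSupported() const { return true; }       // stub
  uint32_t GetKernelMaxWorkGroupSize(const Kernel &) const { return 256; }
  Kernel BuildKernel() const { return Kernel{true}; }
};

struct Functor {
  Kernel kernel_;
  uint32_t kwg_size_ = 0;
  bool is_non_uniform_work_groups_supported_ = false;

  void operator()(Runtime *runtime) {
    if (!kernel_.get()) {
      // First call only: build the kernel and cache the runtime queries.
      is_non_uniform_work_groups_supported_ =
          runtime->IsNonUniformWorkgroupsSupported();
      kernel_ = runtime->BuildKernel();
      kwg_size_ = runtime->GetKernelMaxWorkGroupSize(kernel_);
    }
    // Later calls reuse the cached size to derive the local work-group shape.
    const std::vector<uint32_t> lws = {8, kwg_size_ / 64, 8, 1};
    std::cout << "lws[1] = " << lws[1] << std::endl;  // 4
  }
};

int main() {
  Runtime runtime;
  Functor functor;
  functor(&runtime);  // builds and caches
  functor(&runtime);  // cache hit, no runtime queries
}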
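One more sketch, for the TuningOrRun2DKernel/TuningOrRun3DKernel branches in helper.cc: when non-uniform work-groups are unavailable, each global dimension is rounded up to a multiple of the chosen local size before enqueueing, and the gws values passed as kernel arguments let the kernel guard against the padded ids (the -DUSE_QUALCOMM_OPENCL_2_0 build option presumably selects the unguarded variant). The RoundUp body below is an assumed textbook implementation, not copied from the patch:

#include <cstdint>
#include <iostream>

// Smallest multiple of `multiple` that is >= `value`.
uint32_t RoundUp(uint32_t value, uint32_t multiple) {
  return (value + multiple - 1) / multiple * multiple;
}

int main() {
  const uint32_t gws[3] = {17, 33, 9};  // exact work-item counts
  const uint32_t lws[3] = {8, 4, 8};    // chosen work-group shape
  bool non_uniform_supported = false;   // e.g. non-Adreno or OpenCL < 2.0

  uint32_t enqueue_gws[3];
  for (int i = 0; i < 3; ++i) {
    // With non-uniform support the exact size is enqueued; otherwise the
    // padded range is launched and the kernel returns early past gws[i].
    enqueue_gws[i] = non_uniform_supported ? gws[i] : RoundUp(gws[i], lws[i]);
  }
  std::cout << enqueue_gws[0] << " " << enqueue_gws[1] << " "
            << enqueue_gws[2] << std::endl;  // 24 36 16
}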